org.apache.hadoop.fs.FileSystem Scala Examples
The following examples show how to use org.apache.hadoop.fs.FileSystem.
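Before the project-specific examples, here is a minimal, self-contained sketch of the basic FileSystem lifecycle that most of them build on: resolve a FileSystem from a Hadoop Configuration, then create, read, inspect, and delete a path. The object name and the /tmp path are illustrative only, and which file system is actually resolved depends on fs.defaultFS in the configuration on the classpath.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object FileSystemBasics {
  def main(args: Array[String]): Unit = {
    // Resolve a FileSystem from the Hadoop configuration; fs.defaultFS decides
    // whether this is HDFS, the local file system, or something else.
    val conf = new Configuration()
    val fs = FileSystem.get(conf)

    // Illustrative path only.
    val path = new Path("/tmp/filesystem-basics/example.txt")

    // Write a small file, overwriting any existing one.
    val out = fs.create(path, true)
    try out.writeBytes("hello hadoop\n") finally out.close()

    // Read it back.
    val in = fs.open(path)
    try println(scala.io.Source.fromInputStream(in).mkString) finally in.close()

    // Inspect metadata and clean up (non-recursive delete).
    println(s"exists=${fs.exists(path)}, size=${fs.getFileStatus(path).getLen}")
    fs.delete(path, false)
  }
}

The same get/create/open/exists/delete calls recur throughout the examples below.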
Example 1
Source File: HDFSCredentialProvider.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.deploy.yarn.security

import java.io.{ByteArrayInputStream, DataInputStream}

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
import org.apache.hadoop.mapred.Master
import org.apache.hadoop.security.Credentials

import org.apache.spark.{SparkConf, SparkException}
import org.apache.spark.deploy.yarn.config._
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._

private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging {
  // Token renewal interval, this value will be set in the first call,
  // if None means no token renewer specified, so cannot get token renewal interval.
  private var tokenRenewalInterval: Option[Long] = null

  override val serviceName: String = "hdfs"

  override def obtainCredentials(
      hadoopConf: Configuration,
      sparkConf: SparkConf,
      creds: Credentials): Option[Long] = {
    // NameNode to access, used to get tokens from different FileSystems
    nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
      val dstFs = dst.getFileSystem(hadoopConf)
      logInfo("getting token for namenode: " + dst)
      dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds)
    }

    // Get the token renewal interval if it is not set. It will only be called once.
    if (tokenRenewalInterval == null) {
      tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf)
    }

    // Get the time of next renewal.
    tokenRenewalInterval.map { interval =>
      creds.getAllTokens.asScala
        .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
        .map { t =>
          val identifier = new DelegationTokenIdentifier()
          identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
          identifier.getIssueDate + interval
        }.foldLeft(0L)(math.max)
    }
  }

  private def getTokenRenewalInterval(
      hadoopConf: Configuration,
      sparkConf: SparkConf): Option[Long] = {
    // We cannot use the tokens generated with renewer yarn. Trying to renew
    // those will fail with an access control issue. So create new tokens with the logged in
    // user as renewer.
    sparkConf.get(PRINCIPAL).map { renewer =>
      val creds = new Credentials()
      nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
        val dstFs = dst.getFileSystem(hadoopConf)
        dstFs.addDelegationTokens(renewer, creds)
      }
      val t = creds.getAllTokens.asScala
        .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
        .head
      val newExpiration = t.renew(hadoopConf)
      val identifier = new DelegationTokenIdentifier()
      identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
      val interval = newExpiration - identifier.getIssueDate
      logInfo(s"Renewal Interval is $interval")
      interval
    }
  }

  private def getTokenRenewer(conf: Configuration): String = {
    val delegTokenRenewer = Master.getMasterPrincipal(conf)
    logDebug("delegation token renewer is: " + delegTokenRenewer)
    if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) {
      val errorMessage = "Can't get Master Kerberos principal for use as renewer"
      logError(errorMessage)
      throw new SparkException(errorMessage)
    }
    delegTokenRenewer
  }

  private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = {
    sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet +
      sparkConf.get(STAGING_DIR).map(new Path(_))
        .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory)
  }
}
Example 2
Source File: AvroParquetSourceTest.scala From eel-sdk with Apache License 2.0 | 6 votes |
package io.eels.component.parquet

import java.nio.file.Paths

import io.eels.component.parquet.avro.AvroParquetSource
import io.eels.component.parquet.util.ParquetLogMute
import io.eels.schema._
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{Matchers, WordSpec}

class AvroParquetSourceTest extends WordSpec with Matchers {
  ParquetLogMute()

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(conf)

  private val personFile = Paths.get(getClass.getResource("/io/eels/component/parquet/person.avro.pq").toURI)
  private val resourcesDir = personFile.getParent

  "AvroParquetSource" should {
    "read schema" in {
      val people = AvroParquetSource(personFile)
      people.schema shouldBe StructType(
        Field("name", StringType, nullable = false),
        Field("job", StringType, nullable = false),
        Field("location", StringType, nullable = false)
      )
    }
    "read parquet files" in {
      val people = AvroParquetSource(personFile.toAbsolutePath()).toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    "read multiple parquet files using file expansion" in {
      import io.eels.FilePattern._
      val people = AvroParquetSource(s"${resourcesDir.toUri.toString}/*.pq").toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner"),
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    // todo add merge to parquet source
    "merge schemas" ignore {

      try {
        fs.delete(new Path("merge1.pq"), false)
      } catch {
        case t: Throwable =>
      }
      try {
        fs.delete(new Path("merge2.pq"), false)
      } catch {
        case t: Throwable =>
      }

      val schema1 = SchemaBuilder.builder().record("schema1").fields().requiredString("a").requiredDouble("b").endRecord()
      val schema2 = SchemaBuilder.builder().record("schema2").fields().requiredInt("a").requiredBoolean("c").endRecord()

      val writer1 = AvroParquetWriter.builder[GenericRecord](new Path("merge1.pq")).withSchema(schema1).build()
      val record1 = new GenericData.Record(schema1)
      record1.put("a", "aaaaa")
      record1.put("b", 124.3)
      writer1.write(record1)
      writer1.close()

      val writer2 = AvroParquetWriter.builder[GenericRecord](new Path("merge2.pq")).withSchema(schema2).build()
      val record2 = new GenericData.Record(schema2)
      record2.put("a", 111)
      record2.put("c", true)
      writer2.write(record2)
      writer2.close()

      ParquetSource(new Path("merge*")).schema shouldBe
        StructType(
          Field("a", StringType, nullable = false),
          Field("b", DoubleType, nullable = false),
          Field("c", BooleanType, nullable = false)
        )

      fs.delete(new Path(".merge1.pq.crc"), false)
      fs.delete(new Path(".merge2.pq.crc"), false)
      fs.delete(new Path("merge1.pq"), false)
      fs.delete(new Path("merge2.pq"), false)
    }
  }
}
Example 3
Source File: DStreamCheckpointData.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.internal.Logging
import org.apache.spark.streaming.Time
import org.apache.spark.util.Utils

private[streaming]
class DStreamCheckpointData[T: ClassTag](dstream: DStream[T])
  extends Serializable with Logging {
  protected val data = new HashMap[Time, AnyRef]()

  // Mapping of the batch time to the checkpointed RDD file of that time
  @transient private var timeToCheckpointFile = new HashMap[Time, String]
  // Mapping of the batch time to the time of the oldest checkpointed RDD
  // in that batch's checkpoint data
  @transient private var timeToOldestCheckpointFileTime = new HashMap[Time, Time]

  @transient private var fileSystem: FileSystem = null
  protected[streaming] def currentCheckpointFiles = data.asInstanceOf[HashMap[Time, String]]

  def restore() {
    // Create RDDs from the checkpoint data
    currentCheckpointFiles.foreach {
      case (time, file) =>
        logInfo("Restoring checkpointed RDD for time " + time + " from file '" + file + "'")
        dstream.generatedRDDs += ((time, dstream.context.sparkContext.checkpointFile[T](file)))
    }
  }

  override def toString: String = {
    "[\n" + currentCheckpointFiles.size + " checkpoint files \n" +
      currentCheckpointFiles.mkString("\n") + "\n]"
  }

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    logDebug(this.getClass().getSimpleName + ".writeObject used")
    if (dstream.context.graph != null) {
      dstream.context.graph.synchronized {
        if (dstream.context.graph.checkpointInProgress) {
          oos.defaultWriteObject()
        } else {
          val msg = "Object of " + this.getClass.getName + " is being serialized " +
            " possibly as a part of closure of an RDD operation. This is because " +
            " the DStream object is being referred to from within the closure. " +
            " Please rewrite the RDD operation inside this DStream to avoid this. " +
            " This has been enforced to avoid bloating of Spark tasks " +
            " with unnecessary objects."
          throw new java.io.NotSerializableException(msg)
        }
      }
    } else {
      throw new java.io.NotSerializableException(
        "Graph is unexpectedly null when DStream is being serialized.")
    }
  }

  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    logDebug(this.getClass().getSimpleName + ".readObject used")
    ois.defaultReadObject()
    timeToOldestCheckpointFileTime = new HashMap[Time, Time]
    timeToCheckpointFile = new HashMap[Time, String]
  }
}
Example 4
Source File: ExecutorSource.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.executor

import java.util.concurrent.ThreadPoolExecutor

import scala.collection.JavaConverters._

import com.codahale.metrics.{Gauge, MetricRegistry}
import org.apache.hadoop.fs.FileSystem

import org.apache.spark.metrics.source.Source

private[spark]
class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source {

  private def fileStats(scheme: String): Option[FileSystem.Statistics] =
    FileSystem.getAllStatistics.asScala.find(s => s.getScheme.equals(scheme))

  private def registerFileSystemStat[T](
      scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = {
    metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] {
      override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue)
    })
  }

  override val metricRegistry = new MetricRegistry()

  override val sourceName = "executor"

  // Gauge for executor thread pool's actively executing task counts
  metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] {
    override def getValue: Int = threadPool.getActiveCount()
  })

  // Gauge for executor thread pool's approximate total number of tasks that have been completed
  metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] {
    override def getValue: Long = threadPool.getCompletedTaskCount()
  })

  // Gauge for executor thread pool's current number of threads
  metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getPoolSize()
  })

  // Gauge for executor thread pool's largest number of threads that have ever simultaneously
  // been in the pool
  metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getMaximumPoolSize()
  })

  // Gauge for file system stats of this executor
  for (scheme <- Array("hdfs", "file")) {
    registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L)
    registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L)
    registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0)
    registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0)
    registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0)
  }
}
Example 5
Source File: SentenceTokenizer.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset.text

import java.io.FileInputStream
import java.net.{URI, URL}

import com.intel.analytics.bigdl.dataset.Transformer

import scala.collection.Iterator
import opennlp.tools.tokenize.{SimpleTokenizer, Tokenizer, TokenizerME, TokenizerModel}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

class SentenceTokenizer(tokenFile: Option[String] = None)
  extends Transformer[String, Array[String]] {

  var modelIn: FileInputStream = _
  var model: TokenizerModel = _
  var tokenizer: Tokenizer = _

  def this(tokenFile: URL) {
    this(Some(tokenFile.getPath))
  }

  def close(): Unit = {
    if (modelIn != null) {
      modelIn.close()
    }
  }

  override def apply(prev: Iterator[String]): Iterator[Array[String]] =
    prev.map(x => {
      if (tokenizer == null) {
        if (!tokenFile.isDefined) {
          tokenizer = SimpleTokenizer.INSTANCE
        } else {
          val src: Path = new Path(tokenFile.get)
          val fs = src.getFileSystem(new Configuration())
          val in = fs.open(src)
          model = new TokenizerModel(in)
          tokenizer = new TokenizerME(model)
        }
      }
      val words = tokenizer.tokenize(x)
      words
    })
}

object SentenceTokenizer {
  def apply(tokenFile: Option[String] = None): SentenceTokenizer =
    new SentenceTokenizer(tokenFile)

  def apply(tokenFile: URL): SentenceTokenizer =
    new SentenceTokenizer(tokenFile)
}
Example 6
Source File: SentenceSplitter.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset.text

import java.io.FileInputStream
import java.net.{URI, URL}

import com.intel.analytics.bigdl.dataset.Transformer
import opennlp.tools.sentdetect.{SentenceDetector, SentenceDetectorME, SentenceModel}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.collection.Iterator

class SentenceSplitter(sentFile: Option[String] = None)
  extends Transformer[String, Array[String]] {

  var modelIn: FileInputStream = _
  var model: SentenceModel = _
  var sentenceDetector: SentenceDetector = _

  def this(sentFileURL: URL) {
    this(Some(sentFileURL.getPath))
  }

  def this(sentFile: String) {
    this(Some(sentFile))
  }

  def close(): Unit = {
    if (modelIn != null) {
      modelIn.close()
    }
  }

  override def apply(prev: Iterator[String]): Iterator[Array[String]] =
    prev.map(x => {
      if (!sentFile.isDefined) {
        x.split('.')
      } else {
        if (sentenceDetector == null) {
          val src: Path = new Path(sentFile.get)
          val fs = src.getFileSystem(new Configuration())
          val in = fs.open(src)
          model = new SentenceModel(in)
          sentenceDetector = new SentenceDetectorME(model)
        }
        sentenceDetector.sentDetect(x)
      }
    })
}

object SentenceSplitter {
  def apply(sentFile: Option[String] = None): SentenceSplitter =
    new SentenceSplitter(sentFile)

  def apply(sentFileURL: URL): SentenceSplitter =
    new SentenceSplitter(sentFileURL)

  def apply(sentFile: String): SentenceSplitter =
    new SentenceSplitter(sentFile)
}
Example 7
Source File: RecordWriter.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.visualization.tensorboard

import java.io.{File, FileOutputStream}

import com.google.common.primitives.{Ints, Longs}
import com.intel.analytics.bigdl.utils.Crc32
import netty.Crc32c
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.tensorflow.util.Event

private[bigdl] class RecordWriter(file: Path, fs: FileSystem) {
  val outputStream = if (file.toString.startsWith("hdfs://")) {
    // FSDataOutputStream couldn't flush data to localFileSystem in time. So reading summaries
    // will throw exception.
    fs.create(file, true, 1024)
  } else {
    // Using FileOutputStream when write to local.
    new FileOutputStream(new File(file.toString))
  }
  val crc32 = new Crc32c()

  def write(event: Event): Unit = {
    val eventString = event.toByteArray
    val header = Longs.toByteArray(eventString.length.toLong).reverse
    outputStream.write(header)
    outputStream.write(Ints.toByteArray(Crc32.maskedCRC32(crc32, header).toInt).reverse)
    outputStream.write(eventString)
    outputStream.write(Ints.toByteArray(Crc32.maskedCRC32(crc32, eventString).toInt).reverse)
    if (outputStream.isInstanceOf[FSDataOutputStream]) {
      // Flush data to HDFS.
      outputStream.asInstanceOf[FSDataOutputStream].hflush()
    }
  }

  def close(): Unit = {
    outputStream.close()
  }
}
Example 8
Source File: FileReader.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.visualization.tensorboard

import java.io.BufferedInputStream
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.tensorflow.util.Event

import scala.collection.mutable.ArrayBuffer
import scala.util.matching.Regex

private[bigdl] object FileReader {
  val fileNameRegex = """bigdl.tfevents.*""".r

  def readScalar(file: Path, tag: String, fs: FileSystem): Array[(Long, Float, Double)] = {
    require(fs.isFile(file), s"FileReader: ${file} should be a file")
    val bis = new BufferedInputStream(fs.open(file))
    val longBuffer = new Array[Byte](8)
    val crcBuffer = new Array[Byte](4)
    val bf = new ArrayBuffer[(Long, Float, Double)]
    while (bis.read(longBuffer) > 0) {
      val l = ByteBuffer.wrap(longBuffer.reverse).getLong()
      bis.read(crcBuffer)
      // TODO: checksum
      // val crc1 = ByteBuffer.wrap(crcBuffer.reverse).getInt()
      val eventBuffer = new Array[Byte](l.toInt)
      bis.read(eventBuffer)
      val e = Event.parseFrom(eventBuffer)
      if (e.getSummary.getValueCount == 1 &&
        tag.equals(e.getSummary.getValue(0).getTag())) {
        bf.append((e.getStep, e.getSummary.getValue(0).getSimpleValue, e.getWallTime))
      }
      bis.read(crcBuffer)
      // val crc2 = ByteBuffer.wrap(crcBuffer.reverse).getInt()
    }
    bis.close()
    bf.toArray.sortWith(_._1 < _._1)
  }
}
Example 9
Source File: HdfsFileAccessor.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil
import org.archive.archivespark.sparkling.io.IOUtil

class HdfsFileAccessor(path: String, decompress: Boolean = true) extends CloseableDataAccessor[InputStream] {
  override def get: Option[InputStream] = {
    val fs = FileSystem.get(SparkHadoopUtil.get.conf)
    var stream: InputStream = null
    try {
      val raw = fs.open(new Path(path))
      stream = if (decompress) IOUtil.decompress(raw, Some(path)) else raw
      Some(stream)
    } catch {
      case e: Exception =>
        e.printStackTrace()
        if (stream != null) stream.close()
        None
    }
  }
}
Example 10
Source File: HdfsStreamAccessor.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.commons.io.input.BoundedInputStream
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

class HdfsStreamAccessor(location: HdfsLocationInfo) extends CloseableDataAccessor[InputStream] {
  override def get: Option[InputStream] = {
    if (location.length < 0 || location.offset < 0) None
    else {
      val fs = FileSystem.get(SparkHadoopUtil.get.conf)
      var stream: FSDataInputStream = null
      try {
        stream = fs.open(new Path(location.path))
        stream.seek(location.offset)
        Some(new BoundedInputStream(stream, location.length))
      } catch {
        case e: Exception =>
          e.printStackTrace()
          if (stream != null) stream.close()
          None
      }
    }
  }
}
Example 11
Source File: HdfsBlockStream.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.sparkling.io

import java.io.{ByteArrayInputStream, InputStream}

import org.apache.hadoop.fs.{FileSystem, Path}
import org.archive.archivespark.sparkling.logging.LogContext
import org.archive.archivespark.sparkling.util.Common

import scala.util.Try

class HdfsBlockStream(fs: FileSystem, file: String, offset: Long = 0, length: Long = -1, retries: Int = 60, sleepMillis: Int = 1000 * 60) extends InputStream {
  implicit val logContext: LogContext = LogContext(this)

  val path = new Path(file)
  val (blockSize: Int, fileSize: Long) = {
    val status = fs.getFileStatus(path)
    (status.getBlockSize.min(Int.MaxValue).toInt, status.getLen)
  }

  private var pos: Long = offset.max(0)
  private val max: Long = if (length > 0) fileSize.min(pos + length) else fileSize

  private val buffer = new Array[Byte](blockSize)
  private val emptyBlock = new ByteArrayInputStream(Array.emptyByteArray)
  private var block: ByteArrayInputStream = emptyBlock

  def ensureNextBlock(): InputStream = {
    if (block.available() == 0 && pos < max) {
      val end = pos + blockSize
      val blockLength = ((end - (end % blockSize)).min(max) - pos).toInt
      Common.retry(retries, sleepMillis, (retry, e) => {
        "File access failed (" + retry + "/" + retries + "): " + path + " (Offset: " + pos + ") - " + e.getMessage
      }) { retry =>
        val in = fs.open(path, blockLength)
        if (retry > 0) Try(in.seekToNewSource(pos))
        else if (pos > 0) in.seek(pos)
        var read = 0
        while (read < blockLength) read += in.read(buffer, read, blockLength - read)
        Try(in.close())
      }
      pos += blockLength
      block = new ByteArrayInputStream(buffer, 0, blockLength)
    }
    block
  }

  override def read(): Int = ensureNextBlock().read()

  override def read(b: Array[Byte]): Int = ensureNextBlock().read(b)

  override def read(b: Array[Byte], off: Int, len: Int): Int = ensureNextBlock().read(b, off, len)

  override def skip(n: Long): Long = {
    val available = block.available()
    if (n <= available) block.skip(n)
    else {
      block = emptyBlock
      val currentPos = pos - available
      val skip = n.min(max - currentPos)
      pos += skip - available
      skip
    }
  }

  override def available(): Int = block.available()

  override def close(): Unit = {}

  override def markSupported(): Boolean = false
}
Example 12
Source File: FilePathMap.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.util

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

import scala.util.Try

case class FilePathMap(path: String, patterns: Seq[String] = Seq.empty) {
  val pathMap: Map[String, String] = {
    var map = collection.mutable.Map[String, String]()

    val fs = FileSystem.get(SparkHadoopUtil.get.conf)
    val files = fs.listFiles(new Path(path), true)
    while (files.hasNext) {
      val path = files.next.getPath
      val filename = path.getName
      if (patterns.isEmpty || patterns.exists(filename.matches)) {
        if (map.contains(filename)) throw new RuntimeException("duplicate filename: " + filename)
        map += filename -> path.getParent.toString.intern
      }
    }

    map.toMap
  }

  def pathToFile(file: String): Option[Path] = Try { new Path(file).getName }.toOption match {
    case Some(f) => pathMap.get(f).map(dir => new Path(dir, f))
    case None => None
  }
}
Example 13
Source File: 2-CommonFunctions.scala From Azure-Databricks-NYC-Taxi-Workshop with MIT License | 5 votes |
// Databricks notebook source
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.hadoop.conf.Configuration

// COMMAND ----------

val prqShrinkageFactor = 0.19 //We found a saving in space of 81% with Parquet

// COMMAND ----------

def analyzeTables(databaseAndTable: String) {
  println("Table: " + databaseAndTable)
  println("....refresh table")
  sql("REFRESH TABLE " + databaseAndTable)
  println("....analyze table")
  sql("ANALYZE TABLE " + databaseAndTable + " COMPUTE STATISTICS")
  println("....done")
}

// COMMAND ----------

def calcOutputFileCountTxtToPrq(srcDataFile: String, targetedFileSizeMB: Int): Int = {
  val fs = FileSystem.get(new Configuration())
  val estFileCount: Int = Math.floor((fs.getContentSummary(new Path(srcDataFile)).getLength * prqShrinkageFactor) / (targetedFileSizeMB * 1024 * 1024)).toInt
  if (estFileCount == 0) 1 else estFileCount
}

// COMMAND ----------

// Get recursive file collection you can iterate on
def getRecursiveFileCollection(directoryPath: String): Seq[String] =
  dbutils.fs.ls(directoryPath).map(directoryItem => {
    // Work around double encoding bug
    val directoryItemPath = directoryItem.path.replace("%25", "%").replace("%25", "%")
    if (directoryItem.isDir) getRecursiveFileCollection(directoryItemPath) else Seq[String](directoryItemPath)
  }).reduce(_ ++ _)

// COMMAND ----------

// Delete residual files from job operation (_SUCCESS, _start*, _committed*)
def recursivelyDeleteSparkJobFlagFiles(directoryPath: String) {
  getRecursiveFileCollection(directoryPath).foreach(directoryItemPath => {
    if (directoryItemPath.indexOf("parquet") == -1) {
      println("Deleting...." + directoryItemPath)
      dbutils.fs.rm(directoryItemPath)
    }
  })
}

// COMMAND ----------

dbutils.notebook.exit("Pass")
Example 14
Source File: TopWORDSApp.scala From topwords with GNU General Public License v3.0 | 5 votes |
package io.github.qf6101.topwords import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.log4j.Logger import org.apache.spark.sql.SparkSession object TopWORDSApp extends Serializable { @transient private[this] val LOGGER = Logger.getLogger(this.getClass.toString) def main(args: Array[String]) { // setup spark session val spark = SparkSession.builder().getOrCreate() try { TopWORDSParser.parse(args).foreach { args => // remove output location files if exist val files = FileSystem.get(spark.sparkContext.hadoopConfiguration) if (files.exists(new Path(args.outputLoc))) files.delete(new Path(args.outputLoc), true) // read input corpus val corpus = if (args.numPartitions > 0) spark.sparkContext.textFile(args.inputLoc).repartition(args.numPartitions) else spark.sparkContext.textFile(args.inputLoc) LOGGER.info("Number of lines of input corpus: " + corpus.count()) // run TopWORDS with the parsed arguments new TopWORDS( tauL = args.tauL, tauF = args.tauF, textLenThld = args.textLenThld, useProbThld = args.useProbThld, numIterations = args.numIterations, convergeTol = args.convergeTol, wordBoundaryThld = args.wordBoundaryThld) .run(corpus, args.outputLoc + "/dictionary", args.outputLoc + "/segmented_texts") } //exit normally LOGGER.info("Running TopWORDS successfully!") if (spark.sparkContext.master.contains("local")) sys.exit(0) } catch { case ex: Throwable => LOGGER.error("Running TopWORDS fail!", ex) //signal to external process if (spark.sparkContext.master.contains("local")) sys.exit(1) } finally spark.stop() } }
Example 15
Source File: TestTopWORDS.scala From topwords with GNU General Public License v3.0 | 5 votes |
package io.github.qf6101.topwords

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession

object TestTopWORDS {
  def main(args: Array[String]) {
    // setup spark session
    val spark = SparkSession.builder().master("local[1]").appName(this.getClass.toString).getOrCreate()
    val inputFile = "test_data/story_of_stone.txt"
    val outputFile = "test_data/test_output"
    // remove output location files if exist
    val files = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    if (files.exists(new Path(outputFile))) files.delete(new Path(outputFile), true)
    val corpus = spark.sparkContext.textFile(inputFile)
    new TopWORDS(
      tauL = 10,
      tauF = 5,
      textLenThld = 2000,
      useProbThld = 1E-8,
      numIterations = 10,
      convergeTol = 1E-3,
      wordBoundaryThld = 0.0)
      .run(corpus, outputFile + "/dictionary", outputFile + "/segmented_texts")
  }
}
Example 16
Source File: PostUrl.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.http import java.io.{BufferedReader, InputStreamReader} import java.net.URI import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.commons.httpclient.HttpClient import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.http.client.methods.HttpPost import org.apache.http.entity.StringEntity import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.sql.SparkSession class PostUrl extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) override val description: String = "Send a post request to the specified http" var url : String= _ var jsonPath : String = _ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val spark = pec.get[SparkSession]() //read json from hdfs val conf = new Configuration() val fs = FileSystem.get(URI.create(jsonPath),conf) val stream: FSDataInputStream = fs.open(new Path(jsonPath)) val bufferReader = new BufferedReader(new InputStreamReader(stream)) var lineTxt = bufferReader.readLine() val buffer = new StringBuffer() while (lineTxt != null ){ buffer.append(lineTxt.mkString) lineTxt=bufferReader.readLine() } // post val client = HttpClients.createDefault() val httpClient = new HttpClient() httpClient.getParams().setContentCharset("utf-8") val post = new HttpPost(url) post.addHeader("content-Type","application/json") post.setEntity(new StringEntity(buffer.toString)) val response = client.execute(post) val entity = response.getEntity val str = EntityUtils.toString(entity,"UTF-8") println("Code is " + str) } override def setProperties(map: Map[String, Any]): Unit = { url = MapUtil.get(map,key="url").asInstanceOf[String] jsonPath = MapUtil.get(map,key="jsonPath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val url = new PropertyDescriptor() .name("url") .displayName("Url") .defaultValue("") .description("http request address") .required(true) .example("http://master:8002/flow/start") val jsonPath = new PropertyDescriptor() .name("jsonPath") .displayName("JsonPath") .defaultValue("") .description("json parameter path for post request") .required(true) .example("hdfs://master:9000/work/flow.json") descriptor = url :: descriptor descriptor = jsonPath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/http/PostUrl.png") } override def getGroup(): List[String] = { List(StopGroup.HttpGroup.toString) } override def initialize(ctx: ProcessContext): Unit = { } }
Example 17
Source File: Pathway.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.microorganism import java.io.{BufferedReader, InputStreamReader, OutputStreamWriter} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.json.JSONObject class Pathway extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse Pathway data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/pathway").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/Pathway.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val inDf: DataFrame = in.read() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val configuration: Configuration = new Configuration() val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/pathwayCache/pathwayCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) var fdis: FSDataInputStream = null var br: BufferedReader = null var doc: JSONObject = null var hasAnotherSequence:Boolean = true inDf.collect().foreach(row => { pathStr = row.get(0).asInstanceOf[String] fdis = fs.open(new Path(pathStr)) br = new BufferedReader(new InputStreamReader(fdis)) var count = 0 while (hasAnotherSequence) { count += 1 doc = new JSONObject hasAnotherSequence = util.KeggPathway.process(br, doc) doc.write(hdfsWriter) hdfsWriter.write("\n") } br.close() fdis.close() }) hdfsWriter.close() val df: DataFrame = pec.get[SparkSession]().read.json(hdfsPathTemporary) df.schema.printTreeString() println(df.count) out.write(df) } }
Example 18
Source File: PDBData.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.microorganism import java.io._ import cn.piflow.bundle.microorganism.util.PDB import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator} import org.json.JSONObject class PDBData extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse PDB data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session = pec.get[SparkSession]() val inDf: DataFrame = in.read() val configuration: Configuration = new Configuration() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/PDBCache/PDBCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) var doc: JSONObject = null var pdb: PDB = null var count:Int=0 inDf.collect().foreach(row => { count += 1 pathStr = row.get(0).asInstanceOf[String] pdb = new PDB(pathStr,fs) doc = pdb.getDoc doc.write(hdfsWriter) hdfsWriter.write("\n") doc = null }) hdfsWriter.close() val df: DataFrame = session.read.json(hdfsPathTemporary) out.write(df) } def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/PDB").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/PDBData.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } }
Example 19
Source File: Ensembl.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.microorganism import java.io._ import cn.piflow.bundle.microorganism.util.ParserGff3Data import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator} import org.json.JSONObject class Ensembl extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse ensembl data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/ensembl").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/Ensembl.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session = pec.get[SparkSession]() val inDf: DataFrame = in.read() val configuration: Configuration = new Configuration() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/ensemblCache/ensemblCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) val parser: ParserGff3Data = new ParserGff3Data var fdis: FSDataInputStream =null var br: BufferedReader = null var doc: JSONObject = null var count:Int = 0 inDf.collect().foreach(row => { pathStr = row.get(0).asInstanceOf[String] fdis = fs.open(new Path(pathStr)) br = new BufferedReader(new InputStreamReader(fdis)) var eachStr:String=null while((eachStr = br.readLine()) != null && eachStr != null ){ doc = parser.parserGff3(eachStr) if(doc.toString.length > 2){ count += 1 doc.write(hdfsWriter) hdfsWriter.write("\n") } } br.close() fdis.close() }) hdfsWriter.close() out.write(session.read.json(hdfsPathTemporary)) } }
Example 20
Source File: MergeStrategySpec.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package daf.filesystem import java.io.{ Closeable, InputStream } import java.util.Scanner import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FSDataInputStream, FSDataOutputStream, FileSystem, Path } import org.scalatest.{ BeforeAndAfterAll, Matchers, WordSpec } import scala.collection.convert.decorateAsScala._ import scala.util.{ Random, Try } class MergeStrategySpec extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val fileSystem = FileSystem.getLocal(new Configuration) private val numFiles = 10 private val baseDir = "test-dir".asHadoop private val workingDir = baseDir / f"merge-strategy-spec-${Random.nextInt(10000)}%05d" private def safely[A <: Closeable, U](f: A => U) = { stream: A => val attempt = Try { f(stream) } stream.close() attempt } private def readFile(path: Path) = safely[FSDataInputStream, Seq[String]] { _.scanner.asScala.toSeq } apply fileSystem.open(path) private def readFiles = Try { fileSystem.listStatus(workingDir).toSeq.flatMap { status => readFile(status.getPath).get } } private def openFiles = Try { fileSystem.listStatus(workingDir).toSeq.map { status => fileSystem.open(status.getPath) } } private def createFile(fileName: String) = safely[FSDataOutputStream, Unit] { stream => Random.alphanumeric.grouped(200).take(10).map { randomSplits(_) }.foreach { row => stream.writeUTF { row.mkString("", ",", "\n") } } } apply fileSystem.create { workingDir / fileName } private def randomSplits(chars: Stream[Char], strings: Seq[String] = Seq.empty): Seq[String] = chars.splitAt { Random.nextInt(10) + 5 } match { case (head, tail) if tail.isEmpty => head.drop(1).mkString +: strings case (head, tail) => randomSplits(tail, head.mkString +: strings) } private def createWorkingDir = Try { fileSystem.mkdirs(workingDir) } private def createFiles = Try { 0 until numFiles foreach { index => createFile(s"test-file-$index").get } // this is relatively nasty, and should be handled in a `traverse` } private def prepareData = for { _ <- createWorkingDir _ <- createFiles } yield () private def purgeData = Try { fileSystem.delete(workingDir, true) } override def beforeAll() = prepareData.get override def afterAll() = purgeData.get "MergeStrategies info" when { "given compressed format files" must { "throw an exception" in { an[IllegalArgumentException] must be thrownBy MergeStrategies.find { FileInfo(workingDir / "test-file-0", 0, FileDataFormats.raw, FileCompressionFormats.gzip) } } } "given data as csv" must { "drop one line and merge the rest" in { safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt => for { merged <- attempt expected <- readFiles } merged.size should be { expected.size - numFiles + 1 } } apply MergeStrategies.csv.merge { openFiles.get } } } "given data as json" must { "just merge the files into one" in { safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt => for { merged <- attempt expected <- readFiles } merged.size should be { expected.size } } apply MergeStrategies.json.merge { openFiles.get } } } } }
Example 21
Source File: HDFSBase.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package daf.util import better.files.{ File, _ } import daf.util.DataFrameClasses.{ Address, Person } import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hdfs.{ HdfsConfiguration, MiniDFSCluster } import org.apache.hadoop.test.PathUtils import org.apache.spark.sql.{ SaveMode, SparkSession } import org.scalatest.{ BeforeAndAfterAll, FlatSpec, Matchers } import org.slf4j.LoggerFactory import scala.util.{ Failure, Random, Try } abstract class HDFSBase extends FlatSpec with Matchers with BeforeAndAfterAll { var miniCluster: Try[MiniDFSCluster] = Failure[MiniDFSCluster](new Exception) var fileSystem: Try[FileSystem] = Failure[FileSystem](new Exception) val sparkSession: SparkSession = SparkSession.builder().master("local").getOrCreate() val alogger = LoggerFactory.getLogger(this.getClass) val (testDataPath, confPath) = { val testDataPath = s"${PathUtils.getTestDir(this.getClass).getCanonicalPath}/MiniCluster" val confPath = s"$testDataPath/conf" ( testDataPath.toFile.createIfNotExists(asDirectory = true, createParents = false), confPath.toFile.createIfNotExists(asDirectory = true, createParents = false) ) } def pathAvro = "opendata/test.avro" def pathParquet = "opendata/test.parquet" def pathCsv = "opendata/test.csv" def getSparkSession = sparkSession override def beforeAll(): Unit = { val conf = new HdfsConfiguration() conf.setBoolean("dfs.permissions", true) System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA) conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, testDataPath.pathAsString) //FileUtil.fullyDelete(testDataPath.toJava) conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.groups", "*") conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.hosts", "*") val builder = new MiniDFSCluster.Builder(conf) miniCluster = Try(builder.build()) fileSystem = miniCluster.map(_.getFileSystem) fileSystem.foreach(fs => { val confFile: File = confPath / "hdfs-site.xml" for { os <- confFile.newOutputStream.autoClosed } fs.getConf.writeXml(os) }) writeDf() } override def afterAll(): Unit = { miniCluster.foreach(_.shutdown(true)) val _ = testDataPath.parent.parent.delete(true) sparkSession.stop() } private def writeDf(): Unit = { import sparkSession.implicits._ alogger.info(s"TestDataPath ${testDataPath.toJava.getAbsolutePath}") alogger.info(s"ConfPath ${confPath.toJava.getAbsolutePath}") val persons = (1 to 10).map(i => Person(s"Andy$i", Random.nextInt(85), Address("Via Ciccio Cappuccio"))) val caseClassDS = persons.toDS() caseClassDS.write.format("parquet").mode(SaveMode.Overwrite).save(pathParquet) caseClassDS.write.format("com.databricks.spark.avro").mode(SaveMode.Overwrite).save(pathAvro) //writing directly the Person dataframe generates an exception caseClassDS.toDF.select("name", "age").write.format("csv").mode(SaveMode.Overwrite).option("header", "true").save(pathCsv) } } object DataFrameClasses { final case class Address(street: String) final case class Person(name: String, age: Int, address: Address) }
Example 22
Source File: HDFSMiniCluster.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package it.teamdigitale.miniclusters import better.files.File import org.apache.logging.log4j.LogManager import org.apache.hadoop.hdfs.HdfsConfiguration import org.apache.hadoop.test.PathUtils import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hdfs.{HdfsConfiguration, MiniDFSCluster} import better.files._ import scala.util.{Failure, Try} class HDFSMiniCluster extends AutoCloseable { val alogger = LogManager.getLogger(this.getClass) var hdfsCluster: Try[MiniDFSCluster] = Failure[MiniDFSCluster](new Exception) var fileSystem: Try[FileSystem] = Failure[FileSystem](new Exception) val (testDataPath, confPath) = { val testDataPath = s"${PathUtils.getTestDir(classOf[HDFSMiniCluster]).getCanonicalPath}/MiniCluster" val confPath = s"$testDataPath/conf" ( testDataPath.toFile.createIfNotExists(asDirectory = true, createParents = false), confPath.toFile.createIfNotExists(asDirectory = true, createParents = false) ) } def start(): Unit = { alogger.info("Starting HDFS mini cluster") val conf = new HdfsConfiguration() conf.setBoolean("dfs.permissions", true) System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA) conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, testDataPath.pathAsString) //FileUtil.fullyDelete(testDataPath.toJava) conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.groups", "*") conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.hosts", "*") val builder = new MiniDFSCluster.Builder(conf) hdfsCluster = Try(builder.build()) fileSystem = hdfsCluster.map(_.getFileSystem) fileSystem.foreach(fs => { val confFile: File = confPath / "hdfs-site.xml" for {os <- confFile.newOutputStream.autoClosed} fs.getConf.writeXml(os) }) } override def close() = { alogger.info("Stopping HDFS mini cluster") hdfsCluster.foreach(_.shutdown(true)) val _ = testDataPath.parent.parent.delete(true) } }
Example 23
Source File: HDFSUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.hadoop.common.utils

import java.io.File
import java.nio.file.Paths
import java.security.PrivilegedExceptionAction

import com.webank.wedatasphere.linkis.common.conf.Configuration.hadoopConfDir
import com.webank.wedatasphere.linkis.hadoop.common.conf.HadoopConf._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.UserGroupInformation

object HDFSUtils {

  def getConfiguration(user: String): Configuration = getConfiguration(user, hadoopConfDir)

  def getConfiguration(user: String, hadoopConfDir: String): Configuration = {
    val confPath = new File(hadoopConfDir)
    if (!confPath.exists() || confPath.isFile) {
      throw new RuntimeException(s"Create hadoop configuration failed, path $hadoopConfDir not exists.")
    }
    val conf = new Configuration()
    conf.addResource(new Path(Paths.get(hadoopConfDir, "core-site.xml").toAbsolutePath.toFile.getAbsolutePath))
    conf.addResource(new Path(Paths.get(hadoopConfDir, "hdfs-site.xml").toAbsolutePath.toFile.getAbsolutePath))
    conf.addResource(new Path(Paths.get(hadoopConfDir, "yarn-site.xml").toAbsolutePath.toFile.getAbsolutePath))
    conf
  }

  def getHDFSRootUserFileSystem: FileSystem =
    getHDFSRootUserFileSystem(getConfiguration(HADOOP_ROOT_USER.getValue))

  def getHDFSRootUserFileSystem(conf: org.apache.hadoop.conf.Configuration): FileSystem =
    getHDFSUserFileSystem(HADOOP_ROOT_USER.getValue, conf)

  def getHDFSUserFileSystem(userName: String): FileSystem =
    getHDFSUserFileSystem(userName, getConfiguration(userName))

  def getHDFSUserFileSystem(userName: String, conf: org.apache.hadoop.conf.Configuration): FileSystem =
    getUserGroupInformation(userName)
      .doAs(new PrivilegedExceptionAction[FileSystem] {
        def run = FileSystem.get(conf)
      })

  def getUserGroupInformation(userName: String): UserGroupInformation = {
    if (KERBEROS_ENABLE.getValue) {
      val path = new File(KEYTAB_FILE.getValue, userName + ".keytab").getPath
      val user = getKerberosUser(userName)
      UserGroupInformation.setConfiguration(getConfiguration(userName))
      UserGroupInformation.loginUserFromKeytabAndReturnUGI(user, path)
    } else {
      UserGroupInformation.createRemoteUser(userName)
    }
  }

  def getKerberosUser(userName: String): String = {
    var user = userName
    if (KEYTAB_HOST_ENABLED.getValue) {
      user = user + "/" + KEYTAB_HOST.getValue
    }
    user
  }
}
Example 24
Source File: DeltaOutputFormat.scala From connectors with Apache License 2.0 | 5 votes |
package io.delta.hive

import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.io.{ArrayWritable, NullWritable}
import org.apache.hadoop.mapred.{JobConf, OutputFormat, RecordWriter}
import org.apache.hadoop.util.Progressable

class DeltaOutputFormat extends OutputFormat[NullWritable, ArrayWritable] {

  private def writingNotSupported[T](): T = {
    throw new UnsupportedOperationException(
      "Writing to a Delta table in Hive is not supported. Please use Spark to write.")
  }

  override def getRecordWriter(
    ignored: FileSystem,
    job: JobConf,
    name: String,
    progress: Progressable): RecordWriter[NullWritable, ArrayWritable] = writingNotSupported()

  override def checkOutputSpecs(ignored: FileSystem, job: JobConf): Unit = writingNotSupported()
}
Example 25
Source File: ModelSource.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common

import java.io.{InputStreamReader, BufferedReader}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, FileSystem}

case class ModelSource(
  root: String,
  fs: FileSystem
) {

  def readFile(path: String): String = {
    val fsPath = filePath(path)
    val reader = new BufferedReader(new InputStreamReader(fs.open(fsPath)))

    val builder = new StringBuilder()
    var line: String = null
    while ({ line = reader.readLine(); line != null }) {
      builder.append(line + "\n")
    }
    builder.mkString
  }

  def findFile(dir: String, recursive: Boolean, f: String => Boolean): Option[Path] = {
    val dirPath = filePath(dir)
    if (fs.exists(dirPath) & fs.isDirectory(dirPath)) {
      val iter = fs.listFiles(dirPath, recursive)
      while (iter.hasNext) {
        val st = iter.next()
        if (st.isFile && f(st.getPath.getName)) return Some(st.getPath)
      }
      None
    } else {
      None
    }
  }

  def filePath(path: String): Path = {
    new Path(s"$root/$path")
  }
}

object ModelSource {

  def local(path: String): ModelSource = {
    ModelSource(path, FileSystem.getLocal(new Configuration()))
  }

  def hadoop(path: String, conf: Configuration): ModelSource = {
    val fs = FileSystem.get(conf)
    ModelSource(path, fs)
  }
}
Example 26
Source File: Util.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util

import java.io._
import java.util.concurrent.atomic.AtomicInteger
import java.util.zip.{DeflaterOutputStream, InflaterInputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.HBaseConfiguration

object Util {
  val iteration = new AtomicInteger(0)

  def getTempFilePath(conf: Configuration, prefix: String): String = {
    val fileSystem = FileSystem.get(conf)
    val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}")
    if (fileSystem.exists(path)) {
      fileSystem.delete(path, true)
    }
    path.getName
  }

  def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = {
    val bos = new ByteArrayOutputStream
    val deflaterOutputStream = new DeflaterOutputStream(bos)
    val dos = new DataOutputStream(deflaterOutputStream)
    configuration.write(dos)
    dos.close()
    bos.toByteArray
  }

  def deserializeHBaseConfiguration(arr: Array[Byte]) = {
    val conf = HBaseConfiguration.create
    conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr))))
    conf
  }
}
Example 27
Source File: FilePattern.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels

import com.sksamuel.exts.Logging
import io.eels.util.HdfsIterator
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.language.implicitConversions

object FilePattern {
  def apply(path: Path)(implicit fs: FileSystem): FilePattern = apply(path.toString())
  def apply(path: java.nio.file.Path)(implicit fs: FileSystem): FilePattern =
    apply(path.toAbsolutePath().toString(), { _ => true })
  implicit def stringToFilePattern(str: String)(implicit fs: FileSystem): FilePattern = FilePattern(str)
}

case class FilePattern(pattern: String,
                       filter: org.apache.hadoop.fs.Path => Boolean = { _ => true }) extends Logging {

  def isRegex(): Boolean = pattern.contains("*")
  def isDirectory(): Boolean = pattern.endsWith("/")

  def toPaths()(implicit fs: FileSystem): List[Path] = {
    val paths = if (isRegex) {
      val regex = new Path(pattern).getName.replace("*", ".*?")
      val dir = new Path(pattern).getParent
      logger.debug(s"File expansion will check path $dir for files matching $regex")
      HdfsIterator.remote(fs.listFiles(dir, false)).toList
        .map(_.getPath)
        .filter { path => path.getName.matches(regex) }
        .filter(filter)
    } else if (fs.isDirectory(new Path(pattern))) {
      val path = new Path(pattern.stripSuffix("/"))
      logger.debug(s"File expansion will search directory $path")
      HdfsIterator.remote(fs.listFiles(path, false)).map(_.getPath).toList.filter(fs.isFile).filter(filter)
    } else {
      List(new Path(pattern))
    }
    logger.debug(s"toPaths has returned ${paths.size} paths, first 5: ${paths.take(5).mkString(",")}")
    paths
  }

  def withFilter(p: Path => Boolean): FilePattern = copy(filter = p)
}
Example 28
Source File: HdfsMkdir.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.util

import org.apache.hadoop.fs.{FileSystem, Path}

object HdfsMkdir {
  def apply(path: Path, inheritPermissionsDefault: Boolean)(implicit fs: FileSystem): Unit = {
    if (!fs.exists(path)) {
      // iterate through the parents until we hit a parent that exists, then take that, which will give
      // us the first folder that exists
      val parent = Iterator.iterate(path)(_.getParent).dropWhile(false == fs.exists(_)).take(1).toList.head
      // using the folder that exists, get its permissions
      val permission = fs.getFileStatus(parent).getPermission
      fs.create(path, false)
      if (inheritPermissionsDefault)
        fs.setPermission(path, permission)
    }
  }
}
Example 29
Source File: HdfsOps.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels

import com.sksamuel.exts.Logging
import io.eels.util.{HdfsIterator, PathIterator}
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}

object HdfsOps extends Logging {

  def makePathVisible(path: Path)(implicit fs: FileSystem): Unit = {
    if (path.getName.startsWith(".")) {
      logger.info(s"Making $path visible by stripping leading .")
      val dest = new Path(path.getParent, path.getName.drop(1))
      fs.rename(path, dest)
    }
  }

  def findFiles(path: Path, recursive: Boolean, fs: FileSystem): Iterator[LocatedFileStatus] = {
    HdfsIterator.remote(fs.listFiles(path, recursive))
  }

  def mkdirsp(path: Path, fs: FileSystem): Boolean = PathIterator(path).forall(fs.mkdirs)
}
Example 30
Source File: AvroSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.io.File import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} case class AvroSource(path: Path) (implicit conf: Configuration, fs: FileSystem) extends Source with Using { override lazy val schema: StructType = { using(AvroReaderFns.createAvroReader(path)) { reader => val record = reader.next() AvroSchemaFns.fromAvroSchema(record.getSchema) } } override def parts(): Seq[Publisher[Seq[Row]]] = Seq(AvroSourcePublisher(path)) } case class AvroSourcePublisher(path: Path) (implicit conf: Configuration, fs: FileSystem) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { val deserializer = new AvroDeserializer() try { using(AvroReaderFns.createAvroReader(path)) { reader => val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) AvroRecordIterator(reader) .takeWhile(_ => running.get) .map(deserializer.toRow) .grouped(DataStream.DefaultBatchSize) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } } object AvroSource { def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSource = AvroSource(new Path(file.getAbsoluteFile.toString)) def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSource = apply(path.toFile) }
Example 31
Source File: AvroSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.io.File import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} case class AvroSink(path: Path, overwrite: Boolean = false, permission: Option[FsPermission] = None, inheritPermissions: Option[Boolean] = None) (implicit conf: Configuration, fs: FileSystem) extends Sink { def withOverwrite(overwrite: Boolean): AvroSink = copy(overwrite = overwrite) def withPermission(permission: FsPermission): AvroSink = copy(permission = Option(permission)) def withInheritPermission(inheritPermissions: Boolean): AvroSink = copy(inheritPermissions = Option(inheritPermissions)) override def open(schema: StructType): SinkWriter = new SinkWriter { private val writer = new AvroWriter(schema, fs.create(path, overwrite)) override def write(row: Row): Unit = writer.write(row) override def close(): Unit = { writer.close() permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } } object AvroSink { def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSink = AvroSink(new Path(file.getAbsoluteFile.toString)) def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSink = apply(path.toFile) }
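A write-side sketch that mirrors the tests later on this page, building a tiny in-memory DataStream; the schema, values and output path are illustrative only.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import io.eels.component.avro.AvroSink
import io.eels.datastream.DataStream
import io.eels.schema.StructType
object AvroSinkUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val ds = DataStream.fromValues(StructType("name", "location"), Seq(Vector("sam", "aylesbury")))
  ds.to(AvroSink(new Path("people.avro")).withOverwrite(true))
}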
Example 32
Source File: JsonSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.json import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.fs.{FileSystem, Path} case class JsonSink(path: Path)(implicit fs: FileSystem) extends Sink { override def open(schema: StructType): SinkWriter = new SinkWriter { private val lock = new AnyRef() private val out = fs.create(path) val mapper = new ObjectMapper with ScalaObjectMapper mapper.registerModule(DefaultScalaModule) override def write(row: Row) { val map = schema.fieldNames.zip(row.values).toMap val json = mapper.writeValueAsString(map) lock.synchronized { out.writeBytes(json) out.writeBytes("\n") } } override def close() { out.close() } } }
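Usage follows the same Sink pattern. The sketch below is illustrative, not project code; each row ends up as one JSON document per line.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import io.eels.component.json.JsonSink
import io.eels.datastream.DataStream
import io.eels.schema.{Field, StructType}
object JsonSinkUsage extends App {
  implicit val fs = FileSystem.get(new Configuration())
  val schema = StructType(Field("name"), Field("location"))
  val ds = DataStream.fromValues(schema, Seq(Vector("sam", "aylesbury"), Vector("ham", "buckingham")))
  ds.to(JsonSink(new Path("people.json")))   // writes {"name":"sam","location":"aylesbury"} etc., one per line
}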
Example 33
Source File: AvroParquetSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.avro.{AvroSchemaFns, AvroSchemaMerge} import io.eels.component.parquet._ import io.eels.datastream.Publisher import io.eels.schema.StructType import io.eels.{FilePattern, Predicate, _} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{Footer, ParquetFileReader} import scala.collection.JavaConverters._ object AvroParquetSource { def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(new Path(uri.toString))) def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(path)) } case class AvroParquetSource(pattern: FilePattern, predicate: Option[Predicate] = None) (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using { private lazy val paths = pattern.toPaths() def withPredicate(pred: Predicate): AvroParquetSource = copy(predicate = pred.some) // the schema returned by the parquet source should be a merged version of the // schemas contained in all the files. override def schema: StructType = { val schemas = paths.map { path => using(AvroParquetReaderFn.apply(path, predicate, None)) { reader => val record = Option(reader.read()).getOrElse { sys.error(s"Cannot read $path for schema; file contains no records") } record.getSchema } } val avroSchema = AvroSchemaMerge("record", "namspace", schemas) AvroSchemaFns.fromAvroSchema(avroSchema) } // returns the count of all records in this source, predicate is ignored def countNoPredicate(): Long = statistics().count // returns stats, predicate is ignored def statistics(): Statistics = { if (paths.isEmpty) Statistics.Empty else { paths.foldLeft(Statistics.Empty) { (stats, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.foldLeft(stats) { (stats, block) => stats.copy( count = stats.count + block.getRowCount, compressedSize = stats.compressedSize + block.getCompressedSize, uncompressedSize = stats.uncompressedSize + block.getTotalByteSize ) } } } } override def parts(): Seq[Publisher[Seq[Row]]] = { logger.debug(s"AvroParquetSource source has ${paths.size} files: $paths") paths.map { it => new AvroParquetPublisher(it, predicate) } } def footers(): List[Footer] = { logger.debug(s"AvroParquetSource source will read footers from $paths") paths.flatMap { it => val status = fs.getFileStatus(it) logger.debug(s"status=$status; path=$it") ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala } } }
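A reading sketch, assuming a glob of parquet files (the path is invented): the source merges the Avro schemas of all matching files and can report footer-level statistics without reading any rows.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import io.eels.component.parquet.avro.AvroParquetSource
object AvroParquetSourceUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val source = AvroParquetSource(new Path("hdfs:///warehouse/events/*.parquet"))
  println(source.schema)         // merged schema across all matching files
  println(source.statistics())   // row count and byte sizes taken from the parquet footers
}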
Example 34
Source File: AvroParquetRowWriter.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.typesafe.config.{Config, ConfigFactory} import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.fs.{FileSystem, Path} class AvroParquetRowWriter(path: Path, avroSchema: Schema)(implicit fs: FileSystem) extends Logging { private val config: Config = ConfigFactory.load() private val skipCrc = config.getBoolean("eel.parquet.skipCrc") logger.info(s"Parquet writer will skipCrc = $skipCrc") private val writer = AvroParquetWriterFn(path, avroSchema) def write(record: GenericRecord): Unit = { writer.write(record) } def close(): Unit = { writer.close() if (skipCrc) { val crc = new Path("." + path.toString() + ".crc") logger.debug(s"Deleting crc $crc") if (fs.exists(crc)) fs.delete(crc, false) } } }
Example 35
Source File: AvroParquetSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.component.avro.{AvroSchemaFns, RowSerializer} import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.fs.{FileSystem, Path} object AvroParquetSink { def apply(path: String)(implicit fs: FileSystem): AvroParquetSink = AvroParquetSink(new Path(path)) } case class AvroParquetSink(path: Path, overwrite: Boolean = false)(implicit fs: FileSystem) extends Sink with Logging { def withOverwrite(overwrite: Boolean): AvroParquetSink = copy(overwrite = overwrite) override def open(schema: StructType): SinkWriter = new SinkWriter { private val config = ConfigFactory.load() private val caseSensitive = config.getBoolean("eel.parquet.caseSensitive") if (overwrite && fs.exists(path)) fs.delete(path, false) private val avroSchema = AvroSchemaFns.toAvroSchema(schema, caseSensitive = caseSensitive) private val writer = new AvroParquetRowWriter(path, avroSchema) private val serializer = new RowSerializer(avroSchema) override def write(row: Row): Unit = { this.synchronized { val record = serializer.serialize(row) writer.write(record) } } override def close(): Unit = { writer.close() } } }
Example 36
Source File: ParquetSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.datastream.Publisher import io.eels.{Predicate, _} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{Footer, ParquetFileReader} import scala.collection.JavaConverters._ object ParquetSource { def apply(string: String)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(string)) def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(new Path(uri.toString))) def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(path)) } case class ParquetSource(pattern: FilePattern, predicate: Option[Predicate] = None, projection: Seq[String] = Nil, dictionaryFiltering: Boolean = true, caseSensitive: Boolean = true) (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using { logger.debug(s"Created parquet source with pattern=$pattern") lazy val paths: List[Path] = pattern.toPaths() def withDictionaryFiltering(dictionary: Boolean): ParquetSource = copy(dictionaryFiltering = dictionary) def withCaseSensitivity(caseSensitive: Boolean): ParquetSource = copy(caseSensitive = caseSensitive) def withPredicate(pred: => Predicate): ParquetSource = copy(predicate = pred.some) def withProjection(first: String, rest: String*): ParquetSource = withProjection(first +: rest) def withProjection(fields: Seq[String]): ParquetSource = { require(fields.nonEmpty) copy(projection = fields.toList) } // returns the metadata in the parquet file, or an empty map if none def metadata(): Map[String, String] = { paths.foldLeft(Map.empty[String, String]) { (metadata, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) metadata ++ footer.getFileMetaData.getKeyValueMetaData.asScala } } // todo should take the merged schema from all files lazy val schema: StructType = RowParquetReaderFn.schema(paths.headOption.getOrError("No paths found for source")) // returns the count of all records in this source, predicate is ignored def countNoPredicate(): Long = statistics().count // returns stats, predicate is ignored def statistics(): Statistics = { if (paths.isEmpty) Statistics.Empty else { paths.foldLeft(Statistics.Empty) { (stats, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.foldLeft(stats) { (stats, block) => stats.copy( count = stats.count + block.getRowCount, compressedSize = stats.compressedSize + block.getCompressedSize, uncompressedSize = stats.uncompressedSize + block.getTotalByteSize ) } } } } override def parts(): Seq[Publisher[Seq[Row]]] = { logger.debug(s"Parquet source has ${paths.size} files: ${paths.mkString(", ")}") paths.map { it => new ParquetPublisher(it, predicate, projection, caseSensitive, dictionaryFiltering) } } def footers(): List[Footer] = { logger.debug(s"Parquet source will read footers from $paths") paths.flatMap { it => val status = fs.getFileStatus(it) logger.debug(s"status=$status; path=$it") ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala } } }
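A hedged sketch of the builder-style API (the pattern and column names are invented): projections restrict which columns are read, and statistics() is computed from the parquet footers alone.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import io.eels.component.parquet.ParquetSource
object ParquetSourceUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val source = ParquetSource("hdfs:///warehouse/events/*.pq")
    .withProjection("name", "location")   // read only these columns
    .withCaseSensitivity(false)
  println(source.statistics())
  source.toDataStream().collect.foreach(println)
}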
Example 37
Source File: ParquetSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import scala.math.BigDecimal.RoundingMode import scala.math.BigDecimal.RoundingMode.RoundingMode case class ParquetWriteOptions(overwrite: Boolean = false, permission: Option[FsPermission] = None, dictionary: Boolean = true, inheritPermissions: Option[Boolean] = None, roundingMode: RoundingMode = RoundingMode.UNNECESSARY, metadata: Map[String, String] = Map.empty) { def withOverwrite(overwrite: Boolean): ParquetWriteOptions = copy(overwrite = overwrite) def withDictionary(dictionary: Boolean): ParquetWriteOptions = copy(dictionary = dictionary) def withMetaData(map: Map[String, String]): ParquetWriteOptions = copy(metadata = map) def withPermission(permission: FsPermission): ParquetWriteOptions = copy(permission = permission.some) def withInheritPermission(inheritPermissions: Boolean): ParquetWriteOptions = copy(inheritPermissions = inheritPermissions.some) def withRoundingMode(mode: RoundingMode): ParquetWriteOptions = copy(roundingMode = mode) } case class ParquetSink(path: Path, options: ParquetWriteOptions = ParquetWriteOptions()) (implicit fs: FileSystem) extends Sink with Logging { // -- convenience methods -- def withOverwrite(overwrite: Boolean): ParquetSink = copy(options = options.withOverwrite(overwrite)) def withDictionary(dictionary: Boolean): ParquetSink = copy(options = options.copy(dictionary = dictionary)) def withMetaData(map: Map[String, String]): ParquetSink = copy(options = options.copy(metadata = map)) def withPermission(permission: FsPermission): ParquetSink = copy(options = options.copy(permission = permission.some)) def withInheritPermission(inheritPermissions: Boolean): ParquetSink = copy(options = options.copy(inheritPermissions = inheritPermissions.some)) def withRoundingMode(mode: RoundingMode): ParquetSink = copy(options = options.copy(roundingMode = mode)) private def create(schema: StructType, path: Path): SinkWriter = new SinkWriter { if (options.overwrite && fs.exists(path)) fs.delete(path, false) val writer = RowParquetWriterFn(path, schema, options.metadata, options.dictionary, options.roundingMode, fs.getConf) override def write(row: Row): Unit = { writer.write(row) } override def close(): Unit = { writer.close() options.permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (options.inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } override def open(schema: StructType, n: Int): Seq[SinkWriter] = { if (n == 1) Seq(create(schema, path)) else List.tabulate(n) { k => create(schema, new Path(path.getParent, path.getName + "_" + k)) } } override def open(schema: StructType): SinkWriter = create(schema, path) } object ParquetSink { def apply(path: String)(implicit fs: FileSystem): ParquetSink = ParquetSink(new Path(path)) }
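A short write-side sketch (path, metadata and sample data are invented); options can be set through ParquetWriteOptions or the convenience with* methods shown above.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import io.eels.component.parquet.ParquetSink
import io.eels.datastream.DataStream
import io.eels.schema.StructType
object ParquetSinkUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val ds = DataStream.fromValues(StructType("name", "location"), Seq(Vector("sam", "aylesbury")))
  ds.to(ParquetSink("people.pq").withOverwrite(true).withMetaData(Map("writtenBy" -> "eel")))
}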
Example 38
Source File: HdfsWatcher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hdfs import java.util.concurrent.Executors import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import io.eels.util.HdfsIterator import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.client.HdfsAdmin import org.apache.hadoop.hdfs.inotify.Event import scala.concurrent.duration._ import scala.util.control.NonFatal class HdfsWatcher(path: Path, callback: FileCallback) (implicit fs: FileSystem, conf: Configuration) extends Logging { private val files = HdfsIterator.remote(fs.listFiles(path, false)).map(_.getPath).toBuffer files.foreach(callback.onStart) private val executor = Executors.newSingleThreadExecutor() private val running = new AtomicBoolean(true) private val interval = 5.seconds private val admin = new HdfsAdmin(path.toUri, conf) private val eventStream = admin.getInotifyEventStream executor.submit(new Runnable { override def run(): Unit = { while (running.get) { try { Thread.sleep(interval.toMillis) val events = eventStream.take for (event <- events.getEvents) { event match { case create: Event.CreateEvent => callback.onCreate(create) case append: Event.AppendEvent => callback.onAppend(append) case rename: Event.RenameEvent => callback.onRename(rename) case close: Event.CloseEvent => callback.onClose(close) case _ => } } } catch { case NonFatal(e) => logger.error("Error while polling fs", e) } } } }) def stop(): Unit = { running.set(false) executor.shutdownNow() } } trait FileCallback { def onStart(path: Path): Unit def onClose(close: Event.CloseEvent): Unit def onRename(rename: Event.RenameEvent): Unit def onAppend(append: Event.AppendEvent): Unit def onCreate(path: Event.CreateEvent): Unit }
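A hedged sketch of wiring up a callback (the watched directory is invented): the watcher first replays existing files through onStart and then polls the HDFS inotify stream for new events.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hdfs.inotify.Event
import io.eels.component.hdfs.{FileCallback, HdfsWatcher}
object HdfsWatcherUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val callback = new FileCallback {
    override def onStart(path: Path): Unit = println(s"existing file: $path")
    override def onCreate(create: Event.CreateEvent): Unit = println(s"created: ${create.getPath}")
    override def onAppend(append: Event.AppendEvent): Unit = ()
    override def onRename(rename: Event.RenameEvent): Unit = ()
    override def onClose(close: Event.CloseEvent): Unit = ()
  }
  val watcher = new HdfsWatcher(new Path("/data/incoming"), callback)
  // ... later, when no longer needed
  watcher.stop()
}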
Example 39
Source File: HdfsSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hdfs import io.eels.FilePattern import org.apache.hadoop.fs.permission.{AclEntryScope, AclEntryType, FsAction, FsPermission, AclEntry => HdfsAclEntry} import org.apache.hadoop.fs.{BlockLocation, FileSystem, Path} import scala.collection.JavaConverters._ case class HdfsSource(pattern: FilePattern)(implicit fs: FileSystem) { def permissions(): Vector[(Path, FsPermission)] = pattern.toPaths().map(fs.getFileStatus) .map(status => (status.getPath, status.getPermission)).toVector def setPermissions(permission: FsPermission): Unit = { pattern.toPaths().foreach(fs.setPermission(_, permission)) } def blocks(): Map[Path, Seq[BlockLocation]] = pattern.toPaths().map { path => path -> fs.getFileBlockLocations(path, 0, fs.getFileLinkStatus(path).getLen).toSeq }.toMap def setAcl(spec: AclSpec): Unit = { pattern.toPaths().foreach { path => val hadoopAclEntries = spec.entries.map { entry => val `type` = entry.`type`.toLowerCase match { case "user" => AclEntryType.USER case "group" => AclEntryType.GROUP case "other" => AclEntryType.OTHER } new HdfsAclEntry.Builder().setName(entry.name).setPermission(FsAction.getFsAction(entry.action)).setType(`type`).setScope(AclEntryScope.ACCESS).build() } fs.setAcl(path, hadoopAclEntries.asJava) } } } object HdfsSource { def apply(path: String)(implicit fs: FileSystem): HdfsSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem): HdfsSource = HdfsSource(FilePattern(path)) }
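A small permissions sketch; the pattern and the octal mode are invented for illustration.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.permission.FsPermission
import io.eels.component.hdfs.HdfsSource
object HdfsSourceUsage extends App {
  implicit val fs = FileSystem.get(new Configuration())
  val source = HdfsSource("hdfs:///data/out/*")
  source.permissions().foreach { case (path, perm) => println(s"$path -> $perm") }
  source.setPermissions(new FsPermission("750"))   // apply rwxr-x--- to every matching file
}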
Example 40
Source File: CsvSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.csv import com.univocity.parsers.csv.CsvWriter import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} case class CsvSink(path: Path, overwrite: Boolean = false, headers: Header = Header.FirstRow, format: CsvFormat = CsvFormat(), ignoreLeadingWhitespaces: Boolean = false, ignoreTrailingWhitespaces: Boolean = false) (implicit conf: Configuration, fs: FileSystem) extends Sink { override def open(schema: StructType): SinkWriter = new CsvSinkWriter(schema, path, headers, format, ignoreLeadingWhitespaces, ignoreTrailingWhitespaces) def withOverwrite(overwrite: Boolean): CsvSink = copy(overwrite = overwrite) def withHeaders(headers: Header): CsvSink = copy(headers = headers) def withIgnoreLeadingWhitespaces(ignoreLeadingWhitespaces: Boolean): CsvSink = copy(ignoreLeadingWhitespaces = ignoreLeadingWhitespaces) def withIgnoreTrailingWhitespaces(ignoreTrailingWhitespaces: Boolean): CsvSink = copy(ignoreTrailingWhitespaces = ignoreTrailingWhitespaces) def withFormat(format: CsvFormat): CsvSink = copy(format = format) class CsvSinkWriter(schema: StructType, path: Path, headers: Header, format: CsvFormat, ignoreLeadingWhitespaces: Boolean = false, ignoreTrailingWhitespaces: Boolean = false) extends SinkWriter { private val lock = new AnyRef {} if (overwrite && fs.exists(path)) fs.delete(path, false) import scala.collection.JavaConverters._ private lazy val writer: CsvWriter = { val output = fs.create(path) val writer = CsvSupport.createWriter(output, format, ignoreLeadingWhitespaces, ignoreTrailingWhitespaces) headers match { case Header.FirstComment => writer.commentRow(schema.fieldNames().mkString(format.delimiter.toString())) case Header.FirstRow => writer.writeHeaders(schema.fieldNames().asJava) case _ => } writer } override def close(): Unit = writer.close() override def write(row: Row): Unit = { lock.synchronized { // nulls should be written as empty strings val array = row.values.map { case null => "" case other => other.toString } writer.writeRow(array: _*) } } } } object CsvSink { def apply(path: java.nio.file.Path) (implicit conf: Configuration, fs: FileSystem): CsvSink = CsvSink(new Path(path.toString)) }
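Typical usage, sketched with invented data and file name; the header mode and CSV format are tuned through the copy-style with* methods.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import io.eels.component.csv.{CsvSink, Header}
import io.eels.datastream.DataStream
import io.eels.schema.StructType
object CsvSinkUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val ds = DataStream.fromValues(StructType("a", "b", "c"), Seq(Vector("1", "2", "3")))
  ds.to(CsvSink(new Path("out.csv")).withHeaders(Header.FirstRow))
}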
Example 41
Source File: ReadParquetEEL.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.sql.Timestamp import io.eels.component.parquet.{ParquetSink, ParquetSource} import io.eels.datastream.DataStream import io.eels.schema.{ArrayType, DecimalType, Field, IntType, Precision, Scale, StringType, StructType, TimestampMillisType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} object ReadParquetEEL extends App { def readParquet(path: Path): Unit = { implicit val hadoopConfiguration = new Configuration() implicit val hadoopFileSystem = FileSystem.get(hadoopConfiguration) val rows = ParquetSource(path).toDataStream().collect rows.foreach(row => println(row)) } val parquetFilePath = new Path("file:///home/sam/development/person2.parquet") implicit val hadoopConfiguration = new Configuration() implicit val hadoopFileSystem = FileSystem.get(hadoopConfiguration) val friendStruct = Field.createStructField("FRIEND", Seq( Field("NAME", StringType), Field("AGE", IntType.Signed) ) ) val personDetailsStruct = Field.createStructField("PERSON_DETAILS", Seq( Field("NAME", StringType), Field("AGE", IntType.Signed), Field("SALARY", DecimalType(Precision(38), Scale(5))), Field("CREATION_TIME", TimestampMillisType) ) ) val friendType = StructType(friendStruct) val schema = StructType(personDetailsStruct, Field("FRIENDS", ArrayType(friendType), nullable = false)) val friends = Vector( Vector(Vector("John", 25)), Vector(Vector("Adam", 26)), Vector(Vector("Steven", 27)) ) val rows = Vector( Vector(Vector("Fred", 50, BigDecimal("50000.99000"), new Timestamp(System.currentTimeMillis())), friends) ) try { DataStream.fromValues(schema, rows).to(ParquetSink(parquetFilePath).withOverwrite(true)) } catch { case e: Exception => e.printStackTrace() } try { readParquet(parquetFilePath) } catch { case e: Exception => e.printStackTrace() } }
Example 42
Source File: FilePatternTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.nio.file.Files import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class FilePatternTest extends WordSpec with Matchers { implicit val fs = FileSystem.get(new Configuration()) "FilePattern" should { "detect single hdfs path without name server" ignore { FilePattern("hdfs:///mypath").toPaths() shouldBe List(new Path("hdfs:///mypath")) } "detect single hdfs path with name server" ignore { FilePattern("hdfs://nameserver/mypath").toPaths() shouldBe List(new Path("hdfs://nameserver/mypath")) } "detect absolute local file" in { FilePattern("file:///absolute/file").toPaths() shouldBe List(new Path("file:///absolute/file")) } "detect relative local file" in { FilePattern("file:///local/file").toPaths() shouldBe List(new Path("file:///local/file")) } "detect relative local file expansion" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } val hdfsPaths = files.map { it => new Path(it.toUri) } files.foreach(file => Files.createFile(file)) FilePattern(dir.toUri.toString() + "/*").toPaths().toSet shouldBe hdfsPaths.toSet files.foreach(Files.deleteIfExists) Files.deleteIfExists(dir) } //not working on windows "detect relative local file expansion with schema" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } val hdfsPaths = files.map { it => new Path(it.toUri) } files.foreach(file => Files.createFile(file)) FilePattern(dir.toUri.toString() + "/*").toPaths().toSet shouldBe hdfsPaths.toSet files.foreach(Files.deleteIfExists) Files.deleteIfExists(dir) } "use filter if supplied" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } files.foreach { it => Files.createFile(it) } val a = FilePattern(dir.toAbsolutePath().toString() + "/*") .withFilter(_.toString().endsWith("a")) .toPaths.toSet a shouldBe Set(new Path("file:///" + dir.resolve("a"))) files.foreach { it => Files.deleteIfExists(it) } Files.deleteIfExists(dir) } } }
Example 43
Source File: ListenerTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.util.concurrent.{CountDownLatch, TimeUnit} import io.eels.component.csv.{CsvSink, CsvSource} import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} import scala.util.Random class ListenerTest extends WordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.get(conf) val schema = StructType("a", "b", "c", "d", "e") val rows = List.fill(1000)(Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(10))) val ds = DataStream.fromRows(schema, rows) val path = new Path("listener_test.csv") "DataStream" should { "support user's listeners" in { val latch = new CountDownLatch(1000) fs.delete(path, false) ds.listener(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).to(CsvSink(path)) latch.await(20, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } "propagate errors in listeners" in { class TestSink extends Sink { override def open(schema: StructType): SinkWriter = new SinkWriter { override def close(): Unit = () override def write(row: Row): Unit = () } } try { ds.listener(new Listener { override def onNext(value: Row): Unit = sys.error("boom") override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).to(new TestSink) assert(false) } catch { case _: Throwable => } } } "Source.toDataStream" should { "call on next for each row" in { val latch = new CountDownLatch(1000) fs.delete(path, false) ds.to(CsvSink(path)) CsvSource(path).toDataStream(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).collect latch.await(5, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } "call on complete once finished" in { val latch = new CountDownLatch(1001) fs.delete(path, false) ds.to(CsvSink(path)) CsvSource(path).toDataStream(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = latch.countDown() }).collect latch.await(5, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } } }
Example 44
Source File: AvroSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.nio.file.Paths import com.typesafe.config.ConfigFactory import io.eels.schema.{Field, StructType} import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.scalatest.{Matchers, WordSpec} class AvroSourceTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) "AvroSource" should { "read schema" in { val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath) people.schema shouldBe StructType(Field("name", nullable = false), Field("job", nullable = false), Field("location", nullable = false)) } "read strings as java.lang.String when eel.avro.java.string is true" in { System.setProperty("eel.avro.java.string", "true") ConfigFactory.invalidateCaches() val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath).toDataStream().toSet people.map(_.values) shouldBe Set( List("clint eastwood", "actor", "carmel"), List("elton john", "musician", "pinner"), List("issac newton", "scientist", "heaven") ) System.setProperty("eel.avro.java.string", "false") ConfigFactory.invalidateCaches() } "read strings as utf8 when eel.avro.java.string is false" in { System.setProperty("eel.avro.java.string", "false") ConfigFactory.invalidateCaches() val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath).toDataStream().toSet people.map(_.values) shouldBe Set( List(new Utf8("clint eastwood"), new Utf8("actor"), new Utf8("carmel")), List(new Utf8("elton john"), new Utf8("musician"), new Utf8("pinner")), List(new Utf8("issac newton"), new Utf8("scientist"), new Utf8("heaven")) ) System.setProperty("eel.avro.java.string", "true") ConfigFactory.invalidateCaches() } } }
Example 45
Source File: AvroSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import io.eels.Row import io.eels.datastream.DataStream import io.eels.schema.{ArrayType, Field, MapType, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class AvroSinkTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val ds = DataStream.fromValues( StructType("name", "job", "location"), Seq( List("clint eastwood", "actor", "carmel"), List("elton john", "musician", "pinner"), List("issac newton", "scientist", "heaven") ) ) "AvroSink" should { "write to avro" in { val path = new Path("avro.test") fs.delete(path, false) ds.to(AvroSink(path)) fs.delete(path, false) } "support overwrite option" in { val path = new Path("overwrite_test", ".avro") fs.delete(path, false) ds.to(AvroSink(path)) ds.to(AvroSink(path).withOverwrite(true)) fs.delete(path, false) } "write lists and maps" in { val ds = DataStream.fromValues( StructType( Field("name"), Field("movies", ArrayType(StringType)), Field("characters", MapType(StringType, StringType)) ), Seq( List( "clint eastwood", List("fistful of dollars", "high plains drifters"), Map("preacher" -> "high plains", "no name" -> "good bad ugly") ) ) ) val path = new Path("array_map_avro", ".avro") fs.delete(path, false) ds.to(AvroSink(path)) AvroSource(path).toDataStream().collect shouldBe Seq( Row( ds.schema, Seq( "clint eastwood", List("fistful of dollars", "high plains drifters"), Map("preacher" -> "high plains", "no name" -> "good bad ugly") ) ) ) fs.delete(path, true) } } }
Example 46
Source File: JsonSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.json import io.eels.datastream.DataStream import io.eels.schema.{Field, StructType} import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class JsonSinkTest extends WordSpec with Matchers { val path = new Path("test.json") implicit val fs: FileSystem = FileSystem.get(new Configuration()) "JsonSink" should { "write multiple json docs to a file" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("location")) val ds = DataStream.fromValues( schema, Seq( Vector("sam", "aylesbury"), Vector("jam", "aylesbury"), Vector("ham", "buckingham") ) ) ds.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input should include("""{"name":"sam","location":"aylesbury"}""") input should include("""{"name":"jam","location":"aylesbury"}""") input should include("""{"name":"ham","location":"buckingham"}""") fs.delete(path, false) } "support arrays" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("skills")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Array("karate", "kung fu"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","skills":["karate","kung fu"]}""" fs.delete(path, false) } "support maps" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("locations")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Map("home" -> "boro", "work" -> "london"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","locations":{"home":"boro","work":"london"}}""" fs.delete(path, false) } "support structs" in { case class Foo(home: String, work: String) if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("locations")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Foo("boro", "london"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","locations":{"home":"boro","work":"london"}}""" fs.delete(path, false) } } }
Example 47
Source File: SequenceSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} import org.scalatest.{Matchers, WordSpec} class SequenceSinkTest extends WordSpec with Matchers { private val ds = DataStream.fromValues( StructType("a", "b", "c", "d"), Seq( List("1", "2", "3", "4"), List("5", "6", "7", "8") ) ) "SequenceSink" should { "write sequence files" in { implicit val conf = new Configuration implicit val fs = FileSystem.get(conf) val path = new Path("seqsink.seq") if (fs.exists(path)) fs.delete(path, true) ds.to(SequenceSink(path)) val reader = new SequenceFile.Reader(new Configuration, SequenceFile.Reader.file(path)) val k = new IntWritable val v = new BytesWritable val set = for (_ <- 1 to 3) yield { reader.next(k, v) new String(v.copyBytes) } set.toSet shouldBe Set( "a,b,c,d", "1,2,3,4", "5,6,7,8" ) reader.close() fs.delete(path, true) } } }
Example 48
Source File: ParquetProjectionTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.{File, FilenameFilter} import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{FlatSpec, Matchers} class ParquetProjectionTest extends FlatSpec with Matchers { cleanUpResidualParquetTestFiles private val schema = StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) private val ds = DataStream.fromValues( schema, Seq( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) ) private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val file = new File(s"test_${System.currentTimeMillis()}.pq") file.deleteOnExit() private val path = new Path(file.toURI) if (fs.exists(path)) fs.delete(path, false) ds.to(ParquetSink(path).withOverwrite(true)) "ParquetSource" should "support projections" in { val rows = ParquetSource(path).withProjection("name").toDataStream().collect rows.map(_.values) shouldBe Vector(Vector("clint eastwood"), Vector("elton john")) } it should "return all data when no projection is set" in { val rows = ParquetSource(path).toDataStream().collect rows.map(_.values) shouldBe Vector(Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner")) } private def cleanUpResidualParquetTestFiles = { new File(".").listFiles(new FilenameFilter { override def accept(dir: File, name: String): Boolean = { (name.startsWith("test_") && name.endsWith(".pq")) || (name.startsWith(".test_") && name.endsWith(".pq.crc")) } }).foreach(_.delete()) } }
Example 49
Source File: AvroAndParquetCrossCompatibilityTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{FlatSpec, Matchers} // tests that avro source/sink and avro parquet source/sink can write/read each others files class AvroAndParquetCrossCompatibilityTest extends FlatSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) "AvroParquetSource and ParquetSource" should "be compatible" in { val path = new Path("cross.pq") if (fs.exists(path)) fs.delete(path, false) val structType = StructType( Field("name", StringType, nullable = false), Field("location", StringType, nullable = false) ) val ds = DataStream.fromValues( structType, Seq( Vector("clint eastwood", "carmel"), Vector("elton john", "pinner") ) ) ds.to(ParquetSink(path)) AvroParquetSource(path).toDataStream().collect shouldBe ds.collect fs.delete(path, false) ds.to(AvroParquetSink(path)) ParquetSource(path).toDataStream().collect shouldBe ds.collect fs.delete(path, false) } }
Example 50
Source File: ParquetSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object ParquetSpeedTest extends App with Timed { ParquetLogMute() val size = 2000000 val schema = StructType("a", "b", "c", "d", "e") val createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) val ds = DataStream.fromIterator(schema, Iterator.continually(createRow).take(size)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val path = new Path("parquet_speed.pq") fs.delete(path, false) new File(path.toString).deleteOnExit() timed("Insertion") { ds.to(AvroParquetSink(path).withOverwrite(true)) } while (true) { timed("Reading with ParquetSource") { val actual = ParquetSource(path).toDataStream().size assert(actual == size) } println("") println("---------") println("") Thread.sleep(2000) timed("Reading with AvroParquetSource") { val actual = AvroParquetSource(path).toDataStream().size assert(actual == size) } } }
Example 51
Source File: ParquetMultipleFileSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.StructType import io.eels.{FilePattern, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object ParquetMultipleFileSpeedTest extends App with Timed { ParquetLogMute() val size = 5000000 val count = 20 val schema = StructType("a", "b", "c", "d", "e") def createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val dir = new Path("parquet-speed-test") new File(dir.toString).mkdirs() new File(dir.toString).listFiles().foreach(_.delete) timed("Insertion") { val ds = DataStream.fromRowIterator(schema, Iterator.continually(createRow).take(size)) ds.to(ParquetSink(new Path("parquet-speed-test/parquet_speed.pq")), count) } for (_ <- 1 to 25) { assert(count == FilePattern("parquet-speed-test/*").toPaths().size) timed("Reading with ParquetSource") { val actual = ParquetSource("parquet-speed-test/*").toDataStream().map { row => row }.filter(_ => true).size assert(actual == size, s"Expected $size but was $actual") } println("") println("---------") println("") } }
Example 52
Source File: AvroParquetSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class AvroParquetSinkTest extends WordSpec with Matchers { ParquetLogMute() private val schema = StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) private val ds = DataStream.fromValues( schema, Seq( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) ) private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path("test.pq") "ParquetSink" should { "write schema" in { if (fs.exists(path)) fs.delete(path, false) ds.to(AvroParquetSink(path)) val people = ParquetSource(path) people.schema shouldBe StructType( Field("name", StringType, false), Field("job", StringType, false), Field("location", StringType, false) ) fs.delete(path, false) } "write data" in { if (fs.exists(path)) fs.delete(path, false) ds.to(AvroParquetSink(path)) AvroParquetSource(path).toDataStream().toSet.map(_.values) shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) fs.delete(path, false) } "support overwrite" in { val path = new Path("overwrite_test.pq") fs.delete(path, false) val schema = StructType(Field("a", StringType)) val ds = DataStream.fromRows(schema, Row(schema, Vector("x")), Row(schema, Vector("y")) ) ds.to(AvroParquetSink(path)) ds.to(AvroParquetSink(path).withOverwrite(true)) fs.delete(path, false) } } }
Example 53
Source File: AvroParquetReaderFnTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.util.UUID import io.eels.component.avro.AvroSchemaFns import io.eels.component.parquet.avro.AvroParquetReaderFn import io.eels.schema.{DoubleType, Field, LongType, StructType} import org.apache.avro.SchemaBuilder import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.avro.AvroParquetWriter import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpec} class AvroParquetReaderFnTest extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path(UUID.randomUUID().toString()) override def afterAll(): Unit = { val fs = FileSystem.get(new Configuration()) fs.delete(path, false) } private val avroSchema = SchemaBuilder.record("com.chuckle").fields() .requiredString("str").requiredLong("looong").requiredDouble("dooble").endRecord() private val writer = AvroParquetWriter.builder[GenericRecord](path) .withSchema(avroSchema) .build() private val record = new GenericData.Record(avroSchema) record.put("str", "wibble") record.put("looong", 999L) record.put("dooble", 12.34) writer.write(record) writer.close() val schema = StructType(Field("str"), Field("looong", LongType(true), true), Field("dooble", DoubleType, true)) "AvroParquetReaderFn" should { "support projections on doubles" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("looong")))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("dooble") shouldBe 12.34 } "support projections on longs" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("str")))) val record = reader.read() reader.close() record.get("looong") shouldBe 999L } "support full projections" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("looong") shouldBe 999L record.get("dooble") shouldBe 12.34 } "support non projections" in { val reader = AvroParquetReaderFn(path, None, None) val group = reader.read() reader.close() group.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" group.get("looong") shouldBe 999L group.get("dooble") shouldBe 12.34 } } }
Example 54
Source File: CsvSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.csv import java.nio.file.Paths import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.scalatest.{Matchers, WordSpec} class CsvSourceTest extends WordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(conf) "CsvSource" should { "read schema" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) CsvSource(path).schema shouldBe StructType( Field("a", StringType, true), Field("b", StringType, true), Field("c", StringType, true) ) } "support null cell value option as null" in { val file = getClass.getResource("/io/eels/component/csv/csvwithempty.csv").toURI() val path = Paths.get(file) CsvSource(path).withNullValue(null).toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", null, "3")) } "support null cell value replacement value" in { val file = getClass.getResource("/io/eels/component/csv/csvwithempty.csv").toURI() val path = Paths.get(file) CsvSource(path).withNullValue("foo").toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", "foo", "3")) } "read from path" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.FirstRow).toDataStream().size shouldBe 3 CsvSource(path).withHeader(Header.None).toDataStream().size shouldBe 4 } "allow specifying manual schema" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) val schema = StructType( Field("test1", StringType, true), Field("test2", StringType, true), Field("test3", StringType, true) ) CsvSource(path).withSchema(schema).toDataStream().schema shouldBe schema } "support reading header" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.FirstRow).toDataStream().collect.map(_.values).toSet shouldBe Set(Vector("e", "f", "g"), Vector("1", "2", "3"), Vector("4", "5", "6")) } "support skipping header" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.None).toDataStream().toSet.map(_.values) shouldBe Set(Vector("a", "b", "c"), Vector("e", "f", "g"), Vector("1", "2", "3"), Vector("4", "5", "6")) } "support delimiters" in { val file = getClass.getResource("/io/eels/component/csv/psv.psv").toURI() val path = Paths.get(file) CsvSource(path).withDelimiter('|').toDataStream().collect.map(_.values).toSet shouldBe Set(Vector("e", "f", "g")) CsvSource(path).withDelimiter('|').withHeader(Header.None).toDataStream().toSet.map(_.values) shouldBe Set(Vector("a", "b", "c"), Vector("e", "f", "g")) } "support comments for headers" in { val file = getClass.getResource("/io/eels/component/csv/comments.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.FirstComment).schema shouldBe StructType( Field("a", StringType, true), Field("b", StringType, true), Field("c", StringType, true) ) CsvSource(path).withHeader(Header.FirstComment).toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", "2", "3"), Vector("e", "f", "g"), Vector("4", "5", "6")) } "terminate if asking for first comment but no comments" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.FirstComment).schema shouldBe StructType( 
Field("", StringType, true) ) } "support skipping corrupt rows" ignore { val file = getClass.getResource("/io/eels/component/csv/corrupt.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.FirstRow).toDataStream().toVector.map(_.values) shouldBe Vector(Vector("1", "2", "3")) } } }
Example 55
Source File: Main.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object Main extends App { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf // the first parameter determines the command to run, just like in git, eg git pull, or in hadoop, eg hadoop fs val command = args.head val params = args.tail command match { case "schema" => ShowSchemaMain(params) case "stream" => StreamMain(params) case "apply-spec" => ApplySpecMain(params) case "fetch-spec" => FetchSpecMain(params) case "analyze" => AnalyzeMain(params) case other => System.err.println(s"Unknown command $other") } } case class Options(from: String = "", to: String = "", workerThreads: Int = 1, sourceIOThreads: Int = 1)
Example 56
Source File: FetchSpecMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import io.eels.component.hive.{HiveSource, HiveSpec} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object FetchSpecMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel fetch-spec", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() source match { case hive: HiveSource => val spec = hive.spec val json = HiveSpec.writeAsJson(spec.copy(tables = spec.tables.filter(_.tableName == hive.tableName))) println(json) case _ => sys.error(s"Unsupported source $source") } case _ => } } case class Options(source: String = null) }
Example 57
Source File: ApplySpecMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import java.nio.file.{Path, Paths} import io.eels.{Constants, SourceParser} import io.eels.component.hive.{HiveOps, HiveSource, HiveSpec} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient object ApplySpecMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf implicit val client = new HiveMetaStoreClient(hiveConf) def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel apply-spec", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" opt[String]("spec") required() action { (schema, o) => o.copy(specPath = Paths.get(schema)) } text "specify path to eel spec" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() source match { case hive: HiveSource => HiveOps.applySpec(HiveSpec(options.specPath), false) case _ => sys.error(s"Unsupported source $source") } case _ => } } case class Options(source: String = null, specPath: Path = null) }
Example 58
Source File: ShowSchemaMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import io.eels.component.avro.AvroSchemaFn import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object ShowSchemaMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel schema", Constants.EelVersion) opt[String]("source") required() action { (source, o) => o.copy(source = source) } text "specify source, eg hive:database:table or parquet:/path/to/file" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() val schema = source.schema val avroSchema = AvroSchemaFn.toAvro(schema) out.println(avroSchema) case _ => } } case class Options(source: String = "") }
Example 59
Source File: AnalyzeMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object AnalyzeMain { import scala.concurrent.ExecutionContext.Implicits.global implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel analyze", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" opt[Boolean]("reverse") optional() action { (reverse, o) => o.copy(reverse = reverse) } text "specify reverse ordering of columns, eg most distinct first" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val result = builder().counts.toSeq.sortBy(_._2.size) val orderedResults = if (options.reverse) result.reverse else result for ((columnName, columnCounts) <- orderedResults) { println(columnName) for ((value, counts) <- columnCounts) { println(s"\t$value ($counts)") } } case _ => } } case class Options(source: String = null, reverse: Boolean = false) }
Example 60
Source File: StreamMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import io.eels.{Constants, Sink, SinkParser, SourceParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object StreamMain { import scala.concurrent.ExecutionContext.Implicits.global implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String]): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel", Constants.EelVersion) opt[String]("source") required() action { (source, o) => o.copy(from = source) } text "specify source, eg hive:database:table" opt[String]("sink") required() action { (sink, o) => o.copy(to = sink) } text "specify sink, eg hive:database:table" opt[Int]("sourceThreads") optional() action { (threads, options) => options.copy(sourceIOThreads = threads) } text "number of source io threads, defaults to 1" opt[Int]("workerThreads") optional() action { (threads, options) => options.copy(workerThreads = threads) } text "number of worker threads, defaults to 1" } parser.parse(args, Options()) match { case Some(options) => val sourceBuilder = SourceParser(options.from).orNull val source = sourceBuilder() val sinkBuilder = SinkParser(options.to).orNull val sink = sinkBuilder() val result = source.toFrame(options.sourceIOThreads).to(sink) println(s"Completed with $result rows") case _ => } } }
Example 61
Source File: HivePartitionScanner.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.typesafe.config.{Config, ConfigFactory} import io.eels.component.hive.partition.PartitionMetaData import io.eels.schema.PartitionConstraint import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus} // scans partitions for files, returning the files and the meta data object for each partition class HivePartitionScanner(implicit fs: FileSystem) extends Logging { private val config: Config = ConfigFactory.load() private val missingPartitionAction: String = config.getString("eel.hive.source.missingPartitionAction") def scan(partitions: Seq[PartitionMetaData], constraints: Seq[PartitionConstraint] = Nil): Map[PartitionMetaData, Seq[LocatedFileStatus]] = { logger.debug(s"Scanning ${partitions.size} partitions for applicable files ${partitions.map(_.location).mkString(", ").take(100)}") // first we filter out any partitions not matching the constraints val filteredPartitions = partitions.filter { meta => constraints.forall(_.eval(meta.partition)) } logger.debug(s"Filtered partitions: ${filteredPartitions.map(_.location).mkString(", ")})") // next, we check that the directories that the partitions point to actually exist // this will avoid a situation where a location exists in the metastore but not on disk val exantPartitions = filteredPartitions.filter { partition => if (fs.exists(partition.location)) { true } else { if (missingPartitionAction == "error") { throw new IllegalStateException(s"Partition [${partition.name}] was specified in the hive metastore at [${partition.location}] but did not exist on disk. To disable these exceptions set eel.hive.source.missingPartitionAction=warn or eel.hive.source.missingPartitionAction=none") } else if (missingPartitionAction == "warn") { logger.warn(s"Partition [${partition.name}] was specified in the hive metastore at [${partition.location}] but did not exist on disk. To disable these warnings set eel.hive.source.missingPartitionAction=none") false } else { false } } } // next we grab all the data files from each of these partitions exantPartitions.map { meta => meta -> HiveFileScanner(meta.location, false) }.toMap } }
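A sketch of how the scanner might be driven once partition metadata has been loaded (for example via HiveOps.partitionsMetaData, as HiveTableFilesFn does below). The empty partition list is a placeholder standing in for real metastore data.
import io.eels.component.hive.HivePartitionScanner
import io.eels.component.hive.partition.PartitionMetaData
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

object PartitionScanExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  val partitions: Seq[PartitionMetaData] = Nil // placeholder: fetch from the metastore in real code
  val filesByPartition = new HivePartitionScanner().scan(partitions)
  filesByPartition.foreach { case (meta, files) =>
    println(s"${meta.location} -> ${files.size} files")
  }
}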
Example 62
Source File: HiveFileScanner.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.util.HdfsIterator import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} // given a hadoop path, will look for files inside that path that match the // configured settings for hidden files // does not return directories object HiveFileScanner extends Logging { private val config = ConfigFactory.load() private val ignoreHiddenFiles = config.getBoolean("eel.hive.source.ignoreHiddenFiles") private val hiddenFilePattern = config.getString("eel.hive.source.hiddenFilePattern") // returns true if the given file should be considered based on the config settings private def skip(file: LocatedFileStatus): Boolean = { file.getLen == 0L || ignoreHiddenFiles && file.getPath.getName.matches(hiddenFilePattern) } def apply(path: Path, recursive: Boolean)(implicit fs: FileSystem): Seq[LocatedFileStatus] = { logger.debug(s"Scanning $path, filtering=$ignoreHiddenFiles, pattern=$hiddenFilePattern") val files: List[LocatedFileStatus] = if (fs.exists(path)) { val files = fs.listFiles(path, recursive) HdfsIterator.remote(files) .filter(_.isFile) .filterNot(skip) .toList } else { Nil } logger.debug(s"Scanner found ${files.size} files") files } }
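A minimal sketch of scanning a directory directly; the warehouse path is hypothetical, and an implicit FileSystem must be in scope, as the apply method requires.
import io.eels.component.hive.HiveFileScanner
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object FileScanExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  val files = HiveFileScanner(new Path("/user/hive/warehouse/mydb.db/mytable"), recursive = false)
  files.foreach(f => println(s"${f.getPath} (${f.getLen} bytes)"))
}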
Example 63
Source File: HiveTableFilesFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import io.eels.component.hive.partition.PartitionMetaData import io.eels.schema.{Partition, PartitionConstraint} import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient object HiveTableFilesFn extends Logging { def apply(dbName: String, tableName: String, tableLocation: Path, partitionConstraints: Seq[PartitionConstraint]) (implicit fs: FileSystem, client: IMetaStoreClient): Map[Partition, Seq[LocatedFileStatus]] = { val ops = new HiveOps(client) // when we have no partitions, this will scan just the table folder directly for files def rootScan(): Map[Partition, Seq[LocatedFileStatus]] = { Map(Partition.empty -> HiveFileScanner(tableLocation, false)) } def partitionsScan(partitions: Seq[PartitionMetaData]): Map[Partition, Seq[LocatedFileStatus]] = { new HivePartitionScanner().scan(partitions, partitionConstraints) .map { case (key, value) => key.partition -> value } } // the table may or may not have partitions. // // 1. If we do have partitions then we need to scan the path of each partition // (and each partition may be located anywhere outside of the table root) // // 2. If we do not have partitions then we can simply scan the table root. // we go to the metastore as we need the locations of the partitions not the values val partitions = ops.partitionsMetaData(dbName, tableName) if (partitions.isEmpty && partitionConstraints.nonEmpty) { sys.error("Constraints were used on a table that was not partitioned") } else if (partitions.isEmpty) { logger.debug(s"No partitions for $tableName; performing root table scan") rootScan } else partitionsScan(partitions) } }
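A sketch of resolving every data file for a table with no partition constraints. The database, table and location are made up, and the metastore client is wired up the same way as in the dialect and benchmark examples further down.
import io.eels.component.hive.HiveTableFilesFn
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.{HiveMetaStoreClient, IMetaStoreClient}

object TableFilesExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  implicit val client: IMetaStoreClient = new HiveMetaStoreClient(new HiveConf())
  val tableLocation = new Path("/user/hive/warehouse/mydb.db/mytable") // hypothetical
  val filesByPartition = HiveTableFilesFn("mydb", "mytable", tableLocation, Nil)
  filesByPartition.foreach { case (partition, statuses) =>
    println(s"$partition: ${statuses.size} files")
  }
}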
Example 64
Source File: HiveFilePublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.io.Using import io.eels.datastream.{Subscription, Publisher, Subscriber} import io.eels.schema.{Partition, StructType} import io.eels.{Predicate, _} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus} class HiveFilePublisher(dialect: HiveDialect, file: LocatedFileStatus, metastoreSchema: StructType, projectionSchema: StructType, predicate: Option[Predicate], partition: Partition) (implicit fs: FileSystem, conf: Configuration) extends Publisher[Seq[Row]] with Using { require(projectionSchema.fieldNames.forall { it => it == it.toLowerCase() }, s"Use only lower case field names with hive") override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { val partitionMap: Map[String, Any] = partition.entries.map { it => (it.key, it.value) }.toMap // the schema we send to the dialect must have any partition fields removed, because those // fields won't exist in the data files. This is because partitions are not always written // and instead inferred from the partition itself. val projectionFields = projectionSchema.fields.filterNot(field => partition.containsKey(field.name)) val projectionWithoutPartitions = StructType(projectionFields) // since we removed the partition fields from the target schema, we must repopulate them after the read // we also need to throw away the dummy field if we had an empty schema val publisher = dialect.input(file.getPath, metastoreSchema, projectionWithoutPartitions, predicate) publisher.subscribe(new Subscriber[Seq[Row]] { override def subscribed(s: Subscription): Unit = subscriber.subscribed(s) override def next(chunk: Seq[Row]): Unit = { val aligned = chunk.map { row => if (projectionFields.isEmpty) { val values = projectionSchema.fieldNames().map(partitionMap.apply) Row(projectionSchema, values.toVector) } else { RowUtils.rowAlign(row, projectionSchema, partitionMap) } } subscriber.next(aligned) } override def completed(): Unit = subscriber.completed() override def error(t: Throwable): Unit = subscriber.error(t) }) } }
Example 65
Source File: DynamicPartitionStrategy.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.partition import io.eels.component.hive.HiveOps import io.eels.schema.Partition import io.eels.util.HdfsMkdir import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient class DynamicPartitionStrategy extends PartitionStrategy { private val cache = scala.collection.mutable.Map.empty[Partition, Path] def ensurePartition(partition: Partition, dbName: String, tableName: String, inheritPermissions: Boolean, client: IMetaStoreClient)(implicit fs: FileSystem): Path = { def createPartition: Path = this.synchronized { val ops = new HiveOps(client) ops.partitionMetaData(dbName, tableName, partition) match { case Some(meta) => meta.location case _ => val tableLocation = ops.tablePath(dbName, tableName) val partitionPath = new Path(tableLocation, partition.unquoted) ops.createPartitionIfNotExists(dbName, tableName, partition, partitionPath) HdfsMkdir(partitionPath, inheritPermissions) partitionPath } } cache.getOrElseUpdate(partition, createPartition) } }
Example 66
Source File: StaticPartitionStrategy.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.partition import io.eels.component.hive.HiveOps import io.eels.schema.Partition import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import com.sksamuel.exts.OptionImplicits._ object StaticPartitionStrategy extends PartitionStrategy { private val cache = scala.collection.mutable.Map.empty[Partition, Path] def ensurePartition(partition: Partition, dbName: String, tableName: String, inheritPermissions: Boolean, client: IMetaStoreClient)(implicit fs: FileSystem): Path = { cache.getOrElseUpdate(partition, { val ops = new HiveOps(client) val meta = ops.partitionMetaData(dbName, tableName, partition).getOrError(s"Unknown partition $partition") meta.location }) } }
Example 67
Source File: HiveStats.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import io.eels.schema.PartitionConstraint import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.ParquetFileReader import scala.collection.JavaConverters._ trait HiveStats { // total number of records def count: Long = count(Nil) // total number of records in the partitions that match the constraints def count(constraints: Seq[PartitionConstraint]): Long // returns the minimum value of this field def min(field: String): Any = min(field, Nil) // returns the maximum value of this field def max(field: String): Any = max(field, Nil) // returns the minimum value of this field for the partitions that match the constraints def min(field: String, constraints: Seq[PartitionConstraint]): Any // returns the maximum value of this field for the partitions that match the constraints def max(field: String, constraints: Seq[PartitionConstraint]): Any } class ParquetHiveStats(dbName: String, tableName: String, table: HiveTable) (implicit fs: FileSystem, conf: Configuration, client: IMetaStoreClient) extends HiveStats with Logging { private val ops = new HiveOps(client) private def count(path: Path) = { val blocks = ParquetFileReader.readFooter(fs.getConf, path, ParquetMetadataConverter.NO_FILTER).getBlocks.asScala blocks.map(_.getRowCount).sum } override def count(constraints: Seq[PartitionConstraint]): Long = { val counts = HiveTableFilesFn(dbName, tableName, table.location, constraints) .flatMap(_._2) .map(_.getPath).map(count) if (counts.isEmpty) 0 else counts.sum } private def minmax(field: String, constraints: Seq[PartitionConstraint]): (Any, Any) = { def stats[T]: (Any, Any) = { def min(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) <= 0) a else b } def max(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) >= 0) a else b } val location = new Path(ops.location(dbName, tableName)) val (mins, maxes) = HiveTableFilesFn(dbName, tableName, location, constraints).toSeq.flatMap { case (_, files) => logger.debug(s"Calculating min,max in file $files") files.flatMap { file => val footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.map { block => val column = block.getColumns.asScala.find(_.getPath.toDotString == field).getOrError(s"Unknown column $field") val min = column.getStatistics.genericGetMin.asInstanceOf[Comparable[T]] val max = column.getStatistics.genericGetMax.asInstanceOf[Comparable[T]] (min, max) } } }.unzip (min(mins), max(maxes)) } stats[Any] } override def min(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._1 override def max(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._2 }
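The trait is easiest to read through its call sites. This is an API sketch only: obtaining a concrete ParquetHiveStats needs a HiveTable plus filesystem and metastore wiring that is left out here, and the field name is hypothetical.
import io.eels.component.hive.HiveStats

object StatsSketch {
  def report(stats: HiveStats): Unit = {
    println(s"rows = ${stats.count}")              // across all partitions
    println(s"min  = ${stats.min("event_date")}")  // hypothetical field
    println(s"max  = ${stats.max("event_date")}")
  }
}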
Example 68
Source File: ParquetHiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.dialect import java.util.concurrent.atomic.AtomicInteger import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.hive.{HiveDialect, HiveOps, HiveOutputStream} import io.eels.component.parquet._ import io.eels.component.parquet.util.{ParquetIterator, ParquetLogMute} import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe import org.apache.hadoop.hive.ql.io.parquet.{MapredParquetInputFormat, MapredParquetOutputFormat} import scala.math.BigDecimal.RoundingMode.RoundingMode case class ParquetHiveDialect(options: ParquetWriteOptions = ParquetWriteOptions()) extends HiveDialect with Logging with Using { override val serde: String = classOf[ParquetHiveSerDe].getCanonicalName override val inputFormat: String = classOf[MapredParquetInputFormat].getCanonicalName override val outputFormat: String = classOf[MapredParquetOutputFormat].getCanonicalName override def input(path: Path, ignore: StructType, projectionSchema: StructType, predicate: Option[Predicate]) (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] { val client = new HiveMetaStoreClient(new HiveConf) val ops = new HiveOps(client) override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { // convert the eel projection schema into a parquet schema which will be used by the native parquet reader try { val parquetProjectionSchema = ParquetSchemaFns.toParquetMessageType(projectionSchema) using(RowParquetReaderFn(path, predicate, parquetProjectionSchema.some, true)) { reader => val subscription = new Subscription { override def cancel(): Unit = reader.close() } subscriber.subscribed(subscription) ParquetIterator(reader).grouped(DataStream.DefaultBatchSize).foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } } override def output(schema: StructType, path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String]) (implicit fs: FileSystem, conf: Configuration): HiveOutputStream = { val path_x = path new HiveOutputStream { ParquetLogMute() private val _records = new AtomicInteger(0) logger.debug(s"Creating parquet writer at $path") private val writer = RowParquetWriterFn(path, schema, metadata, true, roundingMode, fs.getConf) override def write(row: Row) { require(row.values.nonEmpty, "Attempting to write an empty row") writer.write(row) _records.incrementAndGet() } override def close(): Unit = { logger.debug(s"Closing hive parquet writer $path") writer.close() // after the files are closed, we should set permissions if we've been asked to, this allows // all the files we create to stay consistent permission.foreach(fs.setPermission(path, _)) } override def records: Int = _records.get() override def path: Path = path_x } } }
Example 69
Source File: OrcHiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.dialect import com.sksamuel.exts.Logging import io.eels.component.hive.{HiveDialect, HiveOutputStream} import io.eels.component.orc.{OrcPublisher, OrcWriteOptions, OrcWriter} import io.eels.datastream.{Publisher, Subscriber} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.io.orc.{OrcInputFormat, OrcOutputFormat, OrcSerde} import scala.math.BigDecimal.RoundingMode.RoundingMode case class OrcHiveDialect(options: OrcWriteOptions = OrcWriteOptions()) extends HiveDialect with Logging { override val serde: String = classOf[OrcSerde].getCanonicalName override val inputFormat: String = classOf[OrcInputFormat].getCanonicalName override val outputFormat: String = classOf[OrcOutputFormat].getCanonicalName override def input(path: Path, metastoreSchema: StructType, projectionSchema: StructType, predicate: Option[Predicate]) (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { new OrcPublisher(path, projectionSchema.fieldNames(), predicate).subscribe(subscriber) } } override def output(schema: StructType, path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String])(implicit fs: FileSystem, conf: Configuration): HiveOutputStream = { val path_x = path val writer = new OrcWriter(path, schema, options) new HiveOutputStream { override def write(row: Row): Unit = { require(row.values.nonEmpty, "Attempting to write an empty row") writer.write(row) } override def close(): Unit = { writer.close() permission.foreach(fs.setPermission(path, _)) } override def records: Int = writer.records override def path: Path = path_x } } }
Example 70
Source File: HiveDatabase.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.collection.JavaConverters._ class HiveContext(implicit fs: FileSystem, client: IMetaStoreClient) { def databases: Seq[HiveDatabase] = client.getAllDatabases.asScala.map { dbName => HiveDatabase(dbName) } } case class HiveDatabase(dbName: String)(implicit fs: FileSystem, client: IMetaStoreClient) { def tables(): List[HiveSource] = { val tables = client.getAllTables(dbName).asScala tables.map { it => HiveSource(dbName, it) }.toList } def table(tableName: String): HiveSource = { val exists = client.tableExists(dbName, tableName) if (!exists) throw new IllegalArgumentException(s"$dbName.$tableName does not exist") HiveSource(dbName, tableName) } def location: String = client.getDatabase(dbName).getLocationUri }
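A short sketch of browsing the metastore through these wrappers; the database and table names are hypothetical.
import io.eels.component.hive.{HiveContext, HiveDatabase}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.{HiveMetaStoreClient, IMetaStoreClient}

object MetastoreBrowseExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  implicit val client: IMetaStoreClient = new HiveMetaStoreClient(new HiveConf())
  new HiveContext().databases.foreach(db => println(db.dbName))
  val orders = HiveDatabase("sales_db").table("orders") // throws if the table does not exist
  println(orders)
}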
Example 71
Source File: HiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import io.eels.component.hive.dialect.{OrcHiveDialect, ParquetHiveDialect} import io.eels.datastream.Publisher import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.api.Table import scala.math.BigDecimal.RoundingMode.RoundingMode trait HiveDialect extends Logging { def serde: String def inputFormat: String def outputFormat: String def output(schema: StructType, // schema without partition information path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String]) (implicit fs: FileSystem, conf: Configuration): HiveOutputStream def stats(getPath: Path)(implicit fs: FileSystem): Long = throw new UnsupportedOperationException } object HiveDialect extends Logging { def apply(format: String): HiveDialect = format match { case input if input.contains("ParquetInputFormat") => ParquetHiveDialect() case input if input.contains("OrcInputFormat") => OrcHiveDialect() //case input if input.contains("AvroHiveDialect") || input.contains("AvroContainerInputFormat") => AvroHiveDialect // "org.apache.hadoop.mapred.TextInputFormat" -> TextHiveDialect case _ => throw new UnsupportedOperationException(s"Unknown hive input format $format") } def apply(table: Table): HiveDialect = { val format = table.getSd.getInputFormat logger.debug(s"Table format is $format") val dialect = HiveDialect(format) logger.debug(s"HiveDialect is $dialect") dialect } }
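A sketch of the two factory entry points: resolving a dialect from a metastore Table, and from a raw input-format class name. The database and table are hypothetical.
import io.eels.component.hive.HiveDialect
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat

object DialectExample extends App {
  val client = new HiveMetaStoreClient(new HiveConf())
  val table = client.getTable("mydb", "mytable")       // hypothetical table
  val fromTable = HiveDialect(table)                   // inspects table.getSd.getInputFormat
  val fromFormat = HiveDialect(classOf[MapredParquetInputFormat].getCanonicalName)
  println(s"$fromTable / $fromFormat")
}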
Example 72
Source File: HivePartitionPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.Row import io.eels.datastream.{Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.util.control.NonFatal class HivePartitionPublisher(dbName: String, tableName: String, projectionSchema: StructType, partitionKeys: List[String], // partition keys for this table, used to map the partition values back to a map dialect: HiveDialect // used to open up the files to check they exist if checkDataForPartitionOnlySources is true ) (implicit fs: FileSystem, client: IMetaStoreClient) extends Publisher[Seq[Row]] with Logging { private val config = ConfigFactory.load() // if this is true, then we will still check that some files exist for each partition, to avoid // a situation where the partitions have been created in the hive metastore, but no actual // data has been written using those yet. private val partitionPartFileCheck = config.getBoolean("eel.hive.source.checkDataForPartitionOnlySources") logger.info(s"eel.hive.source.checkDataForPartitionOnlySources=$partitionPartFileCheck") // returns true if the partition exists on disk private def isPartitionPhysical(part: org.apache.hadoop.hive.metastore.api.Partition): Boolean = { val location = new Path(part.getSd.getLocation) logger.debug(s"Checking that partition $location has been created on disk...") try { val exists = fs.exists(location) if (exists) { logger.debug("...exists") } else { logger.debug("...not found") } exists } catch { case NonFatal(e) => logger.warn(s"Error reading $location", e) false } } override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = client.synchronized { try { import scala.collection.JavaConverters._ // each row will contain just the values from the metastore val rows = client.listPartitions(dbName, tableName, Short.MaxValue).asScala.filter { part => !partitionPartFileCheck || isPartitionPhysical(part) }.map { part => // the partition values are assumed to be the same order as the supplied partition keys // first we build a map of the keys to values, then use that map to return a Row with // values in the order set by the fieldNames parameter val map = partitionKeys.zip(part.getValues.asScala).toMap Row(projectionSchema, projectionSchema.fieldNames.map(map(_)).toVector) } logger.debug(s"After scanning partitions and files we have ${rows.size} rows") subscriber.subscribed(Subscription.empty) rows.iterator.grouped(10).foreach(subscriber.next) subscriber.completed() } catch { case t: Throwable => subscriber.error(t) } } }
Example 73
Source File: ParquetVsOrcSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.io.File import java.math.MathContext import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.orc.{OrcSink, OrcSource} import io.eels.component.parquet.{ParquetSink, ParquetSource} import io.eels.datastream.DataStream import io.eels.schema._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.math.BigDecimal.RoundingMode import scala.util.Random object ParquetVsOrcSpeedTest extends App with Timed { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val size = 5000000 val structType = StructType( Field("name", StringType), Field("age", IntType.Signed), Field("height", DoubleType), Field("amazing", BooleanType), Field("fans", LongType.Signed), Field("rating", DecimalType(4, 2)) ) def iter: Iterator[Vector[Any]] = Iterator.continually(Vector( Random.nextString(10), Random.nextInt(), Random.nextDouble(), Random.nextBoolean(), Random.nextLong(), BigDecimal(Random.nextDouble(), new MathContext(4)).setScale(2, RoundingMode.UP) )) def ds: DataStream = DataStream.fromIterator(structType, iter.take(size).map(Row(structType, _))) val ppath = new Path("parquet_speed.pq") fs.delete(ppath, false) val opath = new Path("orc_speed.orc") fs.delete(opath, false) new File(ppath.toString).deleteOnExit() new File(opath.toString).deleteOnExit() timed("Orc Insertion") { ds.to(OrcSink(opath)) } timed("Parquet Insertion") { ds.to(ParquetSink(ppath)) } while (true) { timed("Reading with OrcSource") { val actual = OrcSource(opath).toDataStream().size assert(actual == size, s"$actual != $size") } timed("Reading with ParquetSource") { val actual = ParquetSource(ppath).toDataStream().size assert(actual == size, s"$actual != $size") } } }
Example 74
Source File: HiveBenchmarkApp.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.util.UUID import com.sksamuel.exts.metrics.Timed import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import scala.util.Random object HiveBenchmarkApp extends App with Timed { val states = List( "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming").map(_.replace(' ', '_').toLowerCase) import HiveConfig._ val schema = StructType("id", "state") val rows = List.fill(1000000)(List(UUID.randomUUID.toString, states(Random.nextInt(50)))) logger.info(s"Generated ${rows.size} rows") new HiveOps(client).createTable( "sam", "people", schema, List("state"), overwrite = true ) logger.info("Table created") val sink = HiveSink("sam", "people") DataStream.fromValues(schema, rows).to(sink) logger.info("Write complete") while (true) { timed("datastream took") { val result = HiveSource("sam", "people").toDataStream().collect println(result.size) } } }
Example 75
Source File: OrcSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.orc.OrcFile.ReaderOptions import org.apache.orc._ import scala.collection.JavaConverters._ object OrcSource { def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): OrcSource = apply(FilePattern(path)) def apply(str: String)(implicit fs: FileSystem, conf: Configuration): OrcSource = apply(FilePattern(str)) } case class OrcSource(pattern: FilePattern, projection: Seq[String] = Nil, predicate: Option[Predicate] = None) (implicit fs: FileSystem, conf: Configuration) extends Source with Using { override def parts(): Seq[Publisher[Seq[Row]]] = pattern.toPaths().map(new OrcPublisher(_, projection, predicate)) def withPredicate(predicate: Predicate): OrcSource = copy(predicate = predicate.some) def withProjection(first: String, rest: String*): OrcSource = withProjection(first +: rest) def withProjection(fields: Seq[String]): OrcSource = { require(fields.nonEmpty) copy(projection = fields.toList) } override def schema: StructType = { val reader = OrcFile.createReader(pattern.toPaths().head, new ReaderOptions(conf)) val schema = reader.getSchema OrcSchemaFns.fromOrcType(schema).asInstanceOf[StructType] } private def reader() = { val options = new ReaderOptions(conf) OrcFile.createReader(pattern.toPaths().head, options) } def count(): Long = reader().getNumberOfRows def statistics(): Seq[ColumnStatistics] = reader().getStatistics.toVector def stripes(): Seq[StripeInformation] = reader().getStripes.asScala def stripeStatistics(): Seq[StripeStatistics] = reader().getStripeStatistics.asScala } class OrcPublisher(path: Path, projection: Seq[String], predicate: Option[Predicate])(implicit conf: Configuration) extends Publisher[Seq[Row]] { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { val reader = OrcFile.createReader(path, new ReaderOptions(conf)) val fileSchema = OrcSchemaFns.fromOrcType(reader.getSchema).asInstanceOf[StructType] val iterator: Iterator[Row] = OrcBatchIterator(reader, fileSchema, projection, predicate).flatten val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) iterator.grouped(DataStream.DefaultBatchSize).takeWhile(_ => running.get).foreach(subscriber.next) subscriber.completed() } catch { case t: Throwable => subscriber.error(t) } } }
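A sketch of reading an ORC file with a projection and a predicate; the path and field names are hypothetical, and toDataStream comes from the Source trait, used the same way in OrcPredicateTest further down.
import io.eels.Predicate
import io.eels.component.orc.OrcSource
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object OrcReadExample extends App {
  implicit val conf: Configuration = new Configuration()
  implicit val fs: FileSystem = FileSystem.get(conf)
  val source = OrcSource(new Path("/data/people.orc"))
    .withProjection("name", "age")
    .withPredicate(Predicate.gt("age", 30L))
  println(s"rows in file: ${source.count()}")
  source.toDataStream().collect.foreach(println)
}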
Example 76
Source File: OrcSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.config.ConfigSupport import com.typesafe.config.ConfigFactory import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.orc.OrcFile.{CompressionStrategy, EncodingStrategy} import org.apache.orc.OrcProto.CompressionKind case class OrcWriteOptions(overwrite: Boolean = false, compressionKind: CompressionKind, compressionStrategy: CompressionStrategy, compressionBufferSize: Option[Int], encodingStrategy: Option[EncodingStrategy], bloomFilterColumns: Seq[String] = Nil, permission: Option[FsPermission] = None, inheritPermissions: Option[Boolean] = None, rowIndexStride: Option[Int] = None) { def withCompressionKind(kind: CompressionKind): OrcWriteOptions = copy(compressionKind = kind) def withCompressionStrategy(strategy: CompressionStrategy): OrcWriteOptions = copy(compressionStrategy = strategy) def withCompressionBufferSize(size: Int): OrcWriteOptions = copy(compressionBufferSize = size.some) def withEncodingStrategy(strategy: EncodingStrategy): OrcWriteOptions = copy(encodingStrategy = strategy.some) def withBloomFilterColumns(bloomFilterColumns: Seq[String]): OrcWriteOptions = copy(bloomFilterColumns = bloomFilterColumns) def withRowIndexStride(stride: Int): OrcWriteOptions = copy(rowIndexStride = stride.some) def withOverwrite(overwrite: Boolean): OrcWriteOptions = copy(overwrite = overwrite) def withPermission(permission: FsPermission): OrcWriteOptions = copy(permission = permission.some) def withInheritPermission(inheritPermissions: Boolean): OrcWriteOptions = copy(inheritPermissions = inheritPermissions.some) } object OrcWriteOptions extends ConfigSupport { // creates a config from the typesafe reference.confs def apply(): OrcWriteOptions = { val config = ConfigFactory.load() OrcWriteOptions( false, CompressionKind valueOf config.getString("eel.orc.writer.compression-kind"), CompressionStrategy valueOf config.getString("eel.orc.writer.compression-strategy"), config.getIntOpt("eel.orc.writer.compression-buffer-size"), config.getStringOpt("eel.orc.writer.encoding-strategy").map(EncodingStrategy.valueOf) ) } } case class OrcSink(path: Path, options: OrcWriteOptions = OrcWriteOptions()) (implicit fs: FileSystem, conf: Configuration) extends Sink with Logging { // -- convenience options -- def withCompressionKind(kind: CompressionKind): OrcSink = copy(options = options.copy(compressionKind = kind)) def withCompressionStrategy(strategy: CompressionStrategy): OrcSink = copy(options = options.copy(compressionStrategy = strategy)) def withCompressionBufferSize(size: Int): OrcSink = copy(options = options.copy(compressionBufferSize = size.some)) def withEncodingStrategy(strategy: EncodingStrategy): OrcSink = copy(options = options.copy(encodingStrategy = strategy.some)) def withBloomFilterColumns(bloomFilterColumns: Seq[String]): OrcSink = copy(options = options.copy(bloomFilterColumns = bloomFilterColumns)) def withRowIndexStride(stride: Int): OrcSink = copy(options = options.copy(rowIndexStride = stride.some)) def withOverwrite(overwrite: Boolean): OrcSink = copy(options = options.copy(overwrite = overwrite)) def withPermission(permission: FsPermission): OrcSink = copy(options = options.copy(permission = permission.some)) def withInheritPermission(inheritPermissions: Boolean): OrcSink = 
copy(options = options.copy(inheritPermissions = inheritPermissions.some)) override def open(schema: StructType, n: Int): Seq[SinkWriter] = { if (n == 1) Seq(create(schema, path)) else List.tabulate(n) { k => create(schema, new Path(path.getParent, path.getName + "_" + k)) } } override def open(schema: StructType): SinkWriter = create(schema, path) private def create(schema: StructType, path: Path): SinkWriter = new SinkWriter { if (options.overwrite && fs.exists(path)) fs.delete(path, false) val writer = new OrcWriter(path, schema, options) override def write(row: Row): Unit = writer.write(row) override def close(): Unit = { writer.close() options.permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (options.inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } }
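A sketch that writes a tiny in-memory DataStream to ORC using a couple of the fluent options, in the same style as the speed tests below; the schema and values are made up.
import io.eels.component.orc.OrcSink
import io.eels.datastream.DataStream
import io.eels.schema.{Field, LongType, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object OrcWriteExample extends App {
  implicit val conf: Configuration = new Configuration()
  implicit val fs: FileSystem = FileSystem.getLocal(conf)
  val schema = StructType(Field("name", StringType), Field("age", LongType.Signed))
  val ds = DataStream.fromValues(schema, Seq(Vector("sam", 37L), Vector("laura", 24L)))
  ds.to(OrcSink(new Path("people.orc")).withOverwrite(true).withRowIndexStride(1000))
}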
Example 77
Source File: OrcMultipleFileSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.datastream.DataStream import io.eels.schema.StructType import io.eels.{FilePattern, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object OrcMultipleFileSpeedTest extends App with Timed { val size = 5000000 val count = 20 val schema = StructType("a", "b", "c", "d", "e") def createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val dir = new Path("orc-speed-test") new File(dir.toString).mkdirs() timed("Insertion") { val ds = DataStream.fromIterator(schema, Iterator.continually(createRow).take(size)) new File(dir.toString).listFiles().foreach(_.delete) ds.to(OrcSink(new Path("orc-speed-test/orc_speed.pq")).withOverwrite(true), count) } for (_ <- 1 to 25) { assert(count == FilePattern("orc-speed-test/*").toPaths().size) timed("Reading with OrcSource") { val actual = OrcSource("orc-speed-test/*").toDataStream().map { row => row }.filter(_ => true).size assert(actual == size, s"Expected $size but was $actual") } println("") println("---------") println("") } }
Example 78
Source File: OrcPredicateTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.io.{File, FilenameFilter} import io.eels.Predicate import io.eels.datastream.DataStream import io.eels.schema.{Field, LongType, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} class OrcPredicateTest extends FlatSpec with Matchers with BeforeAndAfterAll { cleanUpResidualOrcTestFiles val schema = StructType( Field("name", StringType, nullable = true), Field("city", StringType, nullable = true), Field("age", LongType.Signed, nullable = true) ) val values = Vector.fill(1000) { Vector("sam", "middlesbrough", 37) } ++ Vector.fill(1000) { Vector("laura", "iowa city", 24) } val ds = DataStream.fromValues(schema, values) implicit val conf = new Configuration() implicit val fs = FileSystem.get(new Configuration()) val path = new Path("test.orc") if (fs.exists(path)) fs.delete(path, false) new File(path.toString).deleteOnExit() ds.to(OrcSink(path).withRowIndexStride(1000)) override protected def afterAll(): Unit = fs.delete(path, false) "OrcSource" should "support string equals predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.equals("name", "sam")).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("sam", "middlesbrough", 37L)) } it should "support gt predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.gt("age", 30L)).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("sam", "middlesbrough", 37L)) } it should "support lt predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.lt("age", 30)).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("laura", "iowa city", 24L)) } it should "enable row level filtering with predicates by default" in { conf.set("eel.orc.predicate.row.filter", "true") val rows = OrcSource(path).withPredicate(Predicate.equals("name", "sam")).toDataStream().collect rows.head.schema shouldBe schema rows.head.values shouldBe Vector("sam", "middlesbrough", 37L) } private def cleanUpResidualOrcTestFiles = { new File(".").listFiles(new FilenameFilter { override def accept(dir: File, name: String): Boolean = { (name.startsWith("test_") && name.endsWith(".orc")) || (name.startsWith(".test_") && name.endsWith(".orc.crc")) } }).foreach(_.delete()) } }
Example 79
Source File: InputFormatConf.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.hadoop.io.{ LongWritable, Text, Writable } import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader } import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat } import scala.collection.immutable trait InputFormatConf[K, V] extends Serializable { type IF <: InputFormat[K, V] type Split <: InputSplit with Writable type KExtract <: Extract[K] type VExtract <: Extract[V] def kExtract: KExtract def vExtract: VExtract def makeInputFormat(): IF // I'm unsure if we should WriSer them for them def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]] // TODO do we want to require typing of the RecordReader as well? final def createRecordReader(hadoopConf: Configuration, split: Split, inputFormat: IF = makeInputFormat()): RecordReader[K, V] = { val tac = ConfOnlyTAC(hadoopConf) val recordReader = inputFormat.createRecordReader(split, tac) recordReader.initialize(split, tac) recordReader } } case class TextInputFormatConf(file: String, partitions: Int) extends InputFormatConf[LongWritable, Text] { type IF = TextInputFormat type Split = FileSplit // TODO now that we figured out what's up, see if we can't eliminate the need for this... val internalK = Extract.unit[LongWritable] val internalV = Extract.text type KExtract = internalK.type type VExtract = internalV.type override val kExtract: KExtract = internalK override val vExtract: VExtract = internalV def makeInputFormat() = new TextInputFormat() def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = { val job = Job.getInstance(hadoopConf) FileInputFormat.setInputPaths(job, file) val path = new Path(file) val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen val size_per = math.round(len / partitions.toDouble) ((0 until partitions - 1).map { p => new FileSplit(path, size_per * p, size_per, null) } :+ { val fin = size_per * (partitions - 1) new FileSplit(path, fin, len - fin, null) }).map(WriSer(_)) } } // TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf object CSVInputFormatConf { def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] { type IF = ifc.IF type Split = ifc.Split type KExtract = ifc.KExtract type VExtract = ifc.VExtract } = new InputFormatConf[LongWritable, V] { type IF = ifc.IF type Split = ifc.Split type KExtract = ifc.KExtract type VExtract = ifc.VExtract override val kExtract: KExtract = ifc.kExtract override val vExtract: VExtract = ifc.vExtract override def makeInputFormat() = ifc.makeInputFormat() override def makeSplits(hadoopConf: Configuration) = { val splits = ifc.makeSplits(hadoopConf) splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) { case WriSer(head) => val rr = createRecordReader(hadoopConf, head) require(rr.nextKeyValue, "csv has no header, first line was empty") val afterHeader = rr.getCurrentKey.get require(rr.nextKeyValue, "first split is empty") WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +: splits.tail } } } }
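A sketch of driving the text variant directly: build splits for a (hypothetical) local file and read the first record of the first split. The WriSer extractor is used the same way as in CSVInputFormatConf above; its exact package location is an assumption here.
import org.apache.hadoop.conf.Configuration
import com.twosigma.flint.hadoop.{TextInputFormatConf, WriSer}

object InputFormatConfExample extends App {
  val hadoopConf = new Configuration()
  val ifc = TextInputFormatConf("/tmp/data.txt", partitions = 4) // hypothetical file
  ifc.makeSplits(hadoopConf).headOption.foreach { case WriSer(split) =>
    val rr = ifc.createRecordReader(hadoopConf, split)
    if (rr.nextKeyValue()) println(s"${rr.getCurrentKey} -> ${rr.getCurrentValue}")
  }
}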
Example 80
Source File: package.scala From amadou with Apache License 2.0 | 5 votes |
package com.mediative.amadou import com.google.api.services.bigquery.model._ import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem import com.google.cloud.hadoop.io.bigquery._ import org.apache.hadoop.fs.{FileSystem, Path} import net.ceedubs.ficus.readers.ValueReader import net.ceedubs.ficus.FicusInstances import org.apache.spark.sql.{Dataset, SparkSession, Encoder} import java.util.concurrent.ThreadLocalRandom import scala.collection.JavaConversions._ package object bigquery extends FicusInstances { object CreateDisposition extends Enumeration { val CREATE_IF_NEEDED, CREATE_NEVER = Value } object WriteDisposition extends Enumeration { val WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY = Value } val BQ_CSV_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss zzz" object TableNotFound { import com.google.api.client.googleapis.json.GoogleJsonResponseException import com.google.api.client.googleapis.json.GoogleJsonError import scala.collection.JavaConverters._ def unapply(error: Throwable): Option[GoogleJsonError.ErrorInfo] = error match { case error: GoogleJsonResponseException => Some(error.getDetails) .filter(_.getCode == 404) .flatMap(_.getErrors.asScala.find(_.getReason == "notFound")) case _ => None } } def tableHasDataForDate( spark: SparkSession, table: TableReference, date: java.sql.Date, column: String): Boolean = { val bq = BigQueryClient.getInstance(spark.sparkContext.hadoopConfiguration) bq.hasDataForDate(table, date, column) } def saveAsBigQueryTable( tableRef: TableReference, writeDisposition: WriteDisposition.Value, createDisposition: CreateDisposition.Value): Unit = { val bucket = conf.get(BigQueryConfiguration.GCS_BUCKET_KEY) val temp = s"spark-bigquery-${System.currentTimeMillis()}=${ThreadLocalRandom.current.nextInt(Int.MaxValue)}" val gcsPath = s"gs://$bucket/spark-bigquery-tmp/$temp" self.write.json(gcsPath) val schemaFields = self.schema.fields.map { field => import org.apache.spark.sql.types._ val fieldType = field.dataType match { case BooleanType => "BOOLEAN" case LongType => "INTEGER" case IntegerType => "INTEGER" case StringType => "STRING" case DoubleType => "FLOAT" case TimestampType => "TIMESTAMP" case _: DecimalType => "INTEGER" } new TableFieldSchema().setName(field.name).setType(fieldType) }.toList val tableSchema = new TableSchema().setFields(schemaFields) bq.load(gcsPath, tableRef, tableSchema, writeDisposition, createDisposition) delete(new Path(gcsPath)) } private def delete(path: Path): Unit = { val fs = FileSystem.get(path.toUri, conf) fs.delete(path, true) () } } implicit val valueReader: ValueReader[BigQueryTable.PartitionStrategy] = ValueReader[String].map { _ match { case "month" => BigQueryTable.PartitionByMonth case "day" => BigQueryTable.PartitionByDay case other => sys.error(s"Unknown partition strategy") } } }
Example 81
Source File: MNIST.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne.examples import java.io.{BufferedWriter, OutputStreamWriter} import com.github.saurfang.spark.tsne.impl._ import com.github.saurfang.spark.tsne.tree.SPTree import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.LoggerFactory object MNIST { private def logger = LoggerFactory.getLogger(MNIST.getClass) def main (args: Array[String]) { val conf = new SparkConf() .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .registerKryoClasses(Array(classOf[SPTree])) val sc = new SparkContext(conf) val hadoopConf = sc.hadoopConfiguration val fs = FileSystem.get(hadoopConf) val dataset = sc.textFile("data/MNIST/mnist.csv.gz") .zipWithIndex() .filter(_._2 < 6000) .sortBy(_._2, true, 60) .map(_._1) .map(_.split(",")) .map(x => (x.head.toInt, x.tail.map(_.toDouble))) .cache() //logInfo(dataset.collect.map(_._2.toList).toList.toString) //val features = dataset.map(x => Vectors.dense(x._2)) //val scaler = new StandardScaler(true, true).fit(features) //val scaledData = scaler.transform(features) // .map(v => Vectors.dense(v.toArray.map(x => if(x.isNaN || x.isInfinite) 0.0 else x))) // .cache() val data = dataset.flatMap(_._2) val mean = data.mean() val std = data.stdev() val scaledData = dataset.map(x => Vectors.dense(x._2.map(v => (v - mean) / std))).cache() val labels = dataset.map(_._1).collect() val matrix = new RowMatrix(scaledData) val pcaMatrix = matrix.multiply(matrix.computePrincipalComponents(50)) pcaMatrix.rows.cache() val costWriter = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(s".tmp/MNIST/cost.txt"), true))) //SimpleTSNE.tsne(pcaMatrix, perplexity = 20, maxIterations = 200) BHTSNE.tsne(pcaMatrix, maxIterations = 500, callback = { //LBFGSTSNE.tsne(pcaMatrix, perplexity = 10, maxNumIterations = 500, numCorrections = 10, convergenceTol = 1e-8) case (i, y, loss) => if(loss.isDefined) logger.info(s"$i iteration finished with loss $loss") val os = fs.create(new Path(s".tmp/MNIST/result${"%05d".format(i)}.csv"), true) val writer = new BufferedWriter(new OutputStreamWriter(os)) try { (0 until y.rows).foreach { row => writer.write(labels(row).toString) writer.write(y(row, ::).inner.toArray.mkString(",", ",", "\n")) } if(loss.isDefined) costWriter.write(loss.get + "\n") } finally { writer.close() } }) costWriter.close() sc.stop() } }
Example 82
Source File: RMCallbackHandler.scala From DataXServer with Apache License 2.0 | 5 votes |
package org.tianlangstudio.data.hamal.yarn import java.io.File import java.util.{Collections, List} import org.tianlangstudio.data.hamal.core.{Constants, HamalConf} import org.tianlangstudio.data.hamal.core.HamalConf //import java.util.Collections import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path, FileContext} import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.client.api.{AMRMClient, NMClient} import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import scala.jdk.CollectionConverters._ //import scala.collection.JavaConverters._ /** * Created by zhuhq on 2016/4/29. */ class RMCallbackHandler(nmClient:NMClient,containerCmd:Container => String,hamalConf: HamalConf,yarnConfiguration: Configuration) extends AMRMClientAsync.CallbackHandler { private val logging = org.slf4j.LoggerFactory.getLogger(classOf[RMCallbackHandler]) override def onContainersCompleted(statuses: List[ContainerStatus]): Unit = { for(containerStatus <- statuses.asScala) { logging.info(s"containerId:${containerStatus} exitStatus:${containerStatus}") } } override def onError(e: Throwable): Unit = { logging.error("on error",e) } override def getProgress: Float = { 0 } override def onShutdownRequest(): Unit = { logging.info("on shutdown request") } override def onNodesUpdated(updatedNodes: List[NodeReport]): Unit = { logging.info("on nodes updated") for(nodeReport <- updatedNodes.asScala) { logging.info(s"node id:${nodeReport} node labels:${nodeReport}"); } } override def onContainersAllocated(containers: List[Container]): Unit = { logging.info("on containers allocated"); for (container:Container <- containers.asScala) { try { // Launch container by create ContainerLaunchContext val ctx = Records.newRecord(classOf[ContainerLaunchContext]); //ctx.setCommands(Collections.singletonList(""" echo "begin";sleep 900;echo "end"; """)) ctx.setCommands(Collections.singletonList(containerCmd(container))) val packagePath = hamalConf.getString(Constants.DATAX_EXECUTOR_FILE,"executor.zip"); val archiveStat = FileSystem.get(yarnConfiguration).getFileStatus(new Path(packagePath)) val packageUrl = ConverterUtils.getYarnUrlFromPath( FileContext.getFileContext.makeQualified(new Path(packagePath))); val packageResource = Records.newRecord[LocalResource](classOf[LocalResource]) packageResource.setResource(packageUrl); packageResource.setSize(archiveStat.getLen); packageResource.setTimestamp(archiveStat.getModificationTime); packageResource.setType(LocalResourceType.ARCHIVE); packageResource.setVisibility(LocalResourceVisibility.APPLICATION) ctx.setLocalResources(Collections.singletonMap(Constants.DATAX_EXECUTOR_ARCHIVE_FILE_NAME,packageResource)) logging.info("[AM] Launching container " + container.getId()); nmClient.startContainer(container, ctx); } catch { case ex:Exception => logging.info("[AM] Error launching container " + container.getId() + " " + ex); } } } }
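A sketch of how a handler like this is normally registered with an asynchronous AM-RM client. The NMClient, container command, HamalConf and heartbeat interval are all assumptions standing in for the surrounding ApplicationMaster code.
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
import org.apache.hadoop.yarn.client.api.NMClient
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.tianlangstudio.data.hamal.core.HamalConf
import org.tianlangstudio.data.hamal.yarn.RMCallbackHandler

object AmSketch {
  def register(nmClient: NMClient, hamalConf: HamalConf): AMRMClientAsync[ContainerRequest] = {
    val yarnConf = new YarnConfiguration()
    // the container command is a placeholder; real code would build the executor launch command
    val handler = new RMCallbackHandler(nmClient, _ => "echo hello", hamalConf, yarnConf)
    val amRMClient = AMRMClientAsync.createAMRMClientAsync[ContainerRequest](1000, handler)
    amRMClient.init(yarnConf)
    amRMClient.start()
    amRMClient
  }
}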
Example 83
Source File: GenericMainClass.scala From darwin with Apache License 2.0 | 5 votes |
package it.agilelab.darwin.app.spark import java.text.SimpleDateFormat import java.util.Date import com.typesafe.config.{Config, ConfigFactory} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.SparkSession import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ trait GenericMainClass { self: SparkManager => val genericMainClassLogger: Logger = LoggerFactory.getLogger("SparkManager") private def makeFileSystem(session: SparkSession): FileSystem = { if (session.sparkContext.isLocal) { FileSystem.getLocal(session.sparkContext.hadoopConfiguration) } else { FileSystem.get(session.sparkContext.hadoopConfiguration) } } // scalastyle:off private def getGlobalConfig: Config = { genericMainClassLogger.debug("system environment vars") for ((k, v) <- System.getenv().asScala.toSeq.sortBy(_._1)) genericMainClassLogger.debug(s"$k -> $v") genericMainClassLogger.debug("system properties") for ((k, v) <- System.getProperties.asScala.toSeq.sortBy(_._1)) genericMainClassLogger.debug(s"$k -> $v") ConfigFactory.load() } // scalastyle:on }
Example 84
Source File: SchemaManagerSparkApp.scala From darwin with Apache License 2.0 | 5 votes |
package it.agilelab.darwin.app.spark import java.nio.ByteOrder import com.typesafe.config.{Config, ConfigFactory} import it.agilelab.darwin.app.spark.classes._ import it.agilelab.darwin.manager.AvroSchemaManagerFactory import org.apache.avro.reflect.ReflectData import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.SparkSession import org.slf4j.{Logger, LoggerFactory} object SchemaManagerSparkApp extends GenericMainClass with SparkManager { val mainLogger: Logger = LoggerFactory.getLogger("SchemaManagerSparkApp") val endianness: ByteOrder = ByteOrder.BIG_ENDIAN override protected def runJob(settings: Config)(implicit fs: FileSystem, sparkSession: SparkSession): Int = { import sparkSession.implicits._ val ds = sparkSession.createDataset(sparkSession.sparkContext.parallelize(1 to 1000, 20)) mainLogger.info("Registering schemas") // val reflections = new Reflections("it.agilelab.darwin.app.spark.classes") // val annotationClass: Class[AvroSerde] = classOf[AvroSerde] // val classes = reflections.getTypesAnnotatedWith(annotationClass).asScala.toSeq // .filter(c => !c.isInterface && !Modifier.isAbstract(c.getModifiers)) // val schemas = classes.map(c => ReflectData.get().getSchema(Class.forName(c.getName))) val schemas = Seq(ReflectData.get().getSchema(classOf[Menu]), ReflectData.get().getSchema(classOf[MenuItem]), ReflectData.get().getSchema(classOf[Food]), ReflectData.get().getSchema(classOf[Order]), ReflectData.get().getSchema(classOf[Price])) val conf = ConfigFactory.load() val manager = AvroSchemaManagerFactory.initialize(conf) val registeredIDs: Seq[Long] = manager.registerAll(schemas).map(_._1) mainLogger.info("Schemas registered") mainLogger.info("Getting ID for a schema") manager.getId(ReflectData.get().getSchema(classOf[Menu])) mainLogger.info("ID retrieved for the schema") mainLogger.info("Get Schema from ID") val d2 = ds.map { x => AvroSchemaManagerFactory.initialize(conf).getSchema(registeredIDs(x % registeredIDs.size)) x } d2.count() mainLogger.info("All schemas obtained") 10 } override protected def handleException(exception: Throwable, applicationSettings: Config): Unit = { mainLogger.error(exception.getMessage) } }
Example 85
Source File: ImageLoaderUtils.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders import java.awt.image.BufferedImage import java.io.{InputStream, ByteArrayInputStream} import java.net.URI import java.util.zip.GZIPInputStream import javax.imageio.ImageIO import keystoneml.loaders.VOCLoader._ import org.apache.commons.compress.archivers.ArchiveStreamFactory import org.apache.commons.compress.archivers.tar.TarArchiveInputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import keystoneml.pipelines.Logging import keystoneml.utils._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag object ImageLoaderUtils extends Logging { def loadFiles[L, I <: AbstractLabeledImage[L] : ClassTag]( filePathsRDD: RDD[URI], labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, // TODO(etrain): We can probably do this with implicits. namePrefix: Option[String] = None): RDD[I] = { filePathsRDD.flatMap(fileUri => loadFile(fileUri, labelsMap, imageBuilder, namePrefix)) } private def loadFile[L, I <: AbstractLabeledImage[L]]( fileUri: URI, labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, namePrefix: Option[String]): Iterator[I] = { val filePath = new Path(fileUri) val conf = new Configuration(true) val fs = FileSystem.get(filePath.toUri(), conf) val fStream = fs.open(filePath) val tarStream = new ArchiveStreamFactory().createArchiveInputStream( "tar", fStream).asInstanceOf[TarArchiveInputStream] var entry = tarStream.getNextTarEntry() val imgs = new ArrayBuffer[I] while (entry != null) { if (!entry.isDirectory && (namePrefix.isEmpty || entry.getName.startsWith(namePrefix.get))) { var offset = 0 var ret = 0 val content = new Array[Byte](entry.getSize().toInt) while (ret >= 0 && offset != entry.getSize()) { ret = tarStream.read(content, offset, content.length - offset) if (ret >= 0) { offset += ret } } val bais = new ByteArrayInputStream(content) val image = ImageUtils.loadImage(bais).map { img => imageBuilder(img, labelsMap(entry.getName), Some(entry.getName)) } imgs ++= image } entry = tarStream.getNextTarEntry() } imgs.iterator } }
Example 86
Source File: QueryPartitionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.io.File import java.sql.Timestamp import com.google.common.io.Files import org.apache.hadoop.fs.FileSystem import org.apache.spark.internal.config._ import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import spark.implicits._ private def queryWhenPathNotExist(): Unit = { withTempView("testData") { withTable("table_with_partition", "createAndInsertTest") { withTempDir { tmpDir => val testData = sparkContext.parallelize( (1 to 10).map(i => TestData(i, i.toString))).toDF() testData.createOrReplaceTempView("testData") // create the table for test sql(s"CREATE TABLE table_with_partition(key int,value string) " + s"PARTITIONED by (ds string) location '${tmpDir.toURI}' ") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + "SELECT key,value FROM testData") // test for the exist path checkAnswer(sql("select key,value from table_with_partition"), testData.union(testData).union(testData).union(testData)) // delete the path of one partition tmpDir.listFiles .find { f => f.isDirectory && f.getName().startsWith("ds=") } .foreach { f => Utils.deleteRecursively(f) } // test for after delete the path checkAnswer(sql("select key,value from table_with_partition"), testData.union(testData).union(testData)) } } } } test("SPARK-5068: query data when path doesn't exist") { withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") { queryWhenPathNotExist() } } test("Replace spark.sql.hive.verifyPartitionPath by spark.files.ignoreMissingFiles") { withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "false") { sparkContext.conf.set(IGNORE_MISSING_FILES.key, "true") queryWhenPathNotExist() } } test("SPARK-21739: Cast expression should initialize timezoneId") { withTable("table_with_timestamp_partition") { sql("CREATE TABLE table_with_timestamp_partition(value int) PARTITIONED BY (ts TIMESTAMP)") sql("INSERT OVERWRITE TABLE table_with_timestamp_partition " + "PARTITION (ts = '2010-01-01 00:00:00.000') VALUES (1)") // test for Cast expression in TableReader checkAnswer(sql("SELECT * FROM table_with_timestamp_partition"), Seq(Row(1, Timestamp.valueOf("2010-01-01 00:00:00.000")))) // test for Cast expression in HiveTableScanExec checkAnswer(sql("SELECT value FROM table_with_timestamp_partition " + "WHERE ts = '2010-01-01 00:00:00.000'"), Row(1)) } } }
Example 87
Source File: ParquetFileFormatSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkException import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext { test("read parquet footers in parallel") { def testReadFooters(ignoreCorruptFiles: Boolean): Unit = { withTempDir { dir => val fs = FileSystem.get(spark.sessionState.newHadoopConf()) val basePath = dir.getCanonicalPath val path1 = new Path(basePath, "first") val path2 = new Path(basePath, "second") val path3 = new Path(basePath, "third") spark.range(1).toDF("a").coalesce(1).write.parquet(path1.toString) spark.range(1, 2).toDF("a").coalesce(1).write.parquet(path2.toString) spark.range(2, 3).toDF("a").coalesce(1).write.json(path3.toString) val fileStatuses = Seq(fs.listStatus(path1), fs.listStatus(path2), fs.listStatus(path3)).flatten val footers = ParquetFileFormat.readParquetFootersInParallel( spark.sessionState.newHadoopConf(), fileStatuses, ignoreCorruptFiles) assert(footers.size == 2) } } testReadFooters(true) val exception = intercept[SparkException] { testReadFooters(false) }.getCause assert(exception.getMessage().contains("Could not read footer for file")) } }
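The test above collects its inputs with FileSystem.listStatus before handing them to the footer reader. A stripped-down sketch of that listing step; the object name and directories are placeholders:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

object ListParquetParts {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val fs = FileSystem.get(conf)
    val dirs = Seq(new Path("/tmp/first"), new Path("/tmp/second")) // hypothetical output dirs
    // listStatus returns one FileStatus per direct child; flatten the per-directory arrays
    val statuses: Seq[FileStatus] =
      dirs.flatMap(dir => if (fs.exists(dir)) fs.listStatus(dir).toSeq else Seq.empty)
    statuses
      .filter(s => s.isFile && s.getPath.getName.endsWith(".parquet"))
      .foreach(s => println(s"${s.getPath} (${s.getLen} bytes)"))
  }
}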
Example 88
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 89
Source File: UnsplittableSequenceFileInputFormat.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.hadoop.splits import java.io.IOException import java.util import org.apache.hadoop.fs.{ FileStatus, FileSystem, Path ⇒ HPath } import org.apache.hadoop.mapred.{ JobConf, SequenceFileInputFormat } import org.apache.hadoop.mapreduce.JobContext import org.apache.hadoop.mapreduce.lib.input import scala.collection.JavaConverters._ override def listStatus(job: JobContext): util.List[FileStatus] = super .listStatus(job) .asScala .sortBy { _.getPath.getName match { case PartFileBasename(idx) ⇒ idx case basename ⇒ throw new IllegalArgumentException(s"Bad partition file: $basename") } } .asJava }
Example 90
Source File: HadoopBundleFileSystem.scala From mleap with Apache License 2.0 | 5 votes |
package ml.bundle.hdfs import java.io.File import java.net.URI import java.nio.file.{Files, Paths} import com.typesafe.config.Config import ml.combust.bundle.fs.BundleFileSystem import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Try import scala.collection.JavaConverters._ object HadoopBundleFileSystem { lazy val defaultSchemes: Seq[String] = Seq("hdfs") def createHadoopConfiguration(config: Config): Configuration = { val options: Map[String, String] = if(config.hasPath("options")) { config.getConfig("options").entrySet().asScala.map { entry => (entry.getKey, entry.getValue.unwrapped().toString) }.toMap } else { Map() } val c = new Configuration() for ((key, value) <- options) { c.set(key, value) } c } def createSchemes(config: Config): Seq[String] = if (config.hasPath("schemes")) { config.getStringList("schemes").asScala } else { Seq("hdfs") } } class HadoopBundleFileSystem(fs: FileSystem, override val schemes: Seq[String] = HadoopBundleFileSystem.defaultSchemes) extends BundleFileSystem { def this(config: Config) = { this(FileSystem.get(HadoopBundleFileSystem.createHadoopConfiguration(config)), HadoopBundleFileSystem.createSchemes(config)) } override def load(uri: URI): Try[File] = Try { val tmpDir = Files.createTempDirectory("hdfs-bundle") val tmpFile = Paths.get(tmpDir.toString, "bundle.zip") fs.copyToLocalFile(new Path(uri.toString), new Path(tmpFile.toString)) tmpFile.toFile } override def save(uri: URI, localFile: File): Unit = { fs.copyFromLocalFile(new Path(localFile.toString), new Path(uri.toString)) } }
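Both load and save above come down to the two FileSystem copy helpers, copyToLocalFile and copyFromLocalFile. A small sketch of that round trip with invented paths; here the filesystem is resolved from the remote path itself rather than from a preconfigured Configuration:

import java.nio.file.Files
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object BundleCopyRoundTrip {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val remote = new Path("hdfs:///models/bundle.zip")           // hypothetical remote bundle
    val fs: FileSystem = remote.getFileSystem(conf)              // filesystem for that scheme
    val localDir = Files.createTempDirectory("bundle-roundtrip") // scratch space on the driver

    // Download: remote -> local working copy
    val localCopy = new Path(localDir.resolve("bundle.zip").toString)
    fs.copyToLocalFile(remote, localCopy)

    // Upload: local file -> (another) remote location
    val target = new Path("hdfs:///models/bundle-copy.zip")
    fs.copyFromLocalFile(localCopy, target)
    println(s"copied $remote -> $localCopy -> $target")
  }
}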
Example 91
Source File: HadoopBundleFileSystemSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.bundle.hdfs import java.net.URI import java.nio.file.{Files, Paths} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.FunSpec class HadoopBundleFileSystemSpec extends FunSpec { private val fs = FileSystem.get(new Configuration()) private val bundleFs = new HadoopBundleFileSystem(fs) describe("scheme") { it("returns hdfs") { assert(bundleFs.schemes == Seq("hdfs")) } } describe("load") { it("loads a file from hadoop and saves to a local file") { val testFile = Files.createTempFile("HadoopBundleFileSystemSpec", ".txt") Files.write(testFile.toAbsolutePath, "HELLO".getBytes()) val loadedFile = bundleFs.load(testFile.toUri).get val contents = new String(Files.readAllBytes(loadedFile.toPath)) assert(contents == "HELLO") } } describe("save") { it("saves local file to HDFS") { val testFile = Files.createTempFile("HadoopBundleFileSystemSpec", ".txt") Files.write(testFile.toAbsolutePath, "HELLO".getBytes()) val tmpDir = Files.createTempDirectory("HadoopBundleFileSystemSpec") val tmpFile = new URI(s"file://$tmpDir/test.txt") bundleFs.save(tmpFile, testFile.toFile) val contents = new String(Files.readAllBytes(Paths.get(tmpFile))) assert(contents == "HELLO") } } }
Example 92
Source File: SparkBundleContext.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle import ml.bundle.hdfs.HadoopBundleFileSystem import ml.combust.bundle.{BundleRegistry, HasBundleRegistry} import ml.combust.mleap.ClassLoaderUtil import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.DataFrame object SparkBundleContext { val DEFAULT_REGISTRY_KEY: String = "ml.combust.mleap.spark.registry.default" implicit lazy val defaultContext: SparkBundleContext = SparkBundleContext(None, Some(classOf[SparkBundleContext].getClassLoader)) def apply(dataset: Option[DataFrame] = None): SparkBundleContext = apply(dataset, None) def apply(dataset: Option[DataFrame], clOption: Option[ClassLoader]): SparkBundleContext = { val cl = clOption.getOrElse(ClassLoaderUtil.findClassLoader(classOf[SparkBundleContext].getCanonicalName)) apply(dataset, BundleRegistry(DEFAULT_REGISTRY_KEY, Some(cl))) } } case class SparkBundleContext(dataset: Option[DataFrame], override val bundleRegistry: BundleRegistry) extends HasBundleRegistry { def withDataset(dataset: DataFrame): SparkBundleContext = withDataset(dataset, registerHdfs = true) def withDataset(dataset: DataFrame, registerHdfs: Boolean): SparkBundleContext = { val bundleRegistry2 = if (registerHdfs) { bundleRegistry.registerFileSystem( new HadoopBundleFileSystem(FileSystem.get( dataset.sqlContext.sparkSession.sparkContext.hadoopConfiguration))) } else { bundleRegistry } copy(dataset = Some(dataset), bundleRegistry = bundleRegistry2) } }
Example 93
Source File: TestSpec.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata import java.io.ByteArrayInputStream import java.nio.file.Files import com.coxautodata.objects.SerializableFileStatus import com.coxautodata.utils.FileListing import org.apache.commons.io.{FileUtils, IOUtils} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.scalatest.{BeforeAndAfterEach, FunSpec, Matchers} trait TestSpec extends FunSpec with Matchers with BeforeAndAfterEach { var testingBaseDir: java.nio.file.Path = _ var testingBaseDirName: String = _ var testingBaseDirPath: Path = _ var localFileSystem: LocalFileSystem = _ override def beforeEach(): Unit = { super.beforeEach() testingBaseDir = Files.createTempDirectory("test_output") testingBaseDirName = testingBaseDir.toString localFileSystem = FileSystem.getLocal(new Configuration()) testingBaseDirPath = localFileSystem.makeQualified(new Path(testingBaseDirName)) } override def afterEach(): Unit = { super.afterEach() FileUtils.deleteDirectory(testingBaseDir.toFile) } def createFile(relativePath: Path, content: Array[Byte]): SerializableFileStatus = { val path = new Path(testingBaseDirPath, relativePath) localFileSystem.mkdirs(path.getParent) val in = new ByteArrayInputStream(content) val out = localFileSystem.create(path) IOUtils.copy(in, out) in.close() out.close() SerializableFileStatus(localFileSystem.getFileStatus(path)) } def fileStatusToResult(f: SerializableFileStatus): FileListing = { FileListing(f.getPath.toString, if (f.isFile) Some(f.getLen) else None) } }
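The fixture above uses FileSystem.getLocal plus makeQualified so that Path-based code can be exercised against a plain local directory. A compact sketch of that setup, writing one small file through the FileSystem API; names and contents are invented:

import java.io.ByteArrayInputStream
import java.nio.file.Files
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object LocalFsFixture {
  def main(args: Array[String]): Unit = {
    val baseDir = Files.createTempDirectory("fs_fixture")
    val localFs = FileSystem.getLocal(new Configuration())
    // makeQualified adds the file:// scheme so the path is unambiguous
    val basePath = localFs.makeQualified(new Path(baseDir.toString))

    val file = new Path(basePath, "data/part-00000.txt")
    localFs.mkdirs(file.getParent)
    val out = localFs.create(file)
    val in = new ByteArrayInputStream("hello".getBytes("UTF-8"))
    try IOUtils.copy(in, out) finally { in.close(); out.close() }

    println(localFs.getFileStatus(file).getLen) // 5
  }
}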
Example 94
Source File: GeoJSONRelation.scala From magellan with Apache License 2.0 | 5 votes |
package magellan import magellan.mapreduce.WholeFileInputFormat import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.io.{NullWritable, Text} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.json4s._ import org.json4s.jackson.JsonMethods._ case class GeoJSONRelation( path: String, parameters: Map[String, String]) (@transient val sqlContext: SQLContext) extends SpatialRelation { protected override def _buildScan(): RDD[Array[Any]] = { val conf = sc.hadoopConfiguration FileSystem.getLocal(conf) sc.newAPIHadoopFile( path, classOf[WholeFileInputFormat], classOf[NullWritable], classOf[Text]).flatMap { case (k, v) => val line = v.toString() parseShapeWithMeta(line) }.map { case (shape: Shape, meta: Option[Map[String, String]]) => Array(shape, meta) } } private def parseShapeWithMeta(line: String) = { val tree = parse(line) implicit val formats = org.json4s.DefaultFormats val result = tree.extract[GeoJSON] result.features.flatMap { f => f.geometry.shapes.map(shape => (shape, f.properties)) } } }
Example 95
Source File: WholeFileReader.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import java.io.InputStream import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor} import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext} class WholeFileReader extends RecordReader[NullWritable, Text] { private val key = NullWritable.get() private val value = new Text() private var split: FileSplit = _ private var conf: Configuration = _ private var path: Path = _ private var done: Boolean = false override def getProgress: Float = ??? override def nextKeyValue(): Boolean = { if (done){ false } else { val fs = path.getFileSystem(conf) var is: FSDataInputStream = null var in: InputStream = null var decompressor: Decompressor = null try { is = fs.open(split.getPath) val codec = new CompressionCodecFactory(conf).getCodec(path) if (codec != null) { decompressor = CodecPool.getDecompressor(codec) in = codec.createInputStream(is, decompressor) } else { in = is } val result = IOUtils.toByteArray(in) value.clear() value.set(result) done = true true } finally { if (in != null) { IOUtils.closeQuietly(in) } if (decompressor != null) { CodecPool.returnDecompressor(decompressor) } } } } override def getCurrentValue: Text = value override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext): Unit = { this.split = inputSplit.asInstanceOf[FileSplit] this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext) this.path = this.split.getPath } override def getCurrentKey: NullWritable = key override def close() {} }
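The record reader above demonstrates the usual codec-aware open: get the FileSystem from the Path itself, then let CompressionCodecFactory decide whether the raw stream needs a decompressing wrapper. A reduced sketch of only that part (it skips the decompressor pooling done above, and the file path is a placeholder):

import java.io.InputStream
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress.CompressionCodecFactory

object CodecAwareOpen {
  // Returns a stream that is already decompressed if the file extension maps to a codec
  def open(path: Path, conf: Configuration): InputStream = {
    val fs = path.getFileSystem(conf) // resolve the right FileSystem for this path's scheme
    val raw = fs.open(path)
    val codec = new CompressionCodecFactory(conf).getCodec(path)
    if (codec != null) codec.createInputStream(raw) else raw
  }

  def main(args: Array[String]): Unit = {
    val in = open(new Path("/tmp/sample.json.gz"), new Configuration()) // hypothetical file
    try println(in.read()) finally in.close()
  }
}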
Example 96
Source File: ShxReaderSuite.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import magellan.TestSparkContext import magellan.io.PolygonReader import org.apache.commons.io.EndianUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{ArrayWritable, LongWritable, Text} import org.scalatest.FunSuite class ShxReaderSuite extends FunSuite with TestSparkContext { test("Read shx file") { val path = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shx").getPath val conf = new Configuration() conf.set("mapreduce.input.fileinputformat.split.maxsize", "10000") val data = sc.newAPIHadoopFile( path, classOf[ShxInputFormat], classOf[Text], classOf[ArrayWritable], conf ).map { case (txt: Text, splits: ArrayWritable) => val fileName = txt.toString val s = splits.get() val size = s.length var i = 0 val v = Array.fill(size)(0L) while (i < size) { v.update(i, s(i).asInstanceOf[LongWritable].get()) i += 1 } (fileName, v) } assert(data.count() === 1) val (fileName, splits) = data.first() assert(fileName === "tl_2016_us_state") // the offsets should be correct val firstOffset = splits(0) val secondOffset = splits(1) // skipping to the first offset in the Shapefile should allow me to read the first polygon val shpFilePath = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shp").getPath val fs = FileSystem.get(sc.hadoopConfiguration) var dis = fs.open(new Path(shpFilePath)) // skip firstOffset # of bytes dis.seek(firstOffset) // skip record number assert(dis.readInt() === 1) // read content length var contentLength = 16 * (dis.readInt() + 4) // extract the shape type var shapeType = EndianUtils.swapInteger(dis.readInt()) // expect a Polygon assert(shapeType === 5) // the first polygon's content should follow from here val polygonReader = new PolygonReader() val polygon = polygonReader.readFields(dis) assert(polygon != null) // seek to the second offset dis.seek(secondOffset) assert(dis.readInt() === 2) } }
Example 97
Source File: WorkbookReader.scala From spark-excel with Apache License 2.0 | 5 votes |
package com.crealytics.spark.excel import java.io.InputStream import com.crealytics.spark.excel.Utils.MapIncluding import com.github.pjfanning.xlsx.StreamingReader import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.poi.ss.usermodel.{Workbook, WorkbookFactory} trait WorkbookReader { protected def openWorkbook(): Workbook def withWorkbook[T](f: Workbook => T): T = { val workbook = openWorkbook() val res = f(workbook) workbook.close() res } def sheetNames: Seq[String] = { withWorkbook( workbook => for (sheetIx <- (0 until workbook.getNumberOfSheets())) yield { workbook.getSheetAt(sheetIx).getSheetName() } ) } } object WorkbookReader { val WithLocationMaxRowsInMemoryAndPassword = MapIncluding(Seq("path"), optionally = Seq("maxRowsInMemory", "workbookPassword")) def apply(parameters: Map[String, String], hadoopConfiguration: Configuration): WorkbookReader = { def readFromHadoop(location: String) = { val path = new Path(location) FileSystem.get(path.toUri, hadoopConfiguration).open(path) } parameters match { case WithLocationMaxRowsInMemoryAndPassword(Seq(location), Seq(Some(maxRowsInMemory), passwordOption)) => new StreamingWorkbookReader(readFromHadoop(location), passwordOption, maxRowsInMemory.toInt) case WithLocationMaxRowsInMemoryAndPassword(Seq(location), Seq(None, passwordOption)) => new DefaultWorkbookReader(readFromHadoop(location), passwordOption) } } } class DefaultWorkbookReader(inputStreamProvider: => InputStream, workbookPassword: Option[String]) extends WorkbookReader { protected def openWorkbook(): Workbook = workbookPassword .fold(WorkbookFactory.create(inputStreamProvider))( password => WorkbookFactory.create(inputStreamProvider, password) ) } class StreamingWorkbookReader(inputStreamProvider: => InputStream, workbookPassword: Option[String], maxRowsInMem: Int) extends WorkbookReader { override protected def openWorkbook(): Workbook = { val builder = StreamingReader .builder() .rowCacheSize(maxRowsInMem) .bufferSize(4096) workbookPassword .fold(builder)(password => builder.password(password)) .open(inputStreamProvider) } }
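readFromHadoop above is worth isolating: because the FileSystem is resolved from the path's own URI, the same reader works for hdfs://, s3a:// or plain local file:// locations. A minimal standalone version; the object name and location are examples, not part of the project:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}

object OpenAnywhere {
  def openStream(location: String, conf: Configuration): FSDataInputStream = {
    val path = new Path(location)
    // The scheme in `location` decides which FileSystem implementation is used
    FileSystem.get(path.toUri, conf).open(path)
  }

  def main(args: Array[String]): Unit = {
    val in = openStream("file:///tmp/report.xlsx", new Configuration()) // hypothetical file
    try println(s"first byte: ${in.read()}") finally in.close()
  }
}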
Example 98
Source File: ExcelFileSaver.scala From spark-excel with Apache License 2.0 | 5 votes |
package com.crealytics.spark.excel import java.io.BufferedOutputStream import com.crealytics.spark.excel.ExcelFileSaver.{DEFAULT_DATE_FORMAT, DEFAULT_SHEET_NAME, DEFAULT_TIMESTAMP_FORMAT} import com.norbitltd.spoiwo.model._ import com.norbitltd.spoiwo.natures.xlsx.Model2XlsxConversions._ import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.poi.ss.util.CellRangeAddress import org.apache.poi.xssf.usermodel.XSSFWorkbook import org.apache.spark.sql.{DataFrame, SaveMode} import scala.collection.JavaConverters._ object ExcelFileSaver { final val DEFAULT_SHEET_NAME = "Sheet1" final val DEFAULT_DATE_FORMAT = "yy-m-d h:mm" final val DEFAULT_TIMESTAMP_FORMAT = "yyyy-mm-dd hh:mm:ss.000" } class ExcelFileSaver( fs: FileSystem, location: Path, dataFrame: DataFrame, saveMode: SaveMode, dataLocator: DataLocator, header: Boolean = true ) { def save(): Unit = { def sheet(workbook: XSSFWorkbook) = { val headerRow = if (header) Some(dataFrame.schema.fields.map(_.name).toSeq) else None val dataRows = dataFrame .toLocalIterator() .asScala .map(_.toSeq) dataLocator.toSheet(headerRow, dataRows, workbook) } val fileAlreadyExists = fs.exists(location) def writeToWorkbook(workbook: XSSFWorkbook): Unit = { Workbook(sheet(workbook)).writeToExisting(workbook) autoClose(new BufferedOutputStream(fs.create(location)))(workbook.write) } (fileAlreadyExists, saveMode) match { case (false, _) | (_, SaveMode.Overwrite) => if (fileAlreadyExists) { fs.delete(location, true) } writeToWorkbook(new XSSFWorkbook()) case (true, SaveMode.ErrorIfExists) => sys.error(s"path $location already exists.") case (true, SaveMode.Ignore) => () case (true, SaveMode.Append) => val inputStream: FSDataInputStream = fs.open(location) val workbook = new XSSFWorkbook(inputStream) inputStream.close() writeToWorkbook(workbook) } } def autoClose[A <: AutoCloseable, B](closeable: A)(fun: (A) => B): B = { try { fun(closeable) } finally { closeable.close() } } }
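The save method above is a compact catalogue of the FileSystem calls behind SaveMode semantics: exists, delete, create and open. A condensed sketch of the same branching for a plain byte payload; the Mode ADT, paths and content are invented for illustration:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object SaveModes {
  sealed trait Mode
  case object Overwrite extends Mode; case object ErrorIfExists extends Mode; case object Ignore extends Mode

  def save(fs: FileSystem, target: Path, bytes: Array[Byte], mode: Mode): Unit = {
    val alreadyThere = fs.exists(target)
    (alreadyThere, mode) match {
      case (true, ErrorIfExists) => sys.error(s"path $target already exists.")
      case (true, Ignore)        => () // silently keep the existing file
      case _ =>
        if (alreadyThere) fs.delete(target, true) // recursive delete, like the Overwrite branch above
        val out = fs.create(target)
        try out.write(bytes) finally out.close()
    }
  }

  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())
    save(fs, new Path("/tmp/savemode-demo.bin"), Array[Byte](1, 2, 3), Overwrite)
  }
}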
Example 99
Source File: ParquetWriterTask.scala From gearpump-examples with Apache License 2.0 | 5 votes |
package io.gearpump.examples.kafka_hdfs_pipeline import org.apache.avro.Schema import io.gearpump.Message import io.gearpump.cluster.UserConfig import io.gearpump.examples.kafka_hdfs_pipeline.ParquetWriterTask._ import io.gearpump.streaming.task.{StartTime, Task, TaskContext} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.parquet.avro.AvroParquetWriter import scala.util.{Failure, Success, Try} class ParquetWriterTask(taskContext : TaskContext, config: UserConfig) extends Task(taskContext, config) { val outputFileName = taskContext.appName + ".parquet" val absolutePath = Option(getHdfs + config.getString(PARQUET_OUTPUT_DIRECTORY).getOrElse("/parquet") + "/" + outputFileName).map(deleteFile(_)).get val outputPath = new Path(absolutePath) var parquetWriter = new AvroParquetWriter[SpaceShuttleRecord](outputPath, SpaceShuttleRecord.SCHEMA$) def getYarnConf = new YarnConfiguration def getFs = FileSystem.get(getYarnConf) def getHdfs = new Path(getFs.getHomeDirectory, "gearpump") private def deleteFile(fileName: String): String = { val file = new Path(fileName) getFs.exists(file) match { case true => getFs.delete(file,false) case false => } fileName } override def onStart(startTime: StartTime): Unit = { LOG.info(s"ParquetWriter.onStart $absolutePath") } override def onNext(msg: Message): Unit = { Try({ parquetWriter.write(msg.msg.asInstanceOf[SpaceShuttleRecord]) }) match { case Success(ok) => case Failure(throwable) => LOG.error(s"failed ${throwable.getMessage}") } } override def onStop(): Unit = { LOG.info("ParquetWriter.onStop") parquetWriter.close() } } object ParquetWriterTask { val PARQUET_OUTPUT_DIRECTORY = "parquet.output.directory" val PARQUET_WRITER = "parquet.writer" }
Example 100
Source File: ParquetWriterTaskSpec.scala From gearpump-examples with Apache License 2.0 | 5 votes |
package io.gearpump.examples.kafka_hdfs_pipeline import akka.actor.ActorSystem import org.apache.avro.Schema import io.gearpump.Message import io.gearpump.cluster.UserConfig import io.gearpump.streaming.MockUtil import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.parquet.avro.{AvroParquetReader, AvroParquetWriter} import org.apache.parquet.hadoop.ParquetReader import org.apache.parquet.hadoop.api.ReadSupport import org.mockito.Mockito import org.mockito.Mockito._ import org.scalatest.prop.PropertyChecks import org.scalatest.{BeforeAndAfterAll, Matchers, PropSpec} class ParquetWriterTaskSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfterAll { implicit var system: ActorSystem = ActorSystem("PipeLineSpec") val context = MockUtil.mockTaskContext val appName = "KafkaHdfsPipeLine" when(context.appName).thenReturn(appName) val fs = FileSystem.get(new YarnConfiguration) val homeDir = fs.getHomeDirectory.toUri.getPath val parquetDir = new Path(homeDir, "gearpump") + "/parquet/" val parquetPath = parquetDir + appName + ".parquet" val parquetCrc = parquetDir + "." + appName + ".parquet.crc" val parquetWriter = Mockito.mock(classOf[AvroParquetWriter[SpaceShuttleRecord]]) val anomaly = 0.252 val now = System.currentTimeMillis val userConfig = UserConfig.empty.withString(ParquetWriterTask.PARQUET_OUTPUT_DIRECTORY, "/parquet") override def afterAll(): Unit = { List(parquetPath, parquetCrc, parquetDir).foreach(new java.io.File(_).delete) system.shutdown() } property("ParquetWriterTask should initialize with local parquet file opened for writing") { val parquetWriterTask = new ParquetWriterTask(context, userConfig) val path = parquetWriterTask.absolutePath.stripPrefix("file:") assert(parquetPath.equals(path)) parquetWriterTask.onStop } property("ParquetWriterTask should write records to a parquet file") { val message = Message(SpaceShuttleRecord(now, anomaly), now) val parquetWriterTask = new ParquetWriterTask(context, userConfig) parquetWriterTask.parquetWriter = parquetWriter parquetWriterTask.onNext(message) verify(parquetWriterTask.parquetWriter).write(message.msg.asInstanceOf[SpaceShuttleRecord]) parquetWriterTask.onStop } property("ParquetWriterTask should have verifiable written record") { val message = Message(SpaceShuttleRecord(now, anomaly), now) val parquetWriterTask = new ParquetWriterTask(context, userConfig) parquetWriterTask.onNext(message) parquetWriterTask.onStop val reader = new AvroParquetReader[SpaceShuttleRecord](new Path(parquetPath)) val record = reader.read() assert(message.msg.asInstanceOf[SpaceShuttleRecord].anomaly == record.anomaly) assert(message.msg.asInstanceOf[SpaceShuttleRecord].ts == record.ts) } }
Example 101
Source File: HdfsUtilsTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.serving.core.utils import java.io.{FileNotFoundException, InputStream} import org.apache.hadoop.fs.{FileSystem, _} import org.junit.runner.RunWith import org.mockito.Mockito._ import org.scalatest._ import org.scalatest.junit.JUnitRunner import org.scalatest.mock.MockitoSugar import scala.util.{Failure, Try} @RunWith(classOf[JUnitRunner]) class HdfsUtilsTest extends FlatSpec with ShouldMatchers with MockitoSugar { val fileSystem: FileSystem = mock[FileSystem] val utils = new HdfsUtils(fileSystem, "stratio") "hdfs utils" should "getfiles from a path" in { val expected = Array(mock[FileStatus]) when(fileSystem.listStatus(new Path("myTestPath"))).thenReturn(expected) val result = utils.getFiles("myTestPath") result should be(expected) } it should "return single file as inputStream" in { val expected: InputStream = mock[FSDataInputStream] when(fileSystem.open(new Path("testFile"))).thenReturn(expected.asInstanceOf[FSDataInputStream]) val result: InputStream = utils.getFile("testFile") result should be(expected) } it should "write" in { val result = Try(utils.write("from", "to", true)) match { case Failure(ex: Throwable) => ex } result.isInstanceOf[FileNotFoundException] should be(true) } it should "write without override" in { val result = Try(utils.write("from", "to", false)) match { case Failure(ex: Throwable) => ex } result.isInstanceOf[FileNotFoundException] should be(true) } }
Example 102
Source File: SharedOapContext.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test.oap import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.{OapExtensions, SparkSession} import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec, SparkPlan} import org.apache.spark.sql.execution.datasources.oap.{IndexType, OapFileFormat} import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.oap.{OapDriverRuntime, OapRuntime} import org.apache.spark.sql.test.OapSharedSQLContext trait SharedOapContext extends SharedOapContextBase { protected override def createSparkSession: SparkSession = { SparkSession.cleanupAnyExistingSession() val session = SparkSession.builder() .master("local[2]") .appName("test-oap-context") .config(oapSparkConf).getOrCreate() OapRuntime.getOrCreate.asInstanceOf[OapDriverRuntime].setTestSession(session) session } protected def withFileSystem(f: FileSystem => Unit): Unit = { var fs: FileSystem = null try { fs = FileSystem.get(configuration) f(fs) } finally { if (fs != null) { fs.close() } } } } case class TestPartition(key: String, value: String) case class TestIndex( tableName: String, indexName: String, partitions: TestPartition*)
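withFileSystem above is the loan pattern: obtain a FileSystem, hand it to the caller's function, and guarantee close() in a finally block. A self-contained version of the same idea; the object name and the usage in main are mine, not from OAP. Note that FileSystem.get normally returns a cached instance per scheme and configuration, so closing it eagerly is best reserved for test code like this.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

object FsLoan {
  // Run `f` against a freshly obtained FileSystem and always close it afterwards
  def withFileSystem[T](conf: Configuration)(f: FileSystem => T): T = {
    val fs = FileSystem.get(conf)
    try f(fs) finally fs.close()
  }

  def main(args: Array[String]): Unit = {
    val home = withFileSystem(new Configuration()) { fs =>
      fs.getHomeDirectory // e.g. hdfs://nn/user/<name> or file:/home/<name>
    }
    println(home)
  }
}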
Example 103
Source File: SeqFileStreamProcessor.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.io.File import java.time.Instant import java.util.concurrent.TimeUnit import scala.concurrent.duration.FiniteDuration import akka.actor.Cancellable import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile._ import org.apache.hadoop.io.{SequenceFile, Text} import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.examples.fsio.HadoopConfig._ import org.apache.gearpump.streaming.examples.fsio.SeqFileStreamProcessor._ import org.apache.gearpump.streaming.task.{Task, TaskContext} class SeqFileStreamProcessor(taskContext: TaskContext, config: UserConfig) extends Task(taskContext, config) { import taskContext.taskId val outputPath = new Path(config.getString(OUTPUT_PATH).get + File.separator + taskId) var writer: SequenceFile.Writer = null val textClass = new Text().getClass val key = new Text() val value = new Text() val hadoopConf = config.hadoopConf private var msgCount: Long = 0 private var snapShotKVCount: Long = 0 private var snapShotTime: Long = 0 private var scheduler: Cancellable = null override def onStart(startTime: Instant): Unit = { val fs = FileSystem.get(hadoopConf) fs.deleteOnExit(outputPath) writer = SequenceFile.createWriter(hadoopConf, Writer.file(outputPath), Writer.keyClass(textClass), Writer.valueClass(textClass)) scheduler = taskContext.schedule(new FiniteDuration(5, TimeUnit.SECONDS), new FiniteDuration(5, TimeUnit.SECONDS))(reportStatus()) snapShotTime = System.currentTimeMillis() LOG.info("sequence file bolt initiated") } override def onNext(msg: Message): Unit = { val kv = msg.value.asInstanceOf[String].split("\\+\\+") if (kv.length >= 2) { key.set(kv(0)) value.set(kv(1)) writer.append(key, value) } msgCount += 1 } override def onStop(): Unit = { if (scheduler != null) { scheduler.cancel() } writer.close() LOG.info("sequence file bolt stopped") } private def reportStatus() = { val current: Long = System.currentTimeMillis() LOG.info(s"Task $taskId Throughput: ${ (msgCount - snapShotKVCount, (current - snapShotTime) / 1000) } (KVPairs, second)") snapShotKVCount = msgCount snapShotTime = current } } object SeqFileStreamProcessor { val OUTPUT_PATH = "outputpath" }
Example 104
Source File: SeqFileStreamProducer.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.time.Instant import org.apache.gearpump.streaming.source.Watermark import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile._ import org.apache.hadoop.io.{SequenceFile, Text} import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.examples.fsio.HadoopConfig._ import org.apache.gearpump.streaming.examples.fsio.SeqFileStreamProducer._ import org.apache.gearpump.streaming.task.{Task, TaskContext} class SeqFileStreamProducer(taskContext: TaskContext, config: UserConfig) extends Task(taskContext, config) { import taskContext.output val value = new Text() val key = new Text() var reader: SequenceFile.Reader = _ val hadoopConf = config.hadoopConf val fs = FileSystem.get(hadoopConf) val inputPath = new Path(config.getString(INPUT_PATH).get) override def onStart(startTime: Instant): Unit = { reader = new SequenceFile.Reader(hadoopConf, Reader.file(inputPath)) self ! Start LOG.info("sequence file spout initiated") } override def onNext(msg: Message): Unit = { if (reader.next(key, value)) { output(Message(key + "++" + value)) } else { reader.close() reader = new SequenceFile.Reader(hadoopConf, Reader.file(inputPath)) } self ! Continue } override def onStop(): Unit = { reader.close() } } object SeqFileStreamProducer { def INPUT_PATH: String = "inputpath" val Start = Watermark(Instant.now) val Continue = Watermark(Instant.now) }
Example 105
Source File: SeqFileStreamProcessorSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.io.File import java.time.Instant import scala.collection.mutable.ArrayBuffer import akka.actor.ActorSystem import akka.testkit.TestProbe import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile.Reader import org.apache.hadoop.io.{SequenceFile, Text} import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.prop.PropertyChecks import org.scalatest.{BeforeAndAfter, Matchers, PropSpec} import org.apache.gearpump.Message import org.apache.gearpump.cluster.{TestUtil, UserConfig} import org.apache.gearpump.streaming.task.TaskId import org.apache.gearpump.streaming.{MockUtil, Processor} class SeqFileStreamProcessorSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfter { val kvPairs = new ArrayBuffer[(String, String)] val outputDirectory = "SeqFileStreamProcessor_Test" val sequenceFilePath = new Path(outputDirectory + File.separator + TaskId(0, 0)) val hadoopConf = new Configuration() val fs = FileSystem.get(hadoopConf) val textClass = new Text().getClass val _key = new Text() val _value = new Text() val kvGenerator = for { key <- Gen.alphaStr value <- Gen.alphaStr } yield (key, value) before { implicit val system1 = ActorSystem("SeqFileStreamProcessor", TestUtil.DEFAULT_CONFIG) val system2 = ActorSystem("Reporter", TestUtil.DEFAULT_CONFIG) val watcher = TestProbe()(system1) val conf = HadoopConfig(UserConfig.empty.withString(SeqFileStreamProcessor.OUTPUT_PATH, outputDirectory)).withHadoopConf(new Configuration()) val context = MockUtil.mockTaskContext val processorDescription = Processor.ProcessorToProcessorDescription(id = 0, Processor[SeqFileStreamProcessor](1)) val taskId = TaskId(0, 0) when(context.taskId).thenReturn(taskId) val processor = new SeqFileStreamProcessor(context, conf) processor.onStart(Instant.EPOCH) forAll(kvGenerator) { kv => val (key, value) = kv kvPairs.append((key, value)) processor.onNext(Message(key + "++" + value)) } processor.onStop() } property("SeqFileStreamProcessor should write the key-value pairs to a sequence file") { val reader = new SequenceFile.Reader(hadoopConf, Reader.file(sequenceFilePath)) kvPairs.foreach { kv => val (key, value) = kv if (value.length > 0 && reader.next(_key, _value)) { assert(_key.toString == key && _value.toString == value) } } reader.close() } after { fs.deleteOnExit(new Path(outputDirectory)) } }
Example 106
Source File: SeqFileStreamProducerSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.time.Instant import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile.Writer import org.apache.hadoop.io.{SequenceFile, Text} import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.prop.PropertyChecks import org.scalatest.{BeforeAndAfter, Matchers, PropSpec} import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.MockUtil import org.apache.gearpump.streaming.MockUtil._ class SeqFileStreamProducerSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfter { val kvPairs = new ArrayBuffer[(String, String)] val inputFile = "SeqFileStreamProducer_Test" val sequenceFilePath = new Path(inputFile) val hadoopConf = new Configuration() val fs = FileSystem.get(hadoopConf) val textClass = new Text().getClass val _key = new Text() val _value = new Text() val kvGenerator = for { key <- Gen.alphaStr value <- Gen.alphaStr } yield (key, value) before { fs.deleteOnExit(sequenceFilePath) val writer = SequenceFile.createWriter(hadoopConf, Writer.file(sequenceFilePath), Writer.keyClass(textClass), Writer.valueClass(textClass)) forAll(kvGenerator) { kv => _key.set(kv._1) _value.set(kv._2) kvPairs.append((kv._1, kv._2)) writer.append(_key, _value) } writer.close() } property("SeqFileStreamProducer should read the key-value pairs from " + "a sequence file and deliver them") { val conf = HadoopConfig(UserConfig.empty.withString(SeqFileStreamProducer.INPUT_PATH, inputFile)).withHadoopConf(new Configuration()) val context = MockUtil.mockTaskContext val producer = new SeqFileStreamProducer(context, conf) producer.onStart(Instant.EPOCH) producer.onNext(Message("start")) val expected = kvPairs.map(kv => kv._1 + "++" + kv._2).toSet verify(context).output(argMatch[Message](msg => expected.contains(msg.value.asInstanceOf[String]))) } after { fs.deleteOnExit(sequenceFilePath) } }
Example 107
Source File: HDFSClusterTest.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import java.io.{ BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter} import com.holdenkarau.spark.testing.{RDDComparisons, SharedSparkContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.rdd.RDD import org.scalatest.FunSuite class HDFSClusterTest extends FunSuite with SharedSparkContext with RDDComparisons { var hdfsCluster: HDFSCluster = null override def beforeAll(): Unit = { super.beforeAll() hdfsCluster = new HDFSCluster hdfsCluster.startHDFS() } test("get the namenode uri") { val nameNodeURI = hdfsCluster.getNameNodeURI() assert(nameNodeURI == "hdfs://localhost:8020") } test("read and write from spark to hdfs") { val list = List(1, 2, 3, 4, 5) val numRDD: RDD[Int] = sc.parallelize(list) val path = hdfsCluster.getNameNodeURI() + "/myRDD" numRDD.saveAsTextFile(path) val loadedRDD: RDD[Int] = sc.textFile(path).map(_.toInt) assertRDDEquals(numRDD, loadedRDD) } test("test creating local file to hdfs") { val path = new Path(hdfsCluster.getNameNodeURI() + "/myfile") val fs = FileSystem.get(path.toUri, new Configuration()) val writer = new BufferedWriter(new OutputStreamWriter(fs.create(path))) val writtenString = "hello, it's me" writer.write(writtenString) writer.close() val reader = new BufferedReader(new InputStreamReader(fs.open(path))) val readString = reader.readLine() reader.close() assert(writtenString == readString) } override def afterAll() { hdfsCluster.shutdownHDFS() super.afterAll() } }
Example 108
Source File: StreamMetadata.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: FSDataOutputStream = null try { val fs = FileSystem.get(hadoopConf) output = fs.create(metadataFile) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case NonFatal(e) => logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } finally { IOUtils.closeQuietly(output) } } }
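The write helper above follows a common pattern: fs.create, wrap the stream in a writer, and make sure the underlying output is released even if serialization fails. A simplified sketch that writes a plain string instead of json4s output; the object name, content and target path are invented:

import java.io.OutputStreamWriter
import java.nio.charset.StandardCharsets
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}

object WriteMetadataFile {
  def write(content: String, file: Path, conf: Configuration): Unit = {
    val fs = FileSystem.get(conf)
    var output: FSDataOutputStream = null
    try {
      output = fs.create(file)
      val writer = new OutputStreamWriter(output, StandardCharsets.UTF_8)
      writer.write(content)
      writer.close() // flushes and closes the wrapped FSDataOutputStream as well
    } finally {
      IOUtils.closeQuietly(output) // releases the stream if create() or the write failed
    }
  }

  def main(args: Array[String]): Unit =
    write("""{"id":"demo","name":"example"}""", new Path("/tmp/metadata"), new Configuration())
}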
Example 109
Source File: HDFSCredentialProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).flatMap { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val hdfsToken = creds.getAllTokens.asScala .find(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) hdfsToken.map { t => val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 110
Source File: ExecutorSource.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.executor import java.util.concurrent.ThreadPoolExecutor import scala.collection.JavaConverters._ import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.hadoop.fs.FileSystem import org.apache.spark.metrics.source.Source private[spark] class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source { private def fileStats(scheme: String) : Option[FileSystem.Statistics] = FileSystem.getAllStatistics.asScala.find(s => s.getScheme.equals(scheme)) private def registerFileSystemStat[T]( scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = { metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] { override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue) }) } override val metricRegistry = new MetricRegistry() override val sourceName = "executor" // Gauge for executor thread pool's actively executing task counts metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] { override def getValue: Int = threadPool.getActiveCount() }) // Gauge for executor thread pool's approximate total number of tasks that have been completed metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] { override def getValue: Long = threadPool.getCompletedTaskCount() }) // Gauge for executor thread pool's current number of threads metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] { override def getValue: Int = threadPool.getPoolSize() }) // Gauge got executor thread pool's largest number of threads that have ever simultaneously // been in th pool metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] { override def getValue: Int = threadPool.getMaximumPoolSize() }) // Gauge for file system stats of this executor for (scheme <- Array("hdfs", "file")) { registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L) registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L) registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0) registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0) registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0) } }
Example 111
Source File: SimrSchedulerBackend.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl import org.apache.spark.util.AkkaUtils private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = AkkaUtils.address( AkkaUtils.protocol(actorSystem), SparkEnv.driverActorSystemName, sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port"), CoarseGrainedSchedulerBackend.ACTOR_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) fs.delete(new Path(driverFilePath), false) super.stop() } }
Example 112
Source File: ExecutorSource.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.executor import scala.collection.JavaConversions._ import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.hadoop.fs.FileSystem import org.apache.spark.metrics.source.Source private[spark] class ExecutorSource(val executor: Executor, executorId: String) extends Source { private def fileStats(scheme: String) : Option[FileSystem.Statistics] = FileSystem.getAllStatistics().filter(s => s.getScheme.equals(scheme)).headOption private def registerFileSystemStat[T]( scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = { metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] { override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue) }) } override val metricRegistry = new MetricRegistry() override val sourceName = "executor" // Gauge for executor thread pool's actively executing task counts metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] { override def getValue: Int = executor.threadPool.getActiveCount() }) // Gauge for executor thread pool's approximate total number of tasks that have been completed metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] { override def getValue: Long = executor.threadPool.getCompletedTaskCount() }) // Gauge for executor thread pool's current number of threads metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] { override def getValue: Int = executor.threadPool.getPoolSize() }) // Gauge got executor thread pool's largest number of threads that have ever simultaneously // been in th pool metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] { override def getValue: Int = executor.threadPool.getMaximumPoolSize() }) // Gauge for file system stats of this executor for (scheme <- Array("hdfs", "file")) { registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L) registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L) registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0) registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0) registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0) } }
Example 113
Source File: SessionDataFileHDFSWriter.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream.sessionization import java.io.BufferedWriter import java.io.FileWriter import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.conf.Configuration import java.io.OutputStreamWriter import org.apache.hadoop.fs.Path import java.util.Random object SessionDataFileHDFSWriter { val eol = System.getProperty("line.separator"); def main(args: Array[String]) { if (args.length == 0) { println("SessionDataFileWriter {tempDir} {distDir} {numberOfFiles} {numberOfEventsPerFile} {waitBetweenFiles}"); return; } val conf = new Configuration conf.addResource(new Path("/etc/hadoop/conf/core-site.xml")) conf.addResource(new Path("/etc/hadoop/conf/mapred-site.xml")) conf.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")) val fs = FileSystem.get(new Configuration) val rootTempDir = args(0) val rootDistDir = args(1) val files = args(2).toInt val loops = args(3).toInt val waitBetweenFiles = args(4).toInt val r = new Random for (f <- 1 to files) { val rootName = "/weblog." + System.currentTimeMillis() val tmpPath = new Path(rootTempDir + rootName + ".tmp") val writer = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath))) print(f + ": [") val randomLoops = loops + r.nextInt(loops) for (i <- 1 to randomLoops) { writer.write(SessionDataGenerator.getNextEvent + eol) if (i%100 == 0) { print(".") } } println("]") writer.close val distPath = new Path(rootDistDir + rootName + ".dat") fs.rename(tmpPath, distPath) Thread.sleep(waitBetweenFiles) } println("Done") } }
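The generator above relies on the classic write-then-rename trick: produce the file under a temporary name and only fs.rename it into the watched directory once it is complete, so a streaming consumer never picks up a half-written file. A minimal sketch of that publish step with placeholder directory names:

import java.io.{BufferedWriter, OutputStreamWriter}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object PublishAtomically {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val fs = FileSystem.get(conf)
    val tmp = new Path("/tmp/incoming/.weblog.12345.tmp") // hidden/temporary name
    val dst = new Path("/tmp/incoming/weblog.12345.dat")  // name the consumer watches for

    val writer = new BufferedWriter(new OutputStreamWriter(fs.create(tmp)))
    try (1 to 100).foreach(i => writer.write(s"event $i\n")) finally writer.close()

    // Rename is a metadata-only operation on HDFS, so the file appears "all at once"
    if (!fs.rename(tmp, dst)) sys.error(s"could not rename $tmp to $dst")
  }
}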
Example 114
Source File: spark_algo.scala From mllib_subpackage with Apache License 2.0 | 5 votes |
import org.apache.commons.cli.{Options, PosixParser} import org.apache.spark.SparkContext import org.apache.spark.SparkConf //import org.apache.hadoop.fs import java.util.Date import java.util.Calendar import org.apache.hadoop.fs.FileSystem //import sun.management.FileSystem object spark_algo { def main(args: Array[String]) { // Input Params val parser = new PosixParser( ) val options = new Options( ) options.addOption("a", "algo", true, "algo type; 10. sgd 11. lbfgs") val cl = parser.parse( options, args, true ) val algo = cl.getOptionValue("algo") val conf = new SparkConf() val sc = new SparkContext(conf) sc.getConf.getAll.foreach(println) val configuration = sc.hadoopConfiguration configuration.setBoolean("mapreduce.output.fileoutputformat.compress", false) val fs = FileSystem.get(configuration) val modeltmp = if(algo=="10" || algo=="11" || algo=="12" || algo=="13") { new mllib_lr(sc, fs, args) } else if(algo=="21") { new ftrl(sc, fs, args) } else if(algo=="22") { new ftrl_batch(sc, fs, args) } else if(algo=="31") { new relative(sc, fs, args) } else if(algo=="40") { new mllib_gbdt(sc, fs, args) } else if(algo=="41") { new lambda_mart(sc, fs, args) } else if(algo=="91") { new feature_analyse(sc, fs, args) } else if(algo=="docs_words_analyse") { new docs_words_analyse(sc, fs, args) } val model = modeltmp.asInstanceOf[malgo] model.deal() } }
Example 115
Source File: GzipDecompressor.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.algo import java.util.concurrent.{Executors, TimeUnit} import com.adidas.analytics.algo.GzipDecompressor.{changeFileExtension, compressedExtension, _} import com.adidas.analytics.algo.core.JobRunner import com.adidas.analytics.config.GzipDecompressorConfiguration import com.adidas.analytics.util.DFSWrapper import com.adidas.analytics.util.DFSWrapper._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.IOUtils import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.spark.sql.SparkSession import org.slf4j.{Logger, LoggerFactory} import scala.concurrent._ import scala.concurrent.duration._ final class GzipDecompressor protected(val spark: SparkSession, val dfs: DFSWrapper, val configLocation: String) extends JobRunner with GzipDecompressorConfiguration { private val hadoopConfiguration: Configuration = spark.sparkContext.hadoopConfiguration private val fileSystem: FileSystem = dfs.getFileSystem(inputDirectoryPath) override def run(): Unit = { //check if directory exists if (!fileSystem.exists(inputDirectoryPath)){ logger.error(s"Input directory: $inputDirectoryPath does not exist.") throw new RuntimeException(s"Directory $inputDirectoryPath does not exist.") } val compressedFilePaths = fileSystem.ls(inputDirectoryPath, recursive) .filterNot(path => fileSystem.isDirectory(path)) .filter(_.getName.toLowerCase.endsWith(compressedExtension)) if (compressedFilePaths.isEmpty) { logger.warn(s"Input directory $inputDirectoryPath does not contain compressed files. Skipping...") } else { implicit val ec: ExecutionContext = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(threadPoolSize)) Await.result(Future.sequence( compressedFilePaths.map { compressedFilePath => Future { logger.info(s"Decompressing file: $compressedFilePath") val decompressedFileName = changeFileExtension(compressedFilePath.getName, compressedExtension, outputExtension) val decompressedFilePath = new Path(compressedFilePath.getParent, decompressedFileName) val compressionCodecFactory = new CompressionCodecFactory(hadoopConfiguration) val inputCodec = compressionCodecFactory.getCodec(compressedFilePath) val inputStream = inputCodec.createInputStream(fileSystem.open(compressedFilePath)) val output = fileSystem.create(decompressedFilePath) IOUtils.copyBytes(inputStream, output, hadoopConfiguration) logger.info(s"Finished decompressing file: $compressedFilePath") //Delete the compressed file fileSystem.delete(compressedFilePath, false) logger.info(s"Removed file: $compressedFilePath") } } ), Duration(4, TimeUnit.HOURS)) } } } object GzipDecompressor { private val logger: Logger = LoggerFactory.getLogger(this.getClass) private val compressedExtension: String = ".gz" def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): GzipDecompressor = { new GzipDecompressor(spark, dfs, configLocation) } private def changeFileExtension(fileName: String, currentExt: String, newExt: String): String = { val newFileName = fileName.substring(0, fileName.lastIndexOf(currentExt)) if (newFileName.endsWith(newExt)) newFileName else newFileName + newExt } }
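Stripped of the thread pool, the per-file work above is: look up the codec for the source path, open a decompressing stream, create the target, and move the bytes with IOUtils.copyBytes. A single-file sketch; the paths are examples and error handling is kept minimal:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.IOUtils
import org.apache.hadoop.io.compress.CompressionCodecFactory

object DecompressOne {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val src = new Path("/landing/events.csv.gz") // hypothetical compressed input
    val dst = new Path("/landing/events.csv")
    val fs = FileSystem.get(conf)

    val codec = new CompressionCodecFactory(conf).getCodec(src) // resolved from the .gz suffix
    require(codec != null, s"no compression codec found for $src")

    val in = codec.createInputStream(fs.open(src))
    val out = fs.create(dst)
    IOUtils.copyBytes(in, out, conf) // streams the bytes and closes both ends when done
    fs.delete(src, false)            // drop the compressed original, as the job above does
  }
}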
Example 116
Source File: HDFSSupport.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.utils import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{CommonConfigurationKeysPublic, FileSystem} import org.apache.hadoop.hdfs.MiniDFSCluster import org.slf4j.Logger trait HDFSSupport { private lazy val defaultDataNodesNum: Int = 2 private lazy val defaultPort: Int = 8201 lazy val cluster: MiniDFSCluster = startHDFS(clusterHdfsConf) lazy val fs: FileSystem = cluster.getFileSystem() def logger: Logger def testAppId: String def localTestDir: String def clusterHdfsConf: Option[Configuration] = Option.empty def startHDFS(hadoopConf: Option[Configuration]): MiniDFSCluster = { val appDir = new File(localTestDir, testAppId) val hdfsTestDir = new File(appDir, "hdfs").getAbsoluteFile hdfsTestDir.mkdirs() val clusterConf = hadoopConf.fold(new Configuration())(c => new Configuration(c)) clusterConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsTestDir.getAbsolutePath) clusterConf.set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, s"hdfs://localhost:$defaultPort/") logger.info(s"Starting test DFS cluster with base directory at ${hdfsTestDir.getAbsolutePath} ...") new MiniDFSCluster.Builder(clusterConf) .numDataNodes(defaultDataNodesNum) .nameNodePort(defaultPort) .format(true) .build() } }
Example 117
Source File: HdfsUtils.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.utils import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import scala.collection.mutable object HdfsUtils { def renameFiles(fromBase: Path, toBase: Path, fs: FileSystem) = { if (fs.exists(fromBase)) { val filesToMove = listFiles(fromBase, fs) println("files to move:") filesToMove foreach (p => println("+++" + p.toString)) filesToMove foreach { file => val relPath = relativize(fromBase, file) val toPath = new Path(toBase, relPath) fs.mkdirs(toPath.getParent) fs.rename(file, toPath) println(" file renamed to: " + toPath.toString) } } } def relativize(base: Path, files: List[Path]) = { files map (file => new Path(base.toUri.relativize(file.toUri).getPath)) } def relativize(base: Path, file: Path): Path = { new Path(base.toUri.relativize(file.toUri).getPath) } def listFiles(path: Path, fs: FileSystem): List[Path] = { val statusList = mutable.MutableList[FileStatus]() traverse(path, statusList, fs) statusList.map(status => new Path(status.getPath.toUri.getPath)).toList } private def traverse(path: Path, list: mutable.MutableList[FileStatus], fs: FileSystem): Unit = { fs.listStatus(path) foreach { status => if (!status.isDirectory) { list += status } else { traverse(status.getPath, list, fs) } } } }
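The utilities above build a recursive file listing by walking listStatus and then re-anchor each file under a new base via URI relativize before fs.rename. The sketch below does the same walk with FileSystem.listFiles, which iterates the subtree directly; paths and the object name are illustrative:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path, RemoteIterator}
import scala.collection.mutable.ListBuffer

object MoveTree {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())
    val fromBase = new Path("/staging/run-42") // hypothetical source tree
    val toBase = new Path("/published/run-42")

    // listFiles(recursive = true) walks the whole subtree and yields only files
    val files = ListBuffer.empty[Path]
    val it: RemoteIterator[LocatedFileStatus] = fs.listFiles(fromBase, true)
    while (it.hasNext) files += it.next().getPath

    files.foreach { file =>
      val rel = fromBase.toUri.relativize(file.toUri).getPath // path relative to fromBase
      val target = new Path(toBase, rel)
      fs.mkdirs(target.getParent)
      fs.rename(file, target)
    }
  }
}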
Example 118
Source File: TimePartitioningWriter.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.writers import java.io.IOException import com.typesafe.config.Config import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.joda.time.format.DateTimeFormat import org.slf4j.LoggerFactory import yamrcraft.etlite.EtlException import yamrcraft.etlite.transformers.Message import yamrcraft.etlite.utils.ConfigConversions._ import scala.collection.mutable class TimePartitioningWriter[T](config: Config, jobId: Long, partitionId: Int, writerFactory: (String, String) => Writer[T]) extends Writer[Message[T]] { val logger = LoggerFactory.getLogger(this.getClass) // config settings val workingFolder: String = config.getString("working-folder") val outputFolder: String = config.getString("output-folder") val partitionPattern: String = config.getString("partition.pattern") val folderMapping: Map[String, String] = config.getConfig("record-name-to-folder-mapping").asMap val fs = FileSystem.get(new Configuration()) val partitionFormat = DateTimeFormat.forPattern(partitionPattern) val partitionsWriters = mutable.Map[String, Writer[T]]() @throws(classOf[EtlException]) @throws(classOf[IOException]) override def write(event: Message[T]): Unit = { val timestamp = event.msgTimestamp val baseFolder = folderMapping.getOrElse(event.msgType, event.msgType) val writer = writerFor(baseFolder, timestamp) writer.write(event.msg) } override def commit() = { // close all writers partitionsWriters foreach { case (file, writer) => writer.commit() } } @throws(classOf[EtlException]) private def writerFor(baseFolder: String, timestamp: Long): Writer[T] = { val relativeFileName = new Path(s"$baseFolder/${partitionFormat.print(timestamp)}/events_${baseFolder}_job${jobId}_part$partitionId") val tempFile = new Path(workingFolder, relativeFileName) val outputFile = new Path(outputFolder, relativeFileName) partitionsWriters.getOrElseUpdate(tempFile.toString, writerFactory(tempFile.toString, outputFile.toString)) } }
Example 119
Source File: BigQueryDataFrame.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery import com.google.api.services.bigquery.model.{TableReference, TableSchema} import com.google.cloud.hadoop.io.bigquery._ import com.google.gson._ import com.samelamin.spark.bigquery.converters.{BigQueryAdapter, SchemaConverters} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{LongWritable, NullWritable} import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat import org.apache.spark.sql.DataFrame import org.slf4j.LoggerFactory import scala.util.Random def saveAsBigQueryTable(fullyQualifiedOutputTableId: String, isPartitionedByDay: Boolean = false, timePartitionExpiration: Long = 0, writeDisposition: WriteDisposition.Value = null, createDisposition: CreateDisposition.Value = null): Unit = { val destinationTable = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId) val bigQuerySchema = SchemaConverters.SqlToBQSchema(adaptedDf) val gcsPath = writeDFToGoogleStorage(adaptedDf,destinationTable,bigQuerySchema) bq.load(destinationTable, bigQuerySchema, gcsPath, isPartitionedByDay, timePartitionExpiration, writeDisposition, createDisposition) delete(new Path(gcsPath)) } def writeDFToGoogleStorage(adaptedDf: DataFrame, destinationTable: TableReference, bqSchema: TableSchema): String = { val tableName = BigQueryStrings.toString(destinationTable) BigQueryConfiguration.configureBigQueryOutput(hadoopConf, tableName, bqSchema.toPrettyString()) hadoopConf.set("mapreduce.job.outputformat.class", classOf[BigQueryOutputFormat[_, _]].getName) val bucket = self.sparkSession.conf.get(BigQueryConfiguration.GCS_BUCKET_KEY) val temp = s"spark-bigquery-${System.currentTimeMillis()}=${Random.nextInt(Int.MaxValue)}" val gcsPath = s"gs://$bucket/hadoop/tmp/spark-bigquery/$temp" if(hadoopConf.get(BigQueryConfiguration.TEMP_GCS_PATH_KEY) == null) { hadoopConf.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, gcsPath) } logger.info(s"Loading $gcsPath into $tableName") adaptedDf .toJSON .rdd .map(json => (null, jsonParser.parse(json))) .saveAsNewAPIHadoopFile(gcsPath, classOf[GsonBigQueryInputFormat], classOf[LongWritable], classOf[TextOutputFormat[NullWritable, JsonObject]], hadoopConf) gcsPath } private def delete(path: Path): Unit = { val fs = FileSystem.get(path.toUri, hadoopConf) fs.delete(path, true) } }
Example 120
Source File: SentencePieceWrapper.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.ml.tensorflow.sentencepiece import java.io.File import java.nio.file.{Files, Paths} import java.util.UUID import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession class SentencePieceWrapper( var sppModel: Array[Byte] ) extends Serializable { @transient private var mspp: SentencePieceProcessor = _ def getSppModel: SentencePieceProcessor = { if (mspp == null){ val spp = new SentencePieceProcessor() spp.loadFromSerializedProto(sppModel) mspp = spp } mspp } } object SentencePieceWrapper { def read( path: String ): SentencePieceWrapper = { val byteArray = Files.readAllBytes(Paths.get(path)) val sppWrapper = new SentencePieceWrapper(byteArray) val spp = new SentencePieceProcessor() spp.loadFromSerializedProto(byteArray) sppWrapper.mspp = spp sppWrapper } } trait WriteSentencePieceModel { def writeSentencePieceModel( path: String, spark: SparkSession, spp: SentencePieceWrapper, suffix: String, filename:String ): Unit = { val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) // 1. Create tmp folder val tmpFolder = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12) + suffix) .toAbsolutePath.toString val sppFile = Paths.get(tmpFolder, filename).toString // 2. Save Tensorflow state FileUtils.writeByteArrayToFile(new File(sppFile), spp.sppModel) // 3. Copy to dest folder fs.copyFromLocalFile(new Path(sppFile), new Path(path)) // 4. Remove tmp folder FileUtils.deleteDirectory(new File(tmpFolder)) } } trait ReadSentencePieceModel { val sppFile: String def readSentencePieceModel( path: String, spark: SparkSession, suffix: String ): SentencePieceWrapper = { val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) // 1. Create tmp directory val tmpFolder = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12)+ suffix) .toAbsolutePath.toString // 2. Copy to local dir fs.copyToLocalFile(new Path(path, sppFile), new Path(tmpFolder)) val sppModelFilePath = new Path(tmpFolder, sppFile) val byteArray = Files.readAllBytes(Paths.get(sppModelFilePath.toString)) val sppWrapper = new SentencePieceWrapper(byteArray) sppWrapper } }
Example 121
Source File: NerDLPythonReader.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.ner.dl import java.nio.file.{Files, Paths} import java.util.UUID import com.johnsnowlabs.ml.tensorflow.{DatasetEncoderParams, NerDatasetEncoder, TensorflowNer, TensorflowWrapper} import com.johnsnowlabs.nlp.annotators.ner.Verbose import com.johnsnowlabs.storage.{RocksDBConnection, StorageHelper} import com.johnsnowlabs.util.FileHelper import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession import scala.io.Source object NerDLModelPythonReader { val embeddingsMetaFile = "embeddings.meta" val embeddingsFile = "embeddings" val tagsFile = "tags.csv" val charsFile = "chars.csv" private def readTags(folder: String): List[String] = { Source.fromFile(Paths.get(folder, tagsFile).toString).getLines().toList } private def readChars(folder: String): List[Char] = { val lines = Source.fromFile(Paths.get(folder, charsFile).toString).getLines() lines.toList.head.toCharArray.toList } private def readEmbeddingsHead(folder: String): Int = { val metaFile = Paths.get(folder, embeddingsMetaFile).toString Source.fromFile(metaFile).getLines().toList.head.toInt } private def readEmbeddings( folder: String, spark: SparkSession, embeddingsDim: Int, normalize: Boolean ): RocksDBConnection = { StorageHelper.load( Paths.get(folder, embeddingsFile).toString, spark, "python_tf_model", "python_tf_ref", false ) } def readLocal(folder: String, dim: Int, useBundle: Boolean = false, verbose: Verbose.Level = Verbose.All, tags: Array[String] = Array.empty[String]): TensorflowNer = { val labels = readTags(folder) val chars = readChars(folder) val settings = DatasetEncoderParams(labels, chars, Array.fill(dim)(0f).toList, dim) val encoder = new NerDatasetEncoder(settings) val tf = TensorflowWrapper.read(folder, zipped=false, useBundle, tags) new TensorflowNer(tf, encoder, 32, verbose) } def read( folder: String, dim: Int, spark: SparkSession, useBundle: Boolean = false, tags: Array[String] = Array.empty[String]): NerDLModel = { val uri = new java.net.URI(folder.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) val tmpFolder = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12) + "_bundle") .toAbsolutePath.toString fs.copyToLocalFile(new Path(folder), new Path(tmpFolder)) val nerModel = readLocal(tmpFolder, dim, useBundle, tags = tags) FileHelper.delete(tmpFolder) new NerDLModel() .setModelIfNotSet(spark, nerModel.tensorflow) .setDatasetParams(nerModel.encoder.params) } }
Example 122
Source File: StorageHelper.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.storage import java.io.File import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} import org.apache.spark.{SparkContext, SparkFiles} import org.apache.spark.sql.SparkSession object StorageHelper { def resolveStorageName(database: String, storageRef: String): String = new Path(database + "_" + storageRef).toString def load( storageSourcePath: String, spark: SparkSession, database: String, storageRef: String, withinStorage: Boolean ): RocksDBConnection = { val dbFolder = StorageHelper.resolveStorageName(database.toString, storageRef) val src = StorageLocator.getStorageSerializedPath(storageSourcePath.replaceAllLiterally("\\", "/"), dbFolder, withinStorage) val locator = StorageLocator(database, storageRef, spark) sendToCluster(src, locator.clusterFilePath, locator.clusterFileName, locator.destinationScheme, spark.sparkContext) RocksDBConnection.getOrCreate(locator.clusterFileName) } def save(path: String, connection: RocksDBConnection, spark: SparkSession, withinStorage: Boolean): Unit = { val indexUri = "file://"+(new java.net.URI(connection.findLocalIndex.replaceAllLiterally("\\", "/")).getPath) val index = new Path(indexUri) val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) val dst = new Path(path+{if (withinStorage) "/storage/" else ""}) save(fs, index, dst) } private def save(fs: FileSystem, index: Path, dst: Path): Unit = { if (!fs.exists(dst)) fs.mkdirs(dst) fs.copyFromLocalFile(false, true, index, dst) } def sendToCluster(source: Path, clusterFilePath: Path, clusterFileName: String, destinationScheme: String, sparkContext: SparkContext): Unit = { if (destinationScheme == "file") { copyIndexToLocal(source, new Path(RocksDBConnection.getLocalPath(clusterFileName)), sparkContext) } else { copyIndexToCluster(source, clusterFilePath, sparkContext) } } private def copyIndexToCluster(sourcePath: Path, dst: Path, spark: SparkContext): String = { if (!new File(SparkFiles.get(dst.getName)).exists()) { val srcFS = sourcePath.getFileSystem(spark.hadoopConfiguration) val dstFS = dst.getFileSystem(spark.hadoopConfiguration) if (srcFS.getScheme == "file") { val src = sourcePath dstFS.copyFromLocalFile(false, true, src, dst) } else { FileUtil.copy(srcFS, sourcePath, dstFS, dst, false, true, spark.hadoopConfiguration) } spark.addFile(dst.toString, recursive = true) } dst.toString } private def copyIndexToLocal(source: Path, destination: Path, context: SparkContext): Unit = { val fs = source.getFileSystem(context.hadoopConfiguration) if (!fs.exists(destination)) fs.copyFromLocalFile(false, true, source, destination) } }
Example 123
Source File: StorageLocator.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.storage import java.util.UUID import com.johnsnowlabs.util.ConfigHelper import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession case class StorageLocator(database: String, storageRef: String, sparkSession: SparkSession) { private val fs = FileSystem.get(sparkSession.sparkContext.hadoopConfiguration) private val clusterTmpLocation: String = { val tmpLocation = ConfigHelper.getConfigValue(ConfigHelper.storageTmpDir).map(p => new Path(p)).getOrElse( sparkSession.sparkContext.hadoopConfiguration.get("hadoop.tmp.dir") ).toString+"/"+UUID.randomUUID().toString.takeRight(12)+"_cdx" val tmpLocationPath = new Path(tmpLocation) fs.mkdirs(tmpLocationPath) fs.deleteOnExit(tmpLocationPath) tmpLocation } val clusterFileName: String = { StorageHelper.resolveStorageName(database, storageRef) } val clusterFilePath: Path = { Path.mergePaths(new Path(fs.getUri.toString + clusterTmpLocation), new Path("/"+clusterFileName)) } val destinationScheme: String = { fs.getScheme } } object StorageLocator { def getStorageSerializedPath(path: String, folder: String, withinStorage: Boolean): Path = Path.mergePaths(new Path(path), new Path((if (withinStorage) "/storage/" else "/")+folder)) }
Example 124
Source File: HadoopFileSystemLogStore.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.storage import java.io.{BufferedReader, FileNotFoundException, InputStreamReader} import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.FileAlreadyExistsException import java.util.UUID import scala.collection.JavaConverters._ import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession protected def writeWithRename( path: Path, actions: Iterator[String], overwrite: Boolean = false): Unit = { val fs = path.getFileSystem(getHadoopConfiguration) if (!fs.exists(path.getParent)) { throw new FileNotFoundException(s"No such file or directory: ${path.getParent}") } if (overwrite) { val stream = fs.create(path, true) try { actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write) } finally { stream.close() } } else { if (fs.exists(path)) { throw new FileAlreadyExistsException(path.toString) } val tempPath = createTempPath(path) var streamClosed = false // This flag is to avoid double close var renameDone = false // This flag is to save the delete operation in most of cases. val stream = fs.create(tempPath) try { actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write) stream.close() streamClosed = true try { if (fs.rename(tempPath, path)) { renameDone = true } else { if (fs.exists(path)) { throw new FileAlreadyExistsException(path.toString) } else { throw new IllegalStateException(s"Cannot rename $tempPath to $path") } } } catch { case _: org.apache.hadoop.fs.FileAlreadyExistsException => throw new FileAlreadyExistsException(path.toString) } } finally { if (!streamClosed) { stream.close() } if (!renameDone) { fs.delete(tempPath, false) } } } } protected def createTempPath(path: Path): Path = { new Path(path.getParent, s".${path.getName}.${UUID.randomUUID}.tmp") } override def invalidateCache(): Unit = {} }
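The write-then-rename idea used by writeWithRename can be reproduced in isolation. The sketch below is not Delta's implementation, just a minimal standalone illustration of the same pattern (the object and paths are hypothetical): write to a hidden temp file next to the target, close the stream, then use rename as the commit step and clean up the temp file if the rename fails.

import java.nio.charset.StandardCharsets.UTF_8
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object RenameBasedWrite {

  def writeAtomically(fs: FileSystem, target: Path, lines: Seq[String]): Unit = {
    // hidden temp file in the same directory so the rename stays within one filesystem
    val temp = new Path(target.getParent, s".${target.getName}.${UUID.randomUUID}.tmp")
    val out = fs.create(temp)
    try lines.foreach(l => out.write((l + "\n").getBytes(UTF_8)))
    finally out.close()

    // the rename is the "commit"; on failure, remove the temp file and surface the error
    if (!fs.rename(temp, target)) {
      fs.delete(temp, false)
      throw new IllegalStateException(s"Could not rename $temp to $target")
    }
  }

  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())
    writeAtomically(fs, new Path("/tmp/demo/_commit"), Seq("action1", "action2"))   // hypothetical path
  }
}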
Example 125
Source File: Job.scala From spark-avro-compactor with Apache License 2.0 | 5 votes |
package ie.ianduffy.spark.avro.compactor

import ie.ianduffy.spark.avro.compactor.Utils._
import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyOutputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.NullWritable
import org.apache.spark.sql.SparkSession
import org.slf4j.LoggerFactory

object Job {

  private val log = LoggerFactory.getLogger(Job.getClass.getName.replace("$", ""))

  def run(spark: SparkSession, schemaRegistry: SchemaRegistryClient, jobConfig: JobConfig): Unit = {
    val schema: Schema = {
      val latestSchemaMetadata: SchemaMetadata = schemaRegistry.getLatestSchemaMetadata(jobConfig.schemaRegistrySubject)
      val id: Int = latestSchemaMetadata.getId
      schemaRegistry.getById(id)
    }

    implicit val sparkConfig: Configuration = spark.sparkContext.hadoopConfiguration
    sparkConfig.set("avro.schema.input.key", schema.toString())
    sparkConfig.set("avro.schema.output.key", schema.toString())

    val inputPath: Path = new Path(jobConfig.input)
    val outputPath: Path = new Path(jobConfig.output)
    val fs: FileSystem = inputPath.getFileSystem(sparkConfig)

    // avoid raising org.apache.hadoop.mapred.FileAlreadyExistsException
    if (jobConfig.overrideOutput) fs.delete(outputPath, true)

    // for paths with the s3 scheme the default block size is 64MB and can be overridden via fs.s3.block.size;
    // for paths with the s3a scheme the default is 32MB and can be overridden via fs.s3a.block.size
    val outputBlocksize: Long = fs.getDefaultBlockSize(outputPath)

    // where inputPath is of the form s3://some/path
    val inputPathSize: Long = fs.getContentSummary(inputPath).getSpaceConsumed

    val numPartitions: Int = Math.max(1, Math.floor((inputPathSize / CompressionRatio.AVRO_SNAPPY) / outputBlocksize).toInt)

    log.debug(
      s"""outputBlocksize: $outputBlocksize
         | inputPathSize: $inputPathSize
         | numPartitions: $numPartitions
       """.stripMargin)

    val rdd = readHadoopFile(spark, inputPath.toString)

    rdd.coalesce(numPartitions)
      .saveAsNewAPIHadoopFile(
        outputPath.toString,
        classOf[AvroKey[GenericRecord]],
        classOf[NullWritable],
        classOf[AvroKeyOutputFormat[GenericRecord]],
        sparkConfig
      )
  }
}
Example 126
Source File: BinaryFileReader.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark import com.microsoft.ml.spark.core.env.StreamUtilities import com.microsoft.ml.spark.core.schema.BinaryFileSchema import com.microsoft.ml.spark.core.utils.AsyncUtils import org.apache.commons.io.IOUtils import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.spark.binary.BinaryFileFormat import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.binary.ConfUtils import org.apache.spark.sql.types.BinaryType import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration.Duration object BinaryFileReader { private def recursePath(fileSystem: FileSystem, path: Path, pathFilter: FileStatus => Boolean, visitedSymlinks: Set[Path]): Array[Path] ={ val filteredPaths = fileSystem.listStatus(path).filter(pathFilter) val filteredDirs = filteredPaths.filter(fs => fs.isDirectory & !visitedSymlinks(fs.getPath)) val symlinksFound = visitedSymlinks ++ filteredDirs.filter(_.isSymlink).map(_.getPath) filteredPaths.map(_.getPath) ++ filteredDirs.map(_.getPath) .flatMap(p => recursePath(fileSystem, p, pathFilter, symlinksFound)) } def recursePath(fileSystem: FileSystem, path: Path, pathFilter: FileStatus => Boolean): Array[Path] ={ recursePath(fileSystem, path, pathFilter, Set()) } def readFromPaths(df: DataFrame, pathCol: String, bytesCol: String, concurrency: Int, timeout: Int ): DataFrame = { val outputSchema = df.schema.add(bytesCol, BinaryType, nullable = true) val encoder = RowEncoder(outputSchema) val hconf = ConfUtils.getHConf(df) df.mapPartitions { rows => val futures = rows.map {row: Row => Future { val path = new Path(row.getAs[String](pathCol)) val fs = path.getFileSystem(hconf.value) val bytes = StreamUtilities.using(fs.open(path)) {is => IOUtils.toByteArray(is)}.get val ret = Row.merge(Seq(row, Row(bytes)): _*) ret }(ExecutionContext.global) } AsyncUtils.bufferedAwait( futures,concurrency, Duration.fromNanos(timeout*(20^6).toLong))(ExecutionContext.global) }(encoder) } }
Example 127
Source File: IndexedBinaryBlockReader.scala From hail with MIT License | 5 votes |
package is.hail.io import is.hail.annotations.RegionValueBuilder import is.hail.io.fs.{HadoopFS, WrappedSeekableDataInputStream} import org.apache.commons.logging.{Log, LogFactory} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.LongWritable import org.apache.hadoop.mapred._ abstract class KeySerializedValueRecord[K] extends Serializable { var input: Array[Byte] = _ var key: K = _ def setSerializedValue(arr: Array[Byte]) { this.input = arr } def getValue(rvb: RegionValueBuilder, includeGT: Boolean): Unit def setKey(k: K) { this.key = k } def getKey: K = key } abstract class IndexedBinaryBlockReader[T](job: Configuration, split: FileSplit) extends RecordReader[LongWritable, T] { val LOG: Log = LogFactory.getLog(classOf[IndexedBinaryBlockReader[T]].getName) val partitionStart: Long = split.getStart var pos: Long = partitionStart val end: Long = partitionStart + split.getLength val bfis = openFile() def openFile(): HadoopFSDataBinaryReader = { val file: Path = split.getPath val fs: FileSystem = file.getFileSystem(job) val is = fs.open(file) new HadoopFSDataBinaryReader( new WrappedSeekableDataInputStream( HadoopFS.toSeekableInputStream(is))) } def createKey(): LongWritable = new LongWritable() def createValue(): T def getPos: Long = pos def getProgress: Float = { if (partitionStart == end) 0.0f else Math.min(1.0f, (pos - partitionStart) / (end - partitionStart).toFloat) } def close() = bfis.close() }
Example 128
Source File: HadoopFSHelpers.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.api.io.fs

import java.io.{BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter}

import org.apache.hadoop.fs.{FileSystem, Path}
import org.opencypher.morpheus.api.io.util.FileSystemUtils.using

object HadoopFSHelpers {

  implicit class RichHadoopFileSystem(fileSystem: FileSystem) {

    protected def createDirectoryIfNotExists(path: Path): Unit = {
      if (!fileSystem.exists(path)) {
        fileSystem.mkdirs(path)
      }
    }

    def listDirectories(path: String): List[String] = {
      val p = new Path(path)
      createDirectoryIfNotExists(p)
      fileSystem.listStatus(p)
        .filter(_.isDirectory)
        .map(_.getPath.getName)
        .toList
    }

    def deleteDirectory(path: String): Unit = {
      fileSystem.delete(new Path(path), true)
    }

    def readFile(path: String): String = {
      using(new BufferedReader(new InputStreamReader(fileSystem.open(new Path(path)), "UTF-8"))) { reader =>
        def readLines = Stream.cons(reader.readLine(), Stream.continually(reader.readLine))
        readLines.takeWhile(_ != null).mkString
      }
    }

    def writeFile(path: String, content: String): Unit = {
      val p = new Path(path)
      val parentDirectory = p.getParent
      createDirectoryIfNotExists(parentDirectory)
      using(fileSystem.create(p)) { outputStream =>
        using(new BufferedWriter(new OutputStreamWriter(outputStream, "UTF-8"))) { bufferedWriter =>
          bufferedWriter.write(content)
        }
      }
    }
  }
}
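A short usage sketch for the implicit class above, assuming the Morpheus helpers are on the classpath; the path and JSON content are made up for illustration. Importing HadoopFSHelpers._ enriches any FileSystem with readFile, writeFile and listDirectories.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.opencypher.morpheus.api.io.fs.HadoopFSHelpers._

object HadoopFSHelpersUsage {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())

    // write a small text file, creating parent directories as needed
    fs.writeFile("/tmp/graphs/nodes/schema.json", """{"columns": ["id", "name"]}""")   // hypothetical path

    // read it back as a single string and list the directories next to it
    println(fs.readFile("/tmp/graphs/nodes/schema.json"))
    println(fs.listDirectories("/tmp/graphs"))
  }
}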
Example 129
Source File: StreamMetadata.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: FSDataOutputStream = null try { val fs = FileSystem.get(hadoopConf) output = fs.create(metadataFile) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case NonFatal(e) => logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } finally { IOUtils.closeQuietly(output) } } }
Example 130
Source File: ExecutorSource.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.executor

import java.util.concurrent.ThreadPoolExecutor

import scala.collection.JavaConverters._

import com.codahale.metrics.{Gauge, MetricRegistry}
import org.apache.hadoop.fs.FileSystem

import org.apache.spark.metrics.source.Source

private[spark] class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source {

  private def fileStats(scheme: String): Option[FileSystem.Statistics] =
    FileSystem.getAllStatistics.asScala.find(s => s.getScheme.equals(scheme))

  private def registerFileSystemStat[T](
      scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = {
    metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] {
      override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue)
    })
  }

  override val metricRegistry = new MetricRegistry()

  override val sourceName = "executor"

  // Gauge for executor thread pool's actively executing task counts
  metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] {
    override def getValue: Int = threadPool.getActiveCount()
  })

  // Gauge for executor thread pool's approximate total number of tasks that have been completed
  metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] {
    override def getValue: Long = threadPool.getCompletedTaskCount()
  })

  // Gauge for executor thread pool's current number of threads
  metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getPoolSize()
  })

  // Gauge for executor thread pool's largest number of threads that have ever simultaneously
  // been in the pool
  metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getMaximumPoolSize()
  })

  // Gauge for file system stats of this executor
  for (scheme <- Array("hdfs", "file")) {
    registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L)
    registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L)
    registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0)
    registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0)
    registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0)
  }
}
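The gauges above read from FileSystem.Statistics, which can also be inspected directly. A minimal standalone sketch (the object name is hypothetical) that prints the per-scheme I/O counters collected in the current JVM:

import org.apache.hadoop.fs.FileSystem

import scala.collection.JavaConverters._

object FileSystemStatsDump {
  def main(args: Array[String]): Unit = {
    // one Statistics entry per filesystem scheme this JVM has touched (hdfs, file, s3a, ...)
    FileSystem.getAllStatistics.asScala.foreach { s =>
      println(s"${s.getScheme}: readBytes=${s.getBytesRead} writtenBytes=${s.getBytesWritten} readOps=${s.getReadOps}")
    }
  }
}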
Example 131
Source File: EventHistoryReporter.scala From sparklens with Apache License 2.0 | 5 votes |
package com.qubole.sparklens.app import java.io.{BufferedInputStream, InputStream} import java.net.URI import com.ning.compress.lzf.LZFInputStream import com.qubole.sparklens.QuboleJobListener import com.qubole.sparklens.common.Json4sWrapper import com.qubole.sparklens.helper.HDFSConfigHelper import net.jpountz.lz4.LZ4BlockInputStream import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkConf import org.json4s.DefaultFormats import org.xerial.snappy.SnappyInputStream class EventHistoryReporter(file: String, extraConf: List[(String, String)] = List.empty) { // This is using reflection in spark-2.0.0 ReplayListenerBus val busKlass = Class.forName("org.apache.spark.scheduler.ReplayListenerBus") val bus = busKlass.newInstance() val addListenerMethod = busKlass.getMethod("addListener", classOf[Object]) val conf = new SparkConf() .set("spark.sparklens.reporting.disabled", "false") .set("spark.sparklens.save.data", "false") extraConf.foreach(x => { conf.set(x._1, x._2) }) val listener = new QuboleJobListener(conf) addListenerMethod.invoke(bus, listener) try { val replayMethod = busKlass.getMethod("replay", classOf[InputStream], classOf[String], classOf[Boolean]) replayMethod.invoke(bus, getDecodedInputStream(file, conf), file, boolean2Boolean(false)) } catch { case _: NoSuchMethodException => // spark binaries are 2.1* and above val replayMethod = busKlass.getMethod("replay", classOf[InputStream], classOf[String], classOf[Boolean], classOf[String => Boolean]) replayMethod.invoke(bus, getDecodedInputStream(file, conf), file, boolean2Boolean(false), getFilter _) case x: Exception => { println(s"Failed replaying events from ${file} [${x.getMessage}]") } } // Borrowed from CompressionCodecs in spark private def getDecodedInputStream(file: String, conf: SparkConf): InputStream = { val fs = FileSystem.get(new URI(file), HDFSConfigHelper.getHadoopConf(Some(conf))) val path = new Path(file) val bufStream = new BufferedInputStream(fs.open(path)) val logName = path.getName.stripSuffix(".inprogress") val codecName: Option[String] = logName.split("\\.").tail.lastOption codecName.getOrElse("") match { case "lz4" => new LZ4BlockInputStream(bufStream) case "lzf" => new LZFInputStream(bufStream) case "snappy" => new SnappyInputStream(bufStream) case _ => bufStream } } private def getFilter(eventString: String): Boolean = { implicit val formats = DefaultFormats eventFilter.contains(Json4sWrapper.parse(eventString).extract[Map[String, Any]].get("Event") .get.asInstanceOf[String]) } private def eventFilter: Set[String] = { Set( "SparkListenerTaskEnd", "SparkListenerApplicationStart", "SparkListenerApplicationEnd", "SparkListenerExecutorAdded", "SparkListenerExecutorRemoved", "SparkListenerJobStart", "SparkListenerJobEnd", "SparkListenerStageSubmitted", "SparkListenerStageCompleted" ) } }
Example 132
Source File: ExecutorDelegationTokenUpdater.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(hadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { executorUpdaterRunnable.run() } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 133
Source File: SimrSchedulerBackend.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) fs.delete(new Path(driverFilePath), false) super.stop() } }
Example 134
Source File: ExecutorSource.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.executor

import java.util.concurrent.ThreadPoolExecutor

import scala.collection.JavaConversions._

import com.codahale.metrics.{Gauge, MetricRegistry}
import org.apache.hadoop.fs.FileSystem

import org.apache.spark.metrics.source.Source

private[spark] class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source {

  private def fileStats(scheme: String): Option[FileSystem.Statistics] =
    FileSystem.getAllStatistics().find(s => s.getScheme.equals(scheme))

  private def registerFileSystemStat[T](
      scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = {
    metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] {
      override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue)
    })
  }

  override val metricRegistry = new MetricRegistry()

  override val sourceName = "executor"

  // Gauge for executor thread pool's actively executing task counts
  metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] {
    override def getValue: Int = threadPool.getActiveCount()
  })

  // Gauge for executor thread pool's approximate total number of tasks that have been completed
  metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] {
    override def getValue: Long = threadPool.getCompletedTaskCount()
  })

  // Gauge for executor thread pool's current number of threads
  metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getPoolSize()
  })

  // Gauge for executor thread pool's largest number of threads that have ever simultaneously
  // been in the pool
  metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getMaximumPoolSize()
  })

  // Gauge for file system stats of this executor
  for (scheme <- Array("hdfs", "file")) {
    registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L)
    registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L)
    registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0)
    registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0)
    registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0)
  }
}
Example 135
Source File: FeatureSelection.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.aerosolve.training import java.io.BufferedWriter import java.io.OutputStreamWriter import java.util import com.airbnb.aerosolve.core.{ModelRecord, ModelHeader, FeatureVector, Example} import com.airbnb.aerosolve.core.models.LinearModel import com.airbnb.aerosolve.core.util.Util import com.typesafe.config.Config import org.slf4j.{LoggerFactory, Logger} import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.Buffer import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ import scala.util.Random import scala.math.abs import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path object FeatureSelection { private final val log: Logger = LoggerFactory.getLogger("FeatureSelection") val allKey : (String, String) = ("$ALL", "$POS") // Given a RDD compute the pointwise mutual information between // the positive label and the discrete features. def pointwiseMutualInformation(examples : RDD[Example], config : Config, key : String, rankKey : String, posThreshold : Double, minPosCount : Double, newCrosses : Boolean) : RDD[((String, String), Double)] = { val pointwise = LinearRankerUtils.makePointwise(examples, config, key, rankKey) val features = pointwise .mapPartitions(part => { // The tuple2 is var, var | positive val output = scala.collection.mutable.HashMap[(String, String), (Double, Double)]() part.foreach(example =>{ val featureVector = example.example.get(0) val isPos = if (featureVector.floatFeatures.get(rankKey).asScala.head._2 > posThreshold) 1.0 else 0.0 val all : (Double, Double) = output.getOrElse(allKey, (0.0, 0.0)) output.put(allKey, (all._1 + 1.0, all._2 + 1.0 * isPos)) val features : Array[(String, String)] = LinearRankerUtils.getFeatures(featureVector) if (newCrosses) { for (i <- features) { for (j <- features) { if (i._1 < j._1) { val key = ("%s<NEW>%s".format(i._1, j._1), "%s<NEW>%s".format(i._2, j._2)) val x = output.getOrElse(key, (0.0, 0.0)) output.put(key, (x._1 + 1.0, x._2 + 1.0 * isPos)) } } } } for (feature <- features) { val x = output.getOrElse(feature, (0.0, 0.0)) output.put(feature, (x._1 + 1.0, x._2 + 1.0 * isPos)) } }) output.iterator }) .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2)) .filter(x => x._2._2 >= minPosCount) val allCount = features.filter(x => x._1.equals(allKey)).take(1).head features.map(x => { val prob = x._2._1 / allCount._2._1 val probPos = x._2._2 / allCount._2._2 (x._1, math.log(probPos / prob) / math.log(2.0)) }) } // Returns the maximum entropy per family def maxEntropy(input : RDD[((String, String), Double)]) : RDD[((String, String), Double)] = { input .map(x => (x._1._1, (x._1._2, x._2))) .reduceByKey((a, b) => if (math.abs(a._2) > math.abs(b._2)) a else b) .map(x => ((x._1, x._2._1), x._2._2)) } }
Example 136
Source File: HDFSUtil.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.util import java.io.{BufferedReader, IOException, InputStreamReader} import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} object HDFSUtil extends ScalaLogging { private lazy val hadoopConfiguration = new Configuration() def lastTaskSucceed(path: String): Boolean = { if (dirExists(path)) { if (dirExists(path + "/_temporary")) { logger.info(s"Deleting partial data for $path.") deleteDirWithoutThrow(path) false } else { logger.info(s"$path exists") true } } else { logger.info(s"$path does not exist") false } } def dirExists(dir: String): Boolean = { val path = new Path(dir) val hdfs = FileSystem.get( new java.net.URI(dir), hadoopConfiguration) hdfs.exists(path) } def deleteDirWithoutThrow(dir: String): Unit = { val path = new Path(dir) val hdfs = FileSystem.get( new java.net.URI(dir), hadoopConfiguration) if (hdfs.exists(path)) { logger.warn(s"$dir exists, DELETING") try { hdfs.delete(path, true) } catch { case e: IOException => logger.error(s" exception $e") } } } def createPath(path: String): Unit = { val remotePath = new Path(path) val remoteFS = remotePath.getFileSystem(hadoopConfiguration) remoteFS.mkdirs(new Path(path)) } def readStringFromFile(inputFile : String): String = { val fs = FileSystem.get(new URI(inputFile), hadoopConfiguration) val path = new Path(inputFile) val stream = fs.open(path) val reader = new BufferedReader(new InputStreamReader(stream)) val str = Stream.continually(reader.readLine()).takeWhile(_ != null).mkString("\n") str } }
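A small usage sketch for HDFSUtil, assuming the object above is available on the classpath; the model directory and the side-file name are hypothetical. It checks whether a previous run completed, creates the directory if needed, and reads a small text file as one string.

import com.airbnb.common.ml.util.HDFSUtil

object HDFSUtilUsage {
  def main(args: Array[String]): Unit = {
    val modelDir = "hdfs:///tmp/models/run_1"        // hypothetical path

    // lastTaskSucceed also deletes leftover partial output (_temporary) from a failed run
    if (!HDFSUtil.lastTaskSucceed(modelDir)) {
      HDFSUtil.createPath(modelDir)
    }

    // read a small side file into a single string
    println(HDFSUtil.readStringFromFile(modelDir + "/params.txt"))
  }
}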
Example 137
Source File: WriteTransformer.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperations import java.io.{File, IOException} import scala.reflect.runtime.{universe => ru} import ai.deepsense.commons.utils.Version import ai.deepsense.commons.utils.FileOperations.deleteRecursivelyIfExists import ai.deepsense.deeplang.DOperation.Id import ai.deepsense.deeplang.documentation.OperationDocumentation import ai.deepsense.deeplang.doperables.Transformer import ai.deepsense.deeplang.doperations.exceptions.DeepSenseIOException import ai.deepsense.deeplang.params.{BooleanParam, Params, StringParam} import ai.deepsense.deeplang.{DOperation1To0, ExecutionContext} import java.net.URI import org.apache.hadoop.fs.{FileSystem, Path} case class WriteTransformer() extends DOperation1To0[Transformer] with Params with OperationDocumentation { override val id: Id = "58368deb-68d0-4657-ae3f-145160cb1e2b" override val name: String = "Write Transformer" override val description: String = "Writes a Transformer to a directory" override val since: Version = Version(1, 1, 0) val shouldOverwrite = BooleanParam( name = "overwrite", description = Some("Should an existing transformer with the same name be overwritten?") ) setDefault(shouldOverwrite, true) def getShouldOverwrite: Boolean = $(shouldOverwrite) def setShouldOverwrite(value: Boolean): this.type = set(shouldOverwrite, value) val outputPath = StringParam( name = "output path", description = Some("The output path for writing the Transformer.")) def getOutputPath: String = $(outputPath) def setOutputPath(value: String): this.type = set(outputPath, value) val specificParams: Array[ai.deepsense.deeplang.params.Param[_]] = Array(outputPath, shouldOverwrite) override protected def execute(transformer: Transformer)(context: ExecutionContext): Unit = { val outputDictPath = getOutputPath try { if (getShouldOverwrite) { removeDirectory(context, outputDictPath) } transformer.save(context, outputDictPath) } catch { case e: IOException => logger.error(s"WriteTransformer error. Could not write transformer to the directory", e) throw DeepSenseIOException(e) } } private def removeDirectory(context: ExecutionContext, path: String): Unit = { if (path.startsWith("hdfs://")) { val configuration = context.sparkContext.hadoopConfiguration val hdfs = FileSystem.get(new URI(extractHdfsAddress(path)), configuration) hdfs.delete(new Path(path), true) } else { deleteRecursivelyIfExists(new File(path)) } } private def extractHdfsAddress(path: String): String = { // first group: "hdfs://ip.addr.of.hdfs", second group: "/some/path/on/hdfs" val regex = "(hdfs:\\/\\/[^\\/]*)(.*)".r val regex(hdfsAddress, _) = path hdfsAddress } @transient override lazy val tTagTI_0: ru.TypeTag[Transformer] = ru.typeTag[Transformer] } object WriteTransformer { def apply(outputPath: String): WriteTransformer = { new WriteTransformer().setOutputPath(outputPath) } }
Example 138
Source File: FileDownloader.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage import java.io.{BufferedWriter, FileOutputStream, IOException, OutputStreamWriter} import java.nio.file.{Files, Paths} import java.util.UUID import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import ai.deepsense.deeplang.ExecutionContext import ai.deepsense.deeplang.doperations.exceptions.DeepSenseIOException import ai.deepsense.deeplang.doperations.readwritedataframe.FilePath private[filestorage] object FileDownloader { def downloadFile(url: String)(implicit context: ExecutionContext): FilePath = { if (context.tempPath.startsWith("hdfs://")) { downloadFileToHdfs(url) } else { downloadFileToDriver(url) } } private def downloadFileToHdfs(url: String)(implicit context: ExecutionContext) = { val content = scala.io.Source.fromURL(url).getLines() val hdfsPath = s"${context.tempPath}/${UUID.randomUUID()}" val configuration = new Configuration() val hdfs = FileSystem.get(configuration) val file = new Path(hdfsPath) val hdfsStream = hdfs.create(file) val writer = new BufferedWriter(new OutputStreamWriter(hdfsStream)) try { content.foreach {s => writer.write(s) writer.newLine() } } finally { safeClose(writer) hdfs.close() } FilePath(hdfsPath) } private def downloadFileToDriver(url: String) (implicit context: ExecutionContext) = { val outputDirPath = Paths.get(context.tempPath) // We're checking if the output is a directory following symlinks. // The default behaviour of createDirectories is NOT to follow symlinks if (!Files.isDirectory(outputDirPath)) { Files.createDirectories(outputDirPath) } val outFilePath = Files.createTempFile(outputDirPath, "download", ".csv") // content is a stream. Do not invoke stuff like .toList() on it. val content = scala.io.Source.fromURL(url).getLines() val writer: BufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFilePath.toFile))) try { content.foreach {s => writer.write(s) writer.newLine() } } finally { safeClose(writer) } FilePath(s"file:///$outFilePath") } private def safeClose(bufferedWriter: BufferedWriter): Unit = { try { bufferedWriter.flush() bufferedWriter.close() } catch { case e: IOException => throw new DeepSenseIOException(e) } } }
Example 139
Source File: DQMainClass.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.utils import java.util.Locale import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveContext trait DQMainClass { this: DQSparkContext with Logging => private def initLogger(): Unit = { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("org.apache.spark.scheduler.TaskSetManager").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.OFF) Logger.getLogger("io.netty").setLevel(Level.OFF) Logger.getLogger("org.spark-project.jetty").setLevel(Level.OFF) Logger.getLogger("org.apache.hadoop.hdfs.KeyProviderCache").setLevel(Level.OFF) } private def makeFileSystem(settings: DQSettings, sc: SparkContext): FileSystem = { if (sc.isLocal) FileSystem.getLocal(sc.hadoopConfiguration) else { if (settings.s3Bucket.isDefined) { sc.hadoopConfiguration.set("fs.defaultFS", settings.s3Bucket.get) sc.hadoopConfiguration.set("fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") } FileSystem.get( sc.hadoopConfiguration) } } protected def body()(implicit fs: FileSystem, sparkContext: SparkContext, sqlContext: SQLContext, sqlWriter: HistoryDBManager, settings: DQSettings): Boolean def preMessage(task: String): Unit = { log.warn("************************************************************************") log.warn(s" Starting execution of task $task") log.warn("************************************************************************") } def postMessage(task: String): Unit = { log.warn("************************************************************************") log.warn(s" Finishing execution of task $task") log.warn("************************************************************************") } def main(args: Array[String]): Unit = { // set to avoid casting problems in metric result name generation Locale.setDefault(Locale.ENGLISH) initLogger() DQCommandLineOptions.parser().parse(args, DQCommandLineOptions("","")) match { case Some(commandLineOptions) => // Load our own config values from the default location, application.conf val settings = new DQSettings(commandLineOptions) val sparkContext = makeSparkContext(settings) val fs = makeFileSystem(settings, sparkContext) settings.logThis()(log) val sqlContext: SQLContext = if (settings.hiveDir.isDefined) { val hc = new HiveContext(sparkContext) hc.setConf("hive.metastore.warehouse.dir", settings.hiveDir.get) hc } else makeSqlContext(sparkContext) val historyDatabase = new HistoryDBManager(settings) // Starting application body preMessage(s"{${settings.appName}}") val startTime = System.currentTimeMillis() body()(fs, sparkContext, sqlContext, historyDatabase, settings) postMessage(s"{${settings.appName}}") log.info(s"Execution finished in [${(System.currentTimeMillis() - startTime) / 60000}] min(s)") log.info("Closing application...") historyDatabase.closeConnection() sparkContext.stop() log.info("Spark context were terminated. Exiting...") case None => log.error("Wrong parameters provided") throw new Exception("Wrong parameters provided") } } }
Example 140
Source File: TransposePostprocessor.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, SQLContext} import scala.collection.JavaConversions._ final class TransposePostprocessor(config: Config, settings: DQSettings) extends BasicPostprocessor(config, settings: DQSettings) { private val vs = config.getString("source") private val keys = config.getStringList("keyColumns") private val target: HdfsTargetConfig = { val conf = config.getConfig("saveTo") utils.parseTargetConfig(conf)(settings).get } override def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])( implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile = { import sqlContext.implicits._ def toLong(df: DataFrame, by: Seq[String]): DataFrame = { val (cols, types) = df.dtypes.filter { case (c, _) => !by.contains(c) }.unzip require(types.distinct.length == 1) val kvs = explode( array( cols.map(c => struct(lit(c).alias(settings.backComp.trKeyName), col(c).alias(settings.backComp.trValueName))): _* )) val byExprs = by.map(col) df.select(byExprs :+ kvs.alias("_kvs"): _*) .select(byExprs ++ Seq($"_kvs.${settings.backComp.trKeyName}", $"_kvs.${settings.backComp.trValueName}"): _*) } val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vs).head val df: DataFrame = HdfsReader.load(reqVS, settings.ref_date).head val transposed: DataFrame = toLong(df, keys) HdfsWriter.saveVirtualSource(transposed, target, settings.refDateString)( fs, sqlContext.sparkContext) new HdfsFile(target) } }
Example 141
Source File: EnrichPostprocessor.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import java.util import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.functions.lit import org.apache.spark.sql.{DataFrame, SQLContext} import scala.collection.JavaConversions._ import scala.util.Try final class EnrichPostprocessor(config: Config, settings: DQSettings) extends BasicPostprocessor(config, settings) { private val vs: Option[String] = Try(config.getString("source")).toOption private val metrics: util.List[String] = config.getStringList("metrics") private val checks: util.List[String] = config.getStringList("checks") private val extra = config.getObject("extra").toMap private val target: HdfsTargetConfig = { val conf = config.getConfig("saveTo") utils.parseTargetConfig(conf)(settings).get } override def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])( implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile = { import sqlContext.implicits._ val df: DataFrame = vs match { case Some(vsource) => val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vsource).head HdfsReader.load(reqVS, settings.ref_date).head case None => sqlContext.sparkContext.parallelize(Seq(1)).toDF("teapot") } val reqMet: Seq[(String, Double)] = metRes .filter(mr => metrics.contains(mr.metricId)) .map(mr => mr.metricId -> mr.result) val reqCheck: Seq[(String, String)] = chkRes .filter(cr => checks.contains(cr.checkId)) .map(cr => cr.checkId -> cr.status) if (reqMet.size != metrics.size()) throw IllegalParameterException("Some of stated metrics are missing!") if (reqCheck.size != checks.size()) throw IllegalParameterException("Some of stated checks are missing!") val dfWithMet: DataFrame = reqMet.foldLeft(df)((df, met) => df.withColumn(met._1, lit(met._2))) val dfWithChecks = reqCheck.foldLeft(dfWithMet)((df, met) => df.withColumn(met._1, lit(met._2))) val dfWithExtra = extra.foldLeft(dfWithChecks)((df, ex) => df.withColumn(ex._1, lit(ex._2.unwrapped()))) HdfsWriter.saveVirtualSource( dfWithExtra.drop("teapot"), target, settings.refDateString)(fs, sqlContext.sparkContext) new HdfsFile(target) } }
Example 142
Source File: ArrangePostprocessor.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, NumericType} import org.apache.spark.sql.{Column, DataFrame, SQLContext} import scala.collection.JavaConversions._ final class ArrangePostprocessor(config: Config, settings: DQSettings) extends BasicPostprocessor(config, settings) { private case class ColumnSelector(name: String, tipo: Option[String] = None, format: Option[String] = None, precision: Option[Integer] = None) { def toColumn()(implicit df: DataFrame): Column = { val dataType: Option[NumericType with Product with Serializable] = tipo.getOrElse("").toUpperCase match { case "DOUBLE" => Some(DoubleType) case "INT" => Some(IntegerType) case "LONG" => Some(LongType) case _ => None } import org.apache.spark.sql.functions.format_number import org.apache.spark.sql.functions.format_string (dataType, precision, format) match { case (Some(dt), None, None) => df(name).cast(dt) case(Some(dt), None, Some(f)) => format_string(f, df(name).cast(dt)).alias(name) case (Some(dt), Some(p),None) => format_number(df(name).cast(dt), p).alias(name) case (None, Some(p), None) => format_number(df(name), p).alias(name) case (None, None, Some(f)) => format_string(f, df(name)).alias(name) case _ => df(name) } } } private val vs = config.getString("source") private val target: HdfsTargetConfig = { val conf = config.getConfig("saveTo") utils.parseTargetConfig(conf)(settings).get } private val columns: Seq[ColumnSelector] = config.getAnyRefList("columnOrder").map { case x: String => ColumnSelector(x) case x: java.util.HashMap[_, String] => { val (name, v) = x.head.asInstanceOf[String Tuple2 _] v match { case v: String => ColumnSelector(name, Option(v)) case v: java.util.HashMap[String, _] => { val k = v.head._1 val f = v.head._2 f match { case f: Integer => ColumnSelector(name, Option(k), None, Option(f)) case f: String => ColumnSelector(name, Option(k), Option(f)) } } } } } override def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])( implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile = { val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vs).head implicit val df: DataFrame = HdfsReader.load(reqVS, settings.ref_date).head val arrangeDF = df.select(columns.map(_.toColumn): _*) HdfsWriter.saveVirtualSource(arrangeDF, target, settings.refDateString)( fs, sqlContext.sparkContext) new HdfsFile(target) } }
Example 143
Source File: PostprocessorType.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.utils.DQSettings import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.SQLContext object PostprocessorType extends Enumeration { val enrich: PostprocessorVal = PostprocessorVal("enrich", classOf[EnrichPostprocessor]) val transpose: PostprocessorVal = PostprocessorVal("transpose_by_key", classOf[TransposePostprocessor]) val headless: PostprocessorVal = PostprocessorVal("transpose_by_column", classOf[TransposeByColumnPostprocessor]) val arrange: PostprocessorVal = PostprocessorVal("arrange", classOf[ArrangePostprocessor]) protected case class PostprocessorVal(name: String, service: Class[_ <: BasicPostprocessor]) extends super.Val() { override def toString(): String = this.name } implicit def convert(value: Value): PostprocessorVal = value.asInstanceOf[PostprocessorVal] } abstract class BasicPostprocessor(config: Config, settings: DQSettings){ def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])(implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile }
Example 144
Source File: SparkTestSpec.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
import com.typesafe.config._ import it.agilelab.bigdata.DataQuality.configs.ConfigReader import it.agilelab.bigdata.DataQuality.metrics.{ColumnMetric, FileMetric, MetricProcessor} import it.agilelab.bigdata.DataQuality.sources.{HdfsFile, SourceConfig} import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.{SparkConf, SparkContext} import org.joda.time import org.scalatest.{BeforeAndAfterAll, FunSuite} import scala.util.Random class SparkTestSpec extends FunSuite with BeforeAndAfterAll { @transient private var _sc: SparkContext = _ def sc: SparkContext = _sc @transient private var _fs: FileSystem = _ def fs: FileSystem = _fs val SAMPLE_SIZE = 100 val r = Random r.setSeed(123) val settings: DQSettings = new DQSettings( conf = ConfigFactory.parseURL(getClass.getResource("/application.conf")).getConfig("dataquality"), configFilePath = getClass.getResource("/conf/test.conf").getPath, repartition = false, local = true, ref_date = time.DateTime.now() ) val conf = new SparkConf().setAppName(settings.appName) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .set("spark.kryoserializer.buffer.max", "128") .set("spark.sql.parquet.compression.codec", "snappy") .setMaster("local[*]") val localSqlWriter = new HistoryDBManager(settings) override def beforeAll() { _sc = new SparkContext(conf) _fs = FileSystem.getLocal(_sc.hadoopConfiguration) super.beforeAll() } test("parse basic conf") { val configuration = new ConfigReader(settings.configFilePath)(localSqlWriter, settings) val testSource: HdfsFile = HdfsFile("T1", "./t1.csv", "csv", true, "2018-03-26", None) val sources: Map[String, SourceConfig] = configuration.sourcesConfigMap assert(sources.keySet.size == 3, "Should be equal 3") assert(sources.keySet == Set("T1","T2","T3")) assert(sources("T1") === testSource) assert(true) } case class TestRow( str: String = r.nextString(5), int: Int = r.nextInt(), long: Long = r.nextLong(), double: Double = r.nextDouble(), boolean: Boolean = r.nextBoolean() ) test("run basic metrics") { val sqlContext = new SQLContext(sc) val input: DataFrame = sqlContext.createDataFrame(List.fill(SAMPLE_SIZE)(TestRow.apply())) val metric = FileMetric("123","ROW_COUNT","","input","2018-12-12",Map.empty) val res: (Map[Seq[String], Map[ColumnMetric, (Double, Option[String])]], Map[FileMetric, (Double, Option[String])]) = MetricProcessor.processAllMetrics(input, Seq.empty, Seq(metric), Seq.empty)(settings,sc, sqlContext, fs) assert(res._2(metric)._1 == SAMPLE_SIZE) } override def afterAll() { sc.stop() System.clearProperty("spark.driver.port") _sc = null super.afterAll() } }
Example 145
Source File: AvroInOutTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.io.avro import java.io.{File, FileNotFoundException, FileWriter} import java.nio.file.Paths import com.salesforce.op.test.TestSparkContext import com.salesforce.op.utils.io.avro.AvroInOut._ import org.apache.avro.generic.GenericRecord import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.rdd.RDD import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class AvroInOutTest extends FlatSpec with TestSparkContext { val avroSchemaPath = s"$testDataDir/PassengerDataAll.avsc" val avroFilePath = s"$testDataDir/PassengerDataAll.avro" val avroFileRecordCount = 891 val hdfs: FileSystem = FileSystem.get(sc.hadoopConfiguration) lazy val avroTemp: String = tempDir + "/avro-inout-test" Spec(AvroInOut.getClass) should "creates RDD from an avro file" in { val res = readPathSeq(avroFilePath, withCount = true, deepCopy = true, persist = false) res shouldBe a[RDD[_]] res.count shouldBe avroFileRecordCount } it should "creates RDD from a sequence of avro files" in { val res = readPathSeq(s"$avroFilePath,$avroFilePath") res.count shouldBe avroFileRecordCount*2 } it should "create RDD from a mixed sequence of valid and invalid avro files" in { val res = readPathSeq(s"badfile/path1,$avroFilePath,badfile/path2,$avroFilePath,badfile/path3") res.count shouldBe avroFileRecordCount*2 } it should "throw an error if passed in avro files are invalid" in { val error = intercept[IllegalArgumentException](readPathSeq("badfile/path1,badfile/path2")) error.getMessage shouldBe "No valid directory found in path 'badfile/path1,badfile/path2'" } it should "creates Some(RDD) from an avro file" in { val res = read(avroFilePath) res.size shouldBe 1 res.get shouldBe an[RDD[_]] res.get.count shouldBe avroFileRecordCount } it should "create None from an invalid avro file" in { val res = read("badfile/path") res shouldBe None } Spec[AvroWriter[_]] should "writeAvro to filesystem" in { val avroData = readPathSeq(avroFilePath).asInstanceOf[RDD[GenericRecord]] val avroSchema = loadFile(avroSchemaPath) val error = intercept[FileNotFoundException](hdfs.listStatus(new Path(avroTemp))) error.getMessage shouldBe s"File $avroTemp does not exist" AvroWriter(avroData).writeAvro(avroTemp, avroSchema) val hdfsFiles = hdfs.listStatus(new Path(avroTemp)) filter (x => x.getPath.getName.contains("part")) val res = readPathSeq((for { x <- hdfsFiles } yield avroTemp + "/" + x.getPath.getName).mkString(",")) res.count shouldBe avroFileRecordCount } it should "checkPathsExist" in { val tmpDir = Paths.get(File.separator, "tmp").toFile val f1 = new File(tmpDir, "avroinouttest") f1.delete() val w = new FileWriter(f1) w.write("just checking") w.close() val f2 = new File(tmpDir, "thisfilecannotexist") f2.delete() val f3 = new File(tmpDir, "this file cannot exist") f3.delete() assume(f1.exists && !f2.exists && !f3.exists) // check for one dir being invalid in the path amongst two selectExistingPaths(s"$f1,$f2") shouldBe f1.toString // check if all dirs in the path are invalid then we get an exception intercept[IllegalArgumentException] { selectExistingPaths(f2.toString) } // also, check if all dirs in the path are invalid ( in a different way ) then we get an exception intercept[IllegalArgumentException] { selectExistingPaths(f3.toString) } // check for one dir being invalid ( in a different way ) in the path amongst the two dirs in it selectExistingPaths(s"$f1,$f3") shouldBe f1.toString // check for paths order insensitivity 
selectExistingPaths(s"$f3,$f1") shouldBe f1.toString // check for an exception if the path is an empty string intercept[IllegalArgumentException] { selectExistingPaths("") } } }
Example 146
Source File: BigQueryDataFrame.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.spotify.spark.bigquery import com.google.api.services.bigquery.model.TableReference import com.google.cloud.hadoop.io.bigquery.{BigQueryConfiguration, BigQueryStrings} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.{DataFrame, SQLContext} import com.databricks.spark.avro._ import scala.util.Random object CreateDisposition extends Enumeration { val CREATE_IF_NEEDED, CREATE_NEVER = Value } object WriteDisposition extends Enumeration { val WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY = Value } class BigQueryDataFrame(df: DataFrame) { val sqlContext: SQLContext = df.sqlContext val conf: Configuration = sqlContext.sparkContext.hadoopConfiguration val bq: BigQueryClient = BigQueryClient.getInstance(conf) sqlContext.setConf("spark.sql.avro.compression.codec", "deflate") def saveAsBigQueryTable(tableSpec: String, writeDisposition: WriteDisposition.Value = null, createDisposition: CreateDisposition.Value = null, tmpWriteOptions: Map[String,String] = null): Unit = saveAsBigQueryTable( BigQueryStrings.parseTableReference(tableSpec), writeDisposition, createDisposition, tmpWriteOptions) private def delete(path: Path): Unit = { val fs = FileSystem.get(path.toUri, conf) fs.delete(path, true) } }
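The private delete helper above shows the usual way to resolve a FileSystem for an arbitrary URI scheme before deleting. A minimal standalone sketch of the same pattern (the path string is hypothetical):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

def deleteRecursively(pathStr: String, conf: Configuration = new Configuration()): Boolean = {
  val path = new Path(pathStr)
  // Resolve the FileSystem from the path's own URI so gs://, hdfs:// and file:// all work
  val fs = FileSystem.get(path.toUri, conf)
  fs.delete(path, true) // 'true' = recursive; returns false if the path did not exist
}

deleteRecursively("hdfs:///tmp/spark-bigquery-staging") // hypothetical staging directory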
Example 147
Source File: JsonFileReporter.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.metrics import java.io.{BufferedWriter, Closeable, IOException, OutputStreamWriter} import java.util.{Timer, TimerTask} import java.util.concurrent.TimeUnit import scala.util.Try import scala.util.control.NonFatal import com.codahale.metrics.MetricRegistry import com.codahale.metrics.json.MetricsModule import com.fasterxml.jackson.databind.ObjectMapper import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kyuubi.Logging import org.apache.spark.{KyuubiSparkUtil, SparkConf} import org.apache.spark.KyuubiConf._ private[metrics] class JsonFileReporter(conf: SparkConf, registry: MetricRegistry) extends Closeable with Logging { private val jsonMapper = new ObjectMapper().registerModule( new MetricsModule(TimeUnit.MILLISECONDS, TimeUnit.MILLISECONDS, false)) private val timer = new Timer(true) private val interval = KyuubiSparkUtil.timeStringAsMs(conf.get(METRICS_REPORT_INTERVAL)) private val path = conf.get(METRICS_REPORT_LOCATION) private val hadoopConf = KyuubiSparkUtil.newConfiguration(conf) def start(): Unit = { timer.schedule(new TimerTask { var bw: BufferedWriter = _ override def run(): Unit = try { val json = jsonMapper.writerWithDefaultPrettyPrinter().writeValueAsString(registry) val tmpPath = new Path(path + ".tmp") val tmpPathUri = tmpPath.toUri val fs = if (tmpPathUri.getScheme == null && tmpPathUri.getAuthority == null) { FileSystem.getLocal(hadoopConf) } else { FileSystem.get(tmpPathUri, hadoopConf) } fs.delete(tmpPath, true) bw = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath, true))) bw.write(json) bw.close() fs.setPermission(tmpPath, FsPermission.createImmutable(Integer.parseInt("644", 8).toShort)) val finalPath = new Path(path) fs.rename(tmpPath, finalPath) fs.setPermission(finalPath, FsPermission.createImmutable(Integer.parseInt("644", 8).toShort)) } catch { case NonFatal(e) => error("Error writing metrics to json file" + path, e) } finally { if (bw != null) { Try(bw.close()) } } }, 0, interval) } override def close(): Unit = { timer.cancel() } }
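The reporter above writes to a .tmp file and then renames it into place, choosing the local or remote FileSystem from the URI scheme. A hedged standalone sketch of that write-then-rename pattern, assuming a plain Hadoop Configuration rather than Kyuubi's helpers:

import java.io.{BufferedWriter, OutputStreamWriter}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

def writeThenRename(target: String, content: String, conf: Configuration): Unit = {
  val tmpPath = new Path(target + ".tmp")
  val uri = tmpPath.toUri
  val fs =
    if (uri.getScheme == null && uri.getAuthority == null) FileSystem.getLocal(conf)
    else FileSystem.get(uri, conf)
  fs.delete(tmpPath, true)
  val writer = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath, true)))
  try writer.write(content) finally writer.close()
  fs.rename(tmpPath, new Path(target)) // rename is cheap on HDFS, but not atomic on object stores
}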
Example 148
Source File: HDFSTokenCollector.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.session.security import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.kyuubi.Logging import org.apache.spark.KyuubiSparkUtil._ import org.apache.spark.SparkConf import yaooqinn.kyuubi.service.ServiceException private[security] object HDFSTokenCollector extends TokenCollector with Logging { private def hadoopFStoAccess(conf: SparkConf, hadoopConf: Configuration): Set[FileSystem] = { val fileSystems = conf.getOption(ACCESS_FSS) .orElse(conf.getOption(ACCESS_NNS)) match { case Some(nns) => nns.split(",").map(new Path(_).getFileSystem(hadoopConf)).toSet case _ => Set.empty[FileSystem] } fileSystems + conf.getOption(STAGING_DIR).map(new Path(_).getFileSystem(hadoopConf)) .getOrElse(FileSystem.get(hadoopConf)) } private def renewer(hadoopConf: Configuration): String = { val tokenRenewer = Master.getMasterPrincipal(hadoopConf) debug("Delegation token renewer is: " + tokenRenewer) if (tokenRenewer == null || tokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer." error(errorMessage) throw new ServiceException(errorMessage) } tokenRenewer } override def obtainTokens(conf: SparkConf): Unit = try { val hadoopConf = newConfiguration(conf) val tokenRenewer = renewer(hadoopConf) val creds = new Credentials() hadoopFStoAccess(conf, hadoopConf).foreach { fs => fs.addDelegationTokens(tokenRenewer, creds) } UserGroupInformation.getCurrentUser.addCredentials(creds) } catch { case e: Exception => error("Failed to obtain HDFS tokens", e) } }
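Stripped of Kyuubi's configuration wrappers, the collector above boils down to asking each FileSystem for delegation tokens and attaching them to the current UGI. A minimal sketch under that assumption (the paths and renewer principal are hypothetical):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.{Credentials, UserGroupInformation}

def collectTokens(paths: Seq[String], renewer: String, hadoopConf: Configuration): Unit = {
  val creds = new Credentials()
  val fileSystems: Set[FileSystem] =
    paths.map(new Path(_).getFileSystem(hadoopConf)).toSet + FileSystem.get(hadoopConf)
  fileSystems.foreach(_.addDelegationTokens(renewer, creds))
  UserGroupInformation.getCurrentUser.addCredentials(creds)
}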
Example 149
Source File: KyuubiHiveUtil.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.utils import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.security.UserGroupInformation import org.apache.kyuubi.Logging import org.apache.spark.{KyuubiSparkUtil, SparkConf} object KyuubiHiveUtil extends Logging { private val HIVE_PREFIX = "hive." private val METASTORE_PREFIX = "metastore." val URIS: String = HIVE_PREFIX + METASTORE_PREFIX + "uris" val METASTORE_PRINCIPAL: String = HIVE_PREFIX + METASTORE_PREFIX + "kerberos.principal" def hiveConf(conf: SparkConf): HiveConf = { val hadoopConf = KyuubiSparkUtil.newConfiguration(conf) new HiveConf(hadoopConf, classOf[HiveConf]) } def addDelegationTokensToHiveState(ugi: UserGroupInformation): Unit = { val state = SessionState.get if (state != null) { addDelegationTokensToHiveState(state, ugi) } } def addDelegationTokensToHiveState(state: SessionState, ugi: UserGroupInformation): Unit = { state.getHdfsEncryptionShim match { case shim: org.apache.hadoop.hive.shims.Hadoop23Shims#HdfsEncryptionShim => try { val hdfsAdmin = ReflectUtils.getFieldValue(shim, "hdfsAdmin") val dfs = ReflectUtils.getFieldValue(hdfsAdmin, "dfs") dfs.asInstanceOf[FileSystem].addDelegationTokens(ugi.getUserName, ugi.getCredentials) } catch { case e: Exception => error("Failed add delegation token to hive session state", e) } case _ => } } }
Example 150
Source File: KyuubiDistributedCacheManager.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.net.URI import scala.collection.mutable.{HashMap, Map} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType} def addResource( fs: FileSystem, conf: Configuration, destPath: Path, localResources: HashMap[String, LocalResource], resourceType: LocalResourceType, link: String, statCache: Map[URI, FileStatus]): Unit = { cacheManager.addResource(fs, conf, destPath, localResources, resourceType, link, statCache, appMasterOnly = true) } }
Example 151
Source File: KyuubiDistributedCacheManagerSuite.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.net.URI import scala.collection.mutable.{HashMap, Map} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType, LocalResourceVisibility} import org.apache.hadoop.yarn.util.ConverterUtils import org.apache.spark.{KyuubiSparkUtil, SparkFunSuite} import org.mockito.Mockito.when import org.scalatest.mock.MockitoSugar import yaooqinn.kyuubi.utils.ReflectUtils class KyuubiDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar { class MockClientDistributedCacheManager extends ClientDistributedCacheManager { override def getVisibility(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]): LocalResourceVisibility = { LocalResourceVisibility.PRIVATE } } test("add resource") { val fs = mock[FileSystem] val conf = new Configuration() val destPath = new Path("file:///foo.bar.com:8080/tmp/testing") val localResources = HashMap[String, LocalResource]() val statCache = HashMap[URI, FileStatus]() val status = new FileStatus() when(fs.getFileStatus(destPath)).thenReturn(status) val fileLink = "link" ReflectUtils.setFieldValue( KyuubiDistributedCacheManager, "cacheManager", new MockClientDistributedCacheManager) KyuubiDistributedCacheManager.addResource( fs, conf, destPath, localResources, LocalResourceType.FILE, fileLink, statCache) val res = localResources(fileLink) assert(res.getVisibility === LocalResourceVisibility.PRIVATE) assert(ConverterUtils.getPathFromYarnURL(res.getResource) === destPath) assert(res.getSize === 0) assert(res.getTimestamp === 0) assert(res.getType === LocalResourceType.FILE) val status2 = new FileStatus( 10, false, 1, 1024, 10, 10, null, KyuubiSparkUtil.getCurrentUserName, null, new Path("/tmp/testing2")) val destPath2 = new Path("file:///foo.bar.com:8080/tmp/testing2") when(fs.getFileStatus(destPath2)).thenReturn(status2) val fileLink2 = "link2" KyuubiDistributedCacheManager.addResource( fs, conf, destPath2, localResources, LocalResourceType.FILE, fileLink2, statCache) val res2 = localResources(fileLink2) assert(res2.getVisibility === LocalResourceVisibility.PRIVATE) assert(ConverterUtils.getPathFromYarnURL(res2.getResource) === destPath2) assert(res2.getSize === 10) assert(res2.getTimestamp === 10) assert(res2.getType === LocalResourceType.FILE) } test("add resource when link null") { val distMgr = new MockClientDistributedCacheManager() val fs = mock[FileSystem] val conf = new Configuration() val destPath = new Path("file:///foo.bar.com:8080/tmp/testing") ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr) val localResources = HashMap[String, LocalResource]() val statCache = HashMap[URI, FileStatus]() when(fs.getFileStatus(destPath)).thenReturn(new FileStatus()) intercept[Exception] { KyuubiDistributedCacheManager.addResource( fs, conf, destPath, localResources, LocalResourceType.FILE, null, statCache) } assert(localResources.get("link") === None) assert(localResources.size === 0) } test("test addResource archive") { val distMgr = new MockClientDistributedCacheManager() ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr) val fs = mock[FileSystem] val conf = new Configuration() val destPath = new Path("file:///foo.bar.com:8080/tmp/testing") val localResources = HashMap[String, LocalResource]() val statCache = HashMap[URI, FileStatus]() val realFileStatus = new FileStatus(10, false, 1, 1024, 10, 10, null, "testOwner", null, new 
Path("/tmp/testing")) when(fs.getFileStatus(destPath)).thenReturn(realFileStatus) KyuubiDistributedCacheManager.addResource( fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link", statCache) val resource = localResources("link") assert(resource.getVisibility === LocalResourceVisibility.PRIVATE) assert(ConverterUtils.getPathFromYarnURL(resource.getResource) === destPath) assert(resource.getTimestamp === 10) assert(resource.getSize === 10) assert(resource.getType === LocalResourceType.ARCHIVE) } }
Example 152
Source File: HDFSHelperTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
package scalaDemo

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.collection.mutable.ListBuffer

object HDFSHelperTest {

  def start(args: Array[String]): Unit = {
    val hdfs: FileSystem = FileSystem.get(new Configuration)
    args(0) match {
      case "list" => traverse(hdfs, args(1))
      case "createFile" => HDFSHelper.createFile(hdfs, args(1))
      case "createFolder" => HDFSHelper.createFolder(hdfs, args(1))
      case "copyfile" => HDFSHelper.copyFile(hdfs, args(1), args(2))
      case "copyfolder" => HDFSHelper.copyFolder(hdfs, args(1), args(2))
      case "delete" => HDFSHelper.deleteFile(hdfs, args(1))
      case "copyfilefrom" => HDFSHelper.copyFileFromLocal(hdfs, args(1), args(2))
      case "copyfileto" => HDFSHelper.copyFileToLocal(hdfs, args(1), args(2))
      case "copyfolderfrom" => HDFSHelper.copyFolderFromLocal(hdfs, args(1), args(2))
      case "copyfolderto" => HDFSHelper.copyFolderToLocal(hdfs, args(1), args(2))
    }
  }

  def traverse(hdfs: FileSystem, hdfsPath: String) = {
    val holder: ListBuffer[String] = new ListBuffer[String]
    val paths: List[String] = HDFSHelper.listChildren(hdfs, hdfsPath, holder).toList
    for (path <- paths) {
      // path.toString is the full path of the file
      System.out.println("--------- path = " + path)
      // path.getName is only the file name, without the directory part
      System.out.println("--------- Path.getname = " + new Path(path).getName)
    }
  }
}
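Assuming the project's HDFSHelper object is on the classpath, the dispatcher above would be driven with an args array such as the hypothetical ones below:

// List a directory, then copy a local file up to HDFS (paths are hypothetical)
HDFSHelperTest.start(Array("list", "/user/demo"))
HDFSHelperTest.start(Array("copyfilefrom", "/tmp/data.csv", "/user/demo/data.csv"))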
Example 153
Source File: ExecutorDelegationTokenUpdater.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn

import java.util.concurrent.{Executors, TimeUnit}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.{Credentials, UserGroupInformation}

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.{ThreadUtils, Utils}

import scala.util.control.NonFatal

private[spark] class ExecutorDelegationTokenUpdater(
    sparkConf: SparkConf,
    hadoopConf: Configuration) extends Logging {

  @volatile private var lastCredentialsFileSuffix = 0

  private val credentialsFile = sparkConf.get("spark.yarn.credentials.file")
  private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache(
    hadoopConf, new Path(credentialsFile).toUri.getScheme)

  private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor(
    ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread"))

  // On the executor, this thread wakes up and picks up new tokens from HDFS, if any.
  private val executorUpdaterRunnable = new Runnable {
    override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired())
  }

  def updateCredentialsIfRequired(): Unit = {
    try {
      val credentialsFilePath = new Path(credentialsFile)
      val remoteFs = FileSystem.get(freshHadoopConf)
      SparkHadoopUtil.get.listFilesSorted(
        remoteFs, credentialsFilePath.getParent,
        credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
        .lastOption.foreach { credentialsStatus =>
          val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath)
          if (suffix > lastCredentialsFileSuffix) {
            logInfo("Reading new delegation tokens from " + credentialsStatus.getPath)
            val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath)
            lastCredentialsFileSuffix = suffix
            UserGroupInformation.getCurrentUser.addCredentials(newCredentials)
            logInfo("Tokens updated from credentials file.")
          } else {
            // Check every hour to see if new credentials arrived.
            logInfo("Updated delegation tokens were expected, but the driver has not updated the " +
              "tokens yet, will check again in an hour.")
            delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
            return
          }
        }
      val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal(
        sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials)
      if (timeFromNowToRenewal <= 0) {
        executorUpdaterRunnable.run()
      } else {
        logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.")
        delegationTokenRenewer.schedule(
          executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS)
      }
    } catch {
      // Since the file may get deleted while we are reading it, catch the Exception and come
      // back in an hour to try again
      case NonFatal(e) =>
        logWarning("Error while trying to update credentials, will try again in 1 hour", e)
        delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
    }
  }

  private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = {
    val stream = remoteFs.open(tokenPath)
    try {
      val newCredentials = new Credentials()
      newCredentials.readTokenStorageStream(stream)
      newCredentials
    } finally {
      stream.close()
    }
  }

  def stop(): Unit = {
    delegationTokenRenewer.shutdown()
  }
}
Example 154
Source File: SimrSchedulerBackend.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.rpc.RpcAddress
import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)
  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName,
      // Hostname or IP address where the driver runs
      RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt),
      CoarseGrainedSchedulerBackend.ENDPOINT_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: " + driverFilePath)
    logInfo("Writing Akka address: " + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    fs.delete(new Path(driverFilePath), false)
    super.stop()
  }
}
Example 155
Source File: Util.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util import java.io._ import java.util.concurrent.atomic.AtomicInteger import java.util.zip.{DeflaterOutputStream, InflaterInputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hbase.HBaseConfiguration object Util { val iteration = new AtomicInteger(0) def dropTempFilePath(conf: Configuration, path: String): Boolean = { val fileSystem = FileSystem.get(conf) val filePath = new Path(path) if (fileSystem.exists(filePath)) { fileSystem.delete(filePath, true) } else { false } } def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { val bos = new ByteArrayOutputStream val deflaterOutputStream = new DeflaterOutputStream(bos) val dos = new DataOutputStream(deflaterOutputStream) configuration.write(dos) dos.close() bos.toByteArray } def deserializeHBaseConfiguration(arr: Array[Byte]) = { val conf = HBaseConfiguration.create conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) conf } }
Example 156
Source File: Util.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util import java.io._ import java.util.concurrent.atomic.AtomicInteger import java.util.zip.{DeflaterOutputStream, InflaterInputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hbase.HBaseConfiguration object Util { val iteration = new AtomicInteger(0) def getTempFilePath(conf: Configuration, prefix: String): String = { val fileSystem = FileSystem.get(conf) val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}") if (fileSystem.exists(path)) { fileSystem.delete(path, true) } path.getName } def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { val bos = new ByteArrayOutputStream val deflaterOutputStream = new DeflaterOutputStream(bos) val dos = new DataOutputStream(deflaterOutputStream) configuration.write(dos) dos.close() bos.toByteArray } def deserializeHBaseConfiguration(arr: Array[Byte]) = { val conf = HBaseConfiguration.create conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) conf } }
Example 157
Source File: YARNHadoopDelegationTokenManager.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.security.Credentials import org.apache.spark.SparkConf import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def obtainDelegationTokens(hadoopConf: Configuration, creds: Credentials): Long = { val superInterval = delegationTokenManager.obtainDelegationTokens(hadoopConf, creds) credentialProviders.values.flatMap { provider => if (provider.credentialsRequired(hadoopConf)) { provider.obtainCredentials(hadoopConf, sparkConf, creds) } else { logDebug(s"Service ${provider.serviceName} does not require a token." + s" Check your configuration to see if security is disabled or not.") None } }.foldLeft(superInterval)(math.min) } private def getCredentialProviders: Map[String, ServiceCredentialProvider] = { val providers = loadCredentialProviders providers. filter { p => delegationTokenManager.isServiceEnabled(p.serviceName) } .map { p => (p.serviceName, p) } .toMap } private def loadCredentialProviders: List[ServiceCredentialProvider] = { ServiceLoader.load(classOf[ServiceCredentialProvider], Utils.getContextOrSparkClassLoader) .asScala .toList } }
Example 158
Source File: InsertIntoHiveDirCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.language.existentials import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.FileUtils import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.mapred._ import org.apache.spark.SparkException import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.hive.client.HiveClientImpl case class InsertIntoHiveDirCommand( isLocal: Boolean, storage: CatalogStorageFormat, query: LogicalPlan, overwrite: Boolean, outputColumns: Seq[Attribute]) extends SaveAsHiveFile { override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { assert(storage.locationUri.nonEmpty) val hiveTable = HiveClientImpl.toHiveTable(CatalogTable( identifier = TableIdentifier(storage.locationUri.get.toString, Some("default")), tableType = org.apache.spark.sql.catalyst.catalog.CatalogTableType.VIEW, storage = storage, schema = query.schema )) hiveTable.getMetadata.put(serdeConstants.SERIALIZATION_LIB, storage.serde.getOrElse(classOf[LazySimpleSerDe].getName)) val tableDesc = new TableDesc( hiveTable.getInputFormatClass, hiveTable.getOutputFormatClass, hiveTable.getMetadata ) val hadoopConf = sparkSession.sessionState.newHadoopConf() val jobConf = new JobConf(hadoopConf) val targetPath = new Path(storage.locationUri.get) val writeToPath = if (isLocal) { val localFileSystem = FileSystem.getLocal(jobConf) localFileSystem.makeQualified(targetPath) } else { val qualifiedPath = FileUtils.makeQualified(targetPath, hadoopConf) val dfs = qualifiedPath.getFileSystem(jobConf) if (!dfs.exists(qualifiedPath)) { dfs.mkdirs(qualifiedPath.getParent) } qualifiedPath } val tmpPath = getExternalTmpPath(sparkSession, hadoopConf, writeToPath) val fileSinkConf = new org.apache.spark.sql.hive.HiveShim.ShimFileSinkDesc( tmpPath.toString, tableDesc, false) try { saveAsHiveFile( sparkSession = sparkSession, plan = child, hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, outputLocation = tmpPath.toString, allColumns = outputColumns) val fs = writeToPath.getFileSystem(hadoopConf) if (overwrite && fs.exists(writeToPath)) { fs.listStatus(writeToPath).foreach { existFile => if (Option(existFile.getPath) != createdTempDir) fs.delete(existFile.getPath, true) } } fs.listStatus(tmpPath).foreach { tmpFile => fs.rename(tmpFile.getPath, writeToPath) } } catch { case e: Throwable => throw new SparkException( "Failed inserting overwrite directory " + storage.locationUri.get, e) } finally { deleteExternalTmpPath(hadoopConf) } Seq.empty[Row] } }
Example 159
Source File: QueryPartitionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.io.File import java.sql.Timestamp import com.google.common.io.Files import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import spark.implicits._ test("SPARK-5068: query data when path doesn't exist") { withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) { val testData = sparkContext.parallelize( (1 to 10).map(i => TestData(i, i.toString))).toDF() testData.createOrReplaceTempView("testData") val tmpDir = Files.createTempDir() // create the table for test sql(s"CREATE TABLE table_with_partition(key int,value string) " + s"PARTITIONED by (ds string) location '${tmpDir.toURI}' ") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + "SELECT key,value FROM testData") // test for the exist path checkAnswer(sql("select key,value from table_with_partition"), testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) // delete the path of one partition tmpDir.listFiles .find { f => f.isDirectory && f.getName().startsWith("ds=") } .foreach { f => Utils.deleteRecursively(f) } // test for after delete the path checkAnswer(sql("select key,value from table_with_partition"), testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) sql("DROP TABLE IF EXISTS table_with_partition") sql("DROP TABLE IF EXISTS createAndInsertTest") } } test("SPARK-21739: Cast expression should initialize timezoneId") { withTable("table_with_timestamp_partition") { sql("CREATE TABLE table_with_timestamp_partition(value int) PARTITIONED BY (ts TIMESTAMP)") sql("INSERT OVERWRITE TABLE table_with_timestamp_partition " + "PARTITION (ts = '2010-01-01 00:00:00.000') VALUES (1)") // test for Cast expression in TableReader checkAnswer(sql("SELECT * FROM table_with_timestamp_partition"), Seq(Row(1, Timestamp.valueOf("2010-01-01 00:00:00.000")))) // test for Cast expression in HiveTableScanExec checkAnswer(sql("SELECT value FROM table_with_timestamp_partition " + "WHERE ts = '2010-01-01 00:00:00.000'"), Row(1)) } } }
Example 160
Source File: StreamMetadata.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: FSDataOutputStream = null try { val fs = metadataFile.getFileSystem(hadoopConf) output = fs.create(metadataFile) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case NonFatal(e) => logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } finally { IOUtils.closeQuietly(output) } } }
Example 161
Source File: ParquetFileFormatSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkException import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext { test("read parquet footers in parallel") { def testReadFooters(ignoreCorruptFiles: Boolean): Unit = { withTempDir { dir => val fs = FileSystem.get(sparkContext.hadoopConfiguration) val basePath = dir.getCanonicalPath val path1 = new Path(basePath, "first") val path2 = new Path(basePath, "second") val path3 = new Path(basePath, "third") spark.range(1).toDF("a").coalesce(1).write.parquet(path1.toString) spark.range(1, 2).toDF("a").coalesce(1).write.parquet(path2.toString) spark.range(2, 3).toDF("a").coalesce(1).write.json(path3.toString) val fileStatuses = Seq(fs.listStatus(path1), fs.listStatus(path2), fs.listStatus(path3)).flatten val footers = ParquetFileFormat.readParquetFootersInParallel( sparkContext.hadoopConfiguration, fileStatuses, ignoreCorruptFiles) assert(footers.size == 2) } } testReadFooters(true) val exception = intercept[java.io.IOException] { testReadFooters(false) } assert(exception.getMessage().contains("Could not read footer for file")) } }
Example 162
Source File: RecordIOOutputFormatTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf import java.io.ByteArrayOutputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path} import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.hadoop.mapreduce.TaskAttemptContext import org.mockito.Matchers.any import org.mockito.Mockito.{verify, when} import org.scalatest.{BeforeAndAfter, FlatSpec} import org.scalatest.mock.MockitoSugar import com.amazonaws.services.sagemaker.sparksdk.protobuf.RecordIOOutputFormat.SageMakerProtobufRecordWriter class RecordIOOutputFormatTests extends FlatSpec with MockitoSugar with BeforeAndAfter { var sagemakerProtobufRecordWriter: SageMakerProtobufRecordWriter = _ var mockOutputStream : FSDataOutputStream = _ var byteArrayOutputStream: ByteArrayOutputStream = _ var mockTaskAttemptContext: TaskAttemptContext = _ var mockPath: Path = _ var mockFileSystem: FileSystem = _ before { byteArrayOutputStream = new ByteArrayOutputStream() mockOutputStream = mock[FSDataOutputStream] sagemakerProtobufRecordWriter = new SageMakerProtobufRecordWriter(mockOutputStream) mockTaskAttemptContext = mock[TaskAttemptContext] mockPath = mock[Path] mockFileSystem = mock[FileSystem] } it should "write an empty array of bytes" in { val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "write an array of bytes" in { val byteArray = Array[Byte](0, 0, 0, 0) byteArrayOutputStream.write(byteArray) val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "write an array of bytes, padding as necessary" in { byteArrayOutputStream.write(5) val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "write an array of bytes, padding only as much as necessary" in { byteArrayOutputStream.write(Array[Byte](0, 0, 0, 0, 0)) val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "create a record writer from a FSDataOutputStream created by the filesystem" in { val mockTaskAttemptContext = mock[TaskAttemptContext] val mockPath = mock[Path] val mockFileSystem = mock[FileSystem] when(mockPath.getFileSystem(any[Configuration])).thenReturn(mockFileSystem) new RecordIOOutputFormat() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { mockPath } }.getRecordWriter(mockTaskAttemptContext) verify(mockFileSystem).create(mockPath, true) } }
Example 163
Source File: IOReader.scala From spark-benchmarks with Apache License 2.0 | 5 votes |
package com.bbva.spark.benchmarks.dfsio import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} class IOReader(hadoopConf: Configuration, dataDir: String) extends IOTestBase(hadoopConf, dataDir) { def doIO(fileName: String, fileSize: BytesSize)(implicit conf: Configuration, fs: FileSystem): BytesSize = { val bufferSize = conf.getInt("test.io.file.buffer.size", DefaultBufferSize) // TODO GET RID OF DEFAULT val buffer: Array[Byte] = new Array[Byte](bufferSize) val filePath = new Path(dataDir, fileName.toString) logger.info("Reading file {} with size {}", filePath.toString, fileSize.toString) val in = fs.open(filePath) var actualSize: Long = 0 // TODO improve this try { Stream.continually(in.read(buffer, 0, bufferSize)) .takeWhile(_ > 0 && actualSize < fileSize) .foreach { currentSize => actualSize += currentSize logger.debug(s"Reading chunk of size $currentSize. Currently: $actualSize / $fileSize") } } finally { in.close() } logger.info("File {} with size {} read successfully", fileName, actualSize.toString) actualSize } }
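The read loop above streams a fixed-size buffer until the expected number of bytes has been consumed. When the file is small enough to hold in memory, the same FileSystem calls can be wrapped much more simply; a hedged sketch, assuming commons-io is available on the classpath:

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

def readSmallFile(pathStr: String, conf: Configuration = new Configuration()): Array[Byte] = {
  val fs = FileSystem.get(conf)
  val in = fs.open(new Path(pathStr)) // FSDataInputStream is a plain java.io.InputStream
  try IOUtils.toByteArray(in) finally in.close()
}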
Example 164
Source File: ExecutorDelegationTokenUpdater.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache( hadoopConf, new Path(credentialsFile).toUri.getScheme) private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(freshHadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { // We just checked for new credentials but none were there, wait a minute and retry. // This handles the shutdown case where the staging directory may have been removed(see // SPARK-12316 for more details). delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.MINUTES) } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 165
Source File: SimrSchedulerBackend.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) if (!fs.delete(new Path(driverFilePath), false)) { logWarning(s"error deleting ${driverFilePath}") } super.stop() } }
Example 166
Source File: ExecutorSource.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.executor import java.util.concurrent.ThreadPoolExecutor import scala.collection.JavaConverters._ import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.hadoop.fs.FileSystem import org.apache.spark.metrics.source.Source private[spark] class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source { private def fileStats(scheme: String) : Option[FileSystem.Statistics] = FileSystem.getAllStatistics.asScala.find(s => s.getScheme.equals(scheme)) private def registerFileSystemStat[T]( scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = { metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] { override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue) }) } override val metricRegistry = new MetricRegistry() override val sourceName = "executor" // Gauge for executor thread pool's actively executing task counts metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] { override def getValue: Int = threadPool.getActiveCount() }) // Gauge for executor thread pool's approximate total number of tasks that have been completed metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] { override def getValue: Long = threadPool.getCompletedTaskCount() }) // Gauge for executor thread pool's current number of threads metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] { override def getValue: Int = threadPool.getPoolSize() }) // Gauge got executor thread pool's largest number of threads that have ever simultaneously // been in th pool metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] { override def getValue: Int = threadPool.getMaximumPoolSize() }) // Gauge for file system stats of this executor for (scheme <- Array("hdfs", "file")) { registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L) registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L) registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0) registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0) registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0) } }
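The gauges above are thin wrappers over FileSystem.getAllStatistics. The same per-scheme counters can be inspected directly; a minimal sketch:

import scala.collection.JavaConverters._
import org.apache.hadoop.fs.FileSystem

// Bytes read through the "hdfs" scheme so far in this JVM (0 if nothing has been read yet)
val hdfsBytesRead: Long = FileSystem.getAllStatistics.asScala
  .find(_.getScheme == "hdfs")
  .map(_.getBytesRead)
  .getOrElse(0L)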
Example 167
Source File: LineCount.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.operations import java.net.URI import com.cloudera.spark.cloud.ObjectStoreExample import com.cloudera.spark.cloud.s3.SequentialIOPolicy import com.cloudera.spark.cloud.common.CloudTestKeys._ import com.cloudera.spark.cloud.s3.SequentialIOPolicy import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} destFsInfo = Some(s"\nFile System $destPath=\n$destFS\n") } } srcFsInfo = Some(s"\nSource File System = $sourceFs\n") } finally { logInfo("Stopping Spark Context") sc.stop() srcFsInfo.foreach(logInfo(_)) destFsInfo.foreach(logInfo(_)) } 0 } def defaultSource: Option[String] = { Some(S3A_CSV_PATH_DEFAULT) } def maybeEnableAnonymousAccess( sparkConf: SparkConf, dest: Option[String]): Unit = { if (dest.isEmpty) { hconf(sparkConf, AWS_CREDENTIALS_PROVIDER, ANONYMOUS_CREDENTIALS) } } }
Example 168
Source File: S3ADataFrames.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.s3 import com.cloudera.spark.cloud.common.CloudTestKeys import com.cloudera.spark.cloud.operations.CloudDataFrames import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession object S3ADataFrames extends CloudDataFrames with S3AExampleSetup { override def extraValidation( session: SparkSession, conf: Configuration, fs: FileSystem, results: Seq[(String, Path, Long, Long)]): Unit = { val operations = new S3AOperations(fs) if (conf.getBoolean(CloudTestKeys.S3A_COMMITTER_TEST_ENABLED, false)) { results.foreach((tuple: (String, Path, Long, Long)) => { operations.verifyS3Committer(tuple._2, None, None, "") }) } } }
Example 169
Source File: SeekReadTests.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.common import org.apache.hadoop.fs.FileSystem class SeekReadTests extends CloudSuiteWithCSVDatasource { override def enabled: Boolean = super.enabled && hasCSVTestFile ctest("SeekReadFully", """Assess cost of seek and read operations. | When moving the cursor in an input stream, an HTTP connection may be closed and | then re-opened. This can be very expensive; tactics like streaming forwards instead | of seeking, and/or postponing movement until the following read ('lazy seek') try | to address this. Logging these operation times helps track performance. | This test also tries to catch out a regression, where a `close()` operation | is implemented through reading through the entire input stream. This is exhibited | in the time to `close()` while at offset 0 being `O(len(file))`. | | Note also the cost of `readFully()`; this method call is common inside libraries | like Orc and Parquet.""".stripMargin) { val (source, fs) = getCSVSourceAndFileSystem() FileSystem.clearStatistics fs.getStorageStatistics.reset() val st = logDuration("stat") { fs.getFileStatus(source) } val in = logDuration("open") { fs.open(source) } def time[T](operation: String)(testFun: => T): T = { logInfo(s"") var r = logDuration(operation + s" [pos = ${in.getPos}]")(testFun) logInfo(s" ${in.getWrappedStream}") r } val eof = st.getLen time("read()") { assert(-1 !== in.read()) } time("seek(256)") { in.seek(256) } time("seek(256)") { in.seek(256) } time("seek(EOF-2)") { in.seek(eof - 2) } time("read()") { assert(-1 !== in.read()) } def readFully(offset: Long, len: Int): Unit = { time(s"readFully($offset, byte[$len])") { val bytes = new Array[Byte](len) assert(-1 !== in.readFully(offset, bytes)) } } readFully(1L, 1) readFully(1L, 256) readFully(eof - 350, 300) readFully(260L, 256) readFully(1024L, 256) readFully(1536L, 256) readFully(8192L, 1024) readFully(8192L + 1024 + 512, 1024) readFully(0L, 1024) readFully(eof - 1024, 1024) time("seek(getPos)") { in.seek(in.getPos()) } time("read()") { assert(-1 !== in.read()) } logDuration("close()") { in.close } dumpFileSystemStatistics(fs.getStorageStatistics) } }
Example 170
Source File: TableFuncs.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.utils import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.SparkSession object TableFuncs{ def getTableMetadata(spark:SparkSession, tableName:String) = { val catalog = spark.sessionState.catalog val tId = spark.sessionState.sqlParser.parseTableIdentifier(tableName) catalog.getTableMetadata(tId) } def getTableDirectory(spark: SparkSession, tableName:String) ={ getTableMetadata(spark,tableName) .location .toString .split('/') .dropRight(1) .mkString("/") } def getExactSamplePath(spark: SparkSession, path:String) = { val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) val statuses = fs.globStatus(new org.apache.hadoop.fs.Path(path)) statuses.head.getPath.toString } def getParentFolderPath(spark: SparkSession, path: String): String = { val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) (new org.apache.hadoop.fs.Path(path)).getParent.toString } def getAllSamples(spark: SparkSession, path:String) = { val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) val statuses = fs.globStatus(new org.apache.hadoop.fs.Path(path)) //println(statuses.length) statuses .map(_.getPath.toString.split('/').takeRight(1).head.split('.').take(1).head) } }
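With a running SparkSession named spark, the glob helpers above can be exercised directly; the sample paths below are hypothetical:

// All sample names matching the glob, and the resolved path of the first match
val sampleNames = TableFuncs.getAllSamples(spark, "/data/bams/*.bam")
val firstSample = TableFuncs.getExactSamplePath(spark, "/data/bams/NA12878*.bam")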
Example 171
Source File: FileHandler.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.common.security import java.nio.file._ import java.security.AccessControlException import scala.collection.JavaConverters._ import org.apache.hadoop.fs.{FileSystem, Path} import com.paypal.gimel.common.conf.GimelConstants import com.paypal.gimel.logger.Logger object FileHandler { val logger = Logger(this.getClass) def checkIfFileAccessibleByOthers(filePath: String, source: String, fail: Boolean): Unit = { source.toLowerCase() match { case GimelConstants.HADDOP_FILE_SYSTEM => val conf = new org.apache.hadoop.conf.Configuration() val fs = FileSystem.get(conf) val hdfsPath = new Path(filePath) if (fs.exists(hdfsPath)) { val permission = fs.getFileStatus(hdfsPath).getPermission.toString if (permission.substring(3, permission.length) != "------") { val message = s"FILE IS NOT PROTECTED. PLEASE PROTECT THE FILE WITH PROPER PERMISSIONS (700) : ${filePath}" if (fail) { throw new AccessControlException(message) } } } case GimelConstants.LOCAL_FILE_SYSTEM => val path = Paths.get(filePath) if (Files.exists(path)) { val p = Files.getPosixFilePermissions(path) if (p.asScala.exists(x => x.toString.startsWith("OTHER") || x.toString.startsWith("GROUP"))) { val message = s"FILE IS NOT PROTECTED. PLEASE PROTECT THE FILE WITH PROPER PERMISSIONS (700) : ${filePath}" if (fail) { throw new AccessControlException(message) } } } } } }
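A hypothetical call against the helper above, failing fast when a credential file is readable by group or others:

// Throws AccessControlException unless the file is restricted to its owner (700)
FileHandler.checkIfFileAccessibleByOthers(
  filePath = "/tmp/gimel.keytab", // hypothetical path
  source = GimelConstants.HADDOP_FILE_SYSTEM,
  fail = true)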
Example 172
Source File: ArtifactHdfsSaver.scala From marvin-engine-executor with Apache License 2.0 | 5 votes |
package org.marvin.artifact.manager import java.io.{File, FileInputStream} import akka.Done import akka.actor.{Actor, ActorLogging} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.marvin.artifact.manager.ArtifactSaver.{SaveToLocal, SaveToRemote} import org.marvin.model.EngineMetadata class ArtifactHdfsSaver(metadata: EngineMetadata) extends Actor with ActorLogging { var conf: Configuration = _ override def preStart() = { log.info(s"${this.getClass().getCanonicalName} actor initialized...") conf = new Configuration() if (sys.env.get("HADOOP_CONF_DIR") != None){ val confFiles:List[File] = getListOfFiles(sys.env.get("HADOOP_CONF_DIR").mkString) for(file <- confFiles){ log.info(s"Loading ${file.getAbsolutePath} file to hdfs client configuration ..") conf.addResource(new FileInputStream(file)) } } conf.set("fs.defaultFS", metadata.hdfsHost) } def generatePaths(artifactName: String, protocol: String): Map[String, Path] = { Map( "localPath" -> new Path(s"${metadata.artifactsLocalPath}/${metadata.name}/$artifactName"), "remotePath" -> new Path(s"${metadata.artifactsRemotePath}/${metadata.name}/${metadata.version}/$artifactName/$protocol") ) } def getListOfFiles(path: String): List[File] = { val dir = new File(path) val extensions = List("xml") dir.listFiles.filter(_.isFile).toList.filter { file => extensions.exists(file.getName.endsWith(_)) } } def validatePath(path: Path, isRemote: Boolean, fs: FileSystem): Boolean = { if (isRemote) { fs.exists(path) } else { new java.io.File(path.toString).exists } } override def receive: Receive = { case SaveToLocal(artifactName, protocol) => log.info("Receive message and starting to working...") val fs = FileSystem.get(conf) val uris = generatePaths(artifactName, protocol) if (validatePath(uris("remotePath"), true, fs)) { log.info(s"Copying files from ${uris("remotePath")} to ${uris("localPath")}") fs.copyToLocalFile(false, uris("remotePath"), uris("localPath"), false) fs.close() log.info(s"File ${uris("localPath")} saved!") } else { log.error(s"Invalid protocol: ${protocol}, save process canceled!") } sender ! Done case SaveToRemote(artifactName, protocol) => log.info("Receive message and starting to working...") val fs = FileSystem.get(conf) val uris = generatePaths(artifactName, protocol) if (validatePath(uris("localPath"), false, fs)) { log.info(s"Copying files from ${uris("localPath")} to ${uris("remotePath")}") fs.copyFromLocalFile(uris("localPath"), uris("remotePath")) fs.close() log.info(s"File ${uris("localPath")} saved!") } else { log.error(s"Invalid protocol: ${protocol}, save process canceled!") } sender ! Done case _ => log.warning("Received a bad format message...") } }
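Stripped of the Akka plumbing, the actor's round trip is just copyToLocalFile and copyFromLocalFile on a configured FileSystem. A sketch; the defaultFS host and both paths are placeholders:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val conf = new Configuration()
conf.set("fs.defaultFS", "hdfs://namenode:8020")            // placeholder host

val fs     = FileSystem.get(conf)
val remote = new Path("/artifacts/engine/1.0/model/hdfs")   // hypothetical paths
val local  = new Path("/tmp/model")

if (fs.exists(remote)) {
  fs.copyToLocalFile(false, remote, local, false)           // delSrc = false, useRawLocalFileSystem = false
}
fs.copyFromLocalFile(local, remote)
fs.close()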
Example 173
Source File: IndexBuilder.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.recommender import com.datastax.spark.connector._ import com.typesafe.config.Config import io.gzet.recommender.Config._ import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.SparkContext import spark.jobserver._ object IndexBuilder extends SparkJob { override def runJob(sc: SparkContext, conf: Config): Any = { val inputDir = conf.getString("input.dir") val sampleSizeB = sc.broadcast(SAMPLE_SIZE) val audioSongRDD = AudioLibrary.read(inputDir, sc, MIN_TIME, MAX_TIME) val songRDD = audioSongRDD.keys.sortBy(song => song).zipWithIndex().mapValues(l => l + 1) val songIdsB = sc.broadcast(songRDD.collectAsMap()) val audioRDD = audioSongRDD mapPartitions { audios => val songIds = songIdsB.value audios map { case (song, audio) => (songIds.get(song).get, audio) } } val sampleRDD = audioRDD flatMap { case (songId, audio) => audio.sampleByTime(sampleSizeB.value) map { sample => (songId, sample) } } val recordRDD = songRDD map { case (name, id) => Record(id, name) } val hashRDD = sampleRDD.map({case (songId, sample) => ((sample.hash, songId), Array(sample.id)) }).reduceByKey(_ ++ _).mapValues(a => a.mkString(",")).map({case ((hash, songId), sampleIds) => (hash, songId) }).groupByKey().mapValues(it => it.toList).map({case (id, songs) => Hash(id, songs) }) hashRDD.saveAsCassandraTable(KEYSPACE, TABLE_HASH) recordRDD.saveAsCassandraTable(KEYSPACE, TABLE_RECORD) } def containsWav(hdfs: FileSystem, path: Path) = { val it = hdfs.listFiles(path, false) var i = 0 while(it.hasNext){ if(it.next().getPath.getName.endsWith(".wav")){ i += 1 } } i > 0 } override def validate(sc: SparkContext, config: Config): SparkJobValidation = { if(!config.hasPath("input.dir")) { SparkJobInvalid("Missing parameter [input.dir]") } else { val hdfs = FileSystem.get(sc.hadoopConfiguration) val path = new Path(config.getString("input.dir")) val isDir = hdfs.isDirectory(path) val isValid = containsWav(hdfs, path) hdfs.close() if(isDir && isValid) { SparkJobValid } else { SparkJobInvalid("Input directory does not contain .wav files") } } } }
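containsWav walks a RemoteIterator by hand; the same check generalises to any file extension. A small reusable sketch:

import org.apache.hadoop.fs.{FileSystem, Path}

// counts files with a given suffix directly under `dir` (non-recursive); a sketch only
def countWithSuffix(fs: FileSystem, dir: Path, suffix: String): Int = {
  val it = fs.listFiles(dir, false)
  var count = 0
  while (it.hasNext) {
    if (it.next().getPath.getName.endsWith(suffix)) count += 1
  }
  count
}

// containsWav is then equivalent to countWithSuffix(hdfs, path, ".wav") > 0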
Example 174
Source File: ReadingWritingData.scala From Spark-RSVD with Apache License 2.0 | 5 votes |
package com.criteo.rsvd import java.nio.ByteBuffer import com.esotericsoftware.kryo.Kryo import com.typesafe.scalalogging.slf4j.StrictLogging import de.javakaffee.kryoserializers.UnmodifiableCollectionsSerializer import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.spark.mllib.linalg.distributed.MatrixEntry import org.apache.spark.rdd.RDD import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer} import org.apache.spark.{SparkConf, SparkContext} import scala.reflect.ClassTag object ReadingWritingData extends StrictLogging { def getInputDataSizeMB(inputPathPattern: String, sc: SparkContext): Int = { val fs = FileSystem.get(sc.hadoopConfiguration) val path = new Path(inputPathPattern) (fs.globStatus(path).map(f => f.getLen).sum / 1024 / 1024).toInt } def loadMatrixEntries(inputPath: String, singlePartitionSizeMB: Int, sc: SparkContext): RDD[MatrixEntry] = { logger.info(s"Input matrix path: $inputPath") val inputDataSizeMB = getInputDataSizeMB(inputPath + " def makeRddFromKryoFile[T: ClassTag]( sc: SparkContext, path: String, minPartitionsOpt: Option[Int] = None): RDD[T] = { val minPartitions = minPartitionsOpt.getOrElse(sc.defaultMinPartitions) val serializer = new KryoSerializer(sc.getConf) sc.sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minPartitions) .mapPartitions { it => val instance = serializer.newInstance() it.flatMap { case (_, v) => instance.deserialize[Array[T]](ByteBuffer.wrap(v.getBytes)) } } } object RandomizedSVDKryoRegistrator extends KryoRegistrator { def registerClasses(kryo: Kryo): Unit = { UnmodifiableCollectionsSerializer.registerSerializers(kryo) kryo.register(classOf[MatrixEntry]) kryo.register(classOf[Array[MatrixEntry]]) } } def appendBasicRegistratorToSparkConf(sparkConf: SparkConf): SparkConf = appendRegistratorToSparkConf(sparkConf, RandomizedSVDKryoRegistrator.getClass.getName) def appendRegistratorToSparkConf(sparkConf: SparkConf, registratorName: String): SparkConf = { val oldValue = sparkConf.get("spark.kryo.registrator", "") if (oldValue == "") { sparkConf.set("spark.kryo.registrator", registratorName) } else { sparkConf.set("spark.kryo.registrator", oldValue + "," + registratorName) } } }
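getInputDataSizeMB is a common FileSystem idiom: expand a glob and sum the file lengths. The same check works from any SparkContext; a sketch with a hypothetical pattern:

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkContext

// total size in megabytes of all files matching the pattern; a sketch only
def globSizeMB(pattern: String, sc: SparkContext): Long = {
  val fs = FileSystem.get(sc.hadoopConfiguration)
  fs.globStatus(new Path(pattern)).map(_.getLen).sum / (1024L * 1024L)
}

// globSizeMB("/input/matrix/part-*", sc)   // hypothetical path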
Example 175
Source File: LasRelation.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.las import fr.ign.spark.iqmulus.{ BinarySectionRelation, BinarySection } import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.OutputWriterFactory import org.apache.hadoop.mapreduce.Job import org.apache.spark.sql.types._ import scala.util.{ Try, Success, Failure } class LasRelation( override val paths: Array[String], override val maybeDataSchema: Option[StructType], override val userDefinedPartitionColumns: Option[StructType], parameters: Map[String, String] )(@transient val sqlContext: SQLContext) extends BinarySectionRelation(parameters) { def format = parameters.get("lasformat").map(_.toByte) def minor = parameters.get("minor").map(_.toByte).getOrElse(Version.minorDefault) def major = parameters.get("major").map(_.toByte).getOrElse(Version.majorDefault) def version = parameters.get("version").map(Version.fromString) .getOrElse(Version(major, minor)) lazy val headers: Array[LasHeader] = paths flatMap { location => Try { val path = new Path(location) val fs = FileSystem.get(path.toUri, sqlContext.sparkContext.hadoopConfiguration) val dis = fs.open(path) try LasHeader.read(location, dis) finally { dis.close fs.close } } match { case Success(h) => Some(h) case Failure(e) => logWarning(s"Skipping $location : ${e.getMessage}"); None } } override def sections: Array[BinarySection] = headers.map(_.toBinarySection(paths)) override def prepareJobForWrite(job: Job): OutputWriterFactory = { new LasOutputWriterFactory(format, version) } }
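Note that the header loop resolves each location with FileSystem.get(path.toUri, conf), so paths on different schemes (file://, hdfs://, s3a://, ...) each get the right FileSystem instance. The open/read/close core of that pattern, as a hedged sketch with caller-supplied, purely illustrative locations:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import scala.util.Try

// reads the first four bytes of each location on whatever filesystem its URI points to
def readMagic(locations: Seq[String], conf: Configuration): Seq[(String, Try[Array[Byte]])] =
  locations.map { location =>
    location -> Try {
      val path = new Path(location)
      val fs   = FileSystem.get(path.toUri, conf)
      val in   = fs.open(path)
      try {
        val magic = new Array[Byte](4)
        in.readFully(magic)          // e.g. "LASF" for a .las file
        magic
      } finally in.close()
    }
  }

Unlike the snippet above, this sketch does not close the FileSystem itself, since FileSystem.get returns a cached, shared instance by default.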
Example 176
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats import com.landoop.streamreactor.connect.hive.Serde import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.ParquetWriter import scala.util.Try object ParquetHiveFormat extends HiveFormat { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def serde = Serde( "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", Map("serialization.format" -> "1") ) override def writer(path: Path, schema: Schema) (implicit fs: FileSystem): HiveWriter = new HiveWriter { logger.debug(s"Creating parquet writer at $path") val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx"))) val createdTime: Long = System.currentTimeMillis() var lastKnownFileSize: Long = fs.getFileStatus(path).getLen var readFileSize = false var count = 0 override def write(struct: Struct): Long = { writer.write(struct) count = count + 1 readFileSize = true count } override def close(): Unit = { logger.debug(s"Closing writer at path $path") writer.close() } override def currentCount: Long = count override def file: Path = path override def fileSize: Long = { if (readFileSize) { lastKnownFileSize = fs.getFileStatus(path).getLen readFileSize = false } lastKnownFileSize } } override def reader(path: Path, startAt: Int, schema: Schema) (implicit fs: FileSystem): HiveReader = new HiveReader { logger.debug(s"Creating parquet reader for $path with offset $startAt") val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path) var offset = startAt override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct => val record = Record(struct, path, offset) offset = offset + 1 record } override def close(): Unit = reader.close() } }
Example 177
Source File: HiveSinkState.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.streamreactor.connect.hive import com.landoop.streamreactor.connect.hive._ import com.landoop.streamreactor.connect.hive.sink.config.TableOptions import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.Table import org.apache.kafka.connect.data.{Schema, Struct} case class HiveSinkState(offsets: Map[TopicPartition, Offset], committedOffsets: Map[TopicPartition, Offset], table: Table, tableLocation: Path, plan: Option[PartitionPlan], metastoreSchema: Schema, mapper: Struct => Struct, lastSchema: Schema) { def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState = { copy(offsets = offsets + (tpo.toTopicPartition -> tpo.offset)) } def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(offsets = offsets + (tp -> offset)) } def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = { copy(committedOffsets = committedOffsets ++ offsets) } def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(committedOffsets = committedOffsets + (tp -> offset)) } def withLastSchema(schema: Schema): HiveSinkState = copy(lastSchema = schema) } object HiveSinkState { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def from(schema: Schema, table: TableOptions, dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem) = { logger.info(s"Init sink for schema $schema") val hiveTable = getOrCreateTable(table, dbName, schema) val tableLocation = new Path(hiveTable.getSd.getLocation) val plan = hive.partitionPlan(hiveTable) val metastoreSchema = table.evolutionPolicy .evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema) .getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema")) val mapperFns: Seq[Struct => Struct] = Seq( table.projection.map(new ProjectionMapper(_)), Some(new MetastoreSchemaAlignMapper(metastoreSchema)), plan.map(new DropPartitionValuesMapper(_)) ).flatten.map(mapper => mapper.map _) val mapper = Function.chain(mapperFns) HiveSinkState(Map.empty, Map.empty, hiveTable, tableLocation, plan, metastoreSchema, mapper, schema) } def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema) (implicit client: IMetaStoreClient, fs: FileSystem): Table = { def create: Table = { val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",") logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]") hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format) } logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}") client.tableExists(dbName.value, table.tableName.value) match { case true if table.overwriteTable => hive.dropTable(dbName, table.tableName, true) create case true => client.getTable(dbName.value, table.tableName.value) case false if table.createTable => create case false => throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist") } } }
Example 178
Source File: StrictPartitionHandler.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.partitioning import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.collection.JavaConverters._ import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} object StrictPartitionHandler extends PartitionHandler { override def path(partition: Partition, db: DatabaseName, tableName: TableName) (client: IMetaStoreClient, fs: FileSystem): Try[Path] = { try { val part = client.getPartition(db.value, tableName.value, partition.entries.map(_._2).toList.asJava) Success(new Path(part.getSd.getLocation)) } catch { case NonFatal(e) => Failure(new RuntimeException(s"Partition '${partition.entries.map(_._2).toList.mkString(",")}' does not exist and strict policy requires upfront creation", e)) } } }
Example 179
Source File: CachedPartitionHandler.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.partitioning import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.util.{Success, Try} class CachedPartitionHandler(partitioner: PartitionHandler) extends PartitionHandler { val cache = scala.collection.mutable.Map.empty[Partition, Path] override def path(partition: Partition, db: DatabaseName, tableName: TableName) (client: IMetaStoreClient, fs: FileSystem): Try[Path] = { cache.get(partition) match { case Some(path) => Success(path) case _ => val created = partitioner.path(partition, db, tableName)(client, fs) created.foreach(cache.put(partition, _)) created } } }
Example 180
Source File: DynamicPartitionHandler.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.partitioning import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.{StorageDescriptor, Table} import scala.collection.JavaConverters._ import scala.util.{Failure, Success, Try} class DynamicPartitionHandler(pathPolicy: PartitionPathPolicy = DefaultMetastorePartitionPathPolicy) extends PartitionHandler { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def path(partition: Partition, db: DatabaseName, tableName: TableName) (client: IMetaStoreClient, fs: FileSystem): Try[Path] = { def table: Table = client.getTable(db.value, tableName.value) def create(path: Path, table: Table): Unit = { logger.debug(s"New partition will be created at $path") val sd = new StorageDescriptor(table.getSd) sd.setLocation(path.toString) val params = new java.util.HashMap[String, String] val values = partition.entries.map(_._2).toList.asJava val ts = (System.currentTimeMillis / 1000).toInt val p = new org.apache.hadoop.hive.metastore.api.Partition(values, db.value, tableName.value, ts, 0, sd, params) logger.debug(s"Updating hive metastore with partition $p") client.add_partition(p) logger.info(s"Partition has been created in metastore [$partition]") } Try(client.getPartition(db.value, tableName.value, partition.entries.toList.map(_._2).asJava)) match { case Success(p) => Try { new Path(p.getSd.getLocation) } case Failure(_) => Try { val t = table val tableLocation = new Path(t.getSd.getLocation) val path = pathPolicy.path(tableLocation, partition) create(path, t) path } } } }
Example 181
Source File: StageManager.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.formats.HiveWriter import com.landoop.streamreactor.connect.hive.{TopicPartition, TopicPartitionOffset} import org.apache.hadoop.fs.{FileSystem, Path} class StageManager(filenamePolicy: FilenamePolicy) { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) private def stageFilename(tp: TopicPartition) = s".${filenamePolicy.prefix}_${tp.topic.value}_${tp.partition}" private def finalFilename(tpo: TopicPartitionOffset) = s"${filenamePolicy.prefix}_${tpo.topic.value}_${tpo.partition}_${tpo.offset.value}" def stage(dir: Path, tp: TopicPartition)(implicit fs: FileSystem): Path = { val filename = stageFilename(tp) val stagePath = new Path(dir, filename) fs.delete(stagePath, false) stagePath } def commit(stagePath: Path, tpo: TopicPartitionOffset)(implicit fs: FileSystem): Path = { val finalPath = new Path(stagePath.getParent, finalFilename(tpo)) logger.info(s"Committing file $stagePath=>$finalPath") fs.rename(stagePath, finalPath) finalPath } }
Example 182
Source File: OffsetSeeker.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive._ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.util.control.NonFatal class OffsetSeeker(filenamePolicy: FilenamePolicy) { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) import HdfsUtils._ def seek(db: DatabaseName, tableName: TableName) (implicit fs: FileSystem, client: IMetaStoreClient): Set[TopicPartitionOffset] = { try { // the table may not have been created, in which case we have no offsets defined if (client.tableExists(db.value, tableName.value)) { val loc = com.landoop.streamreactor.connect.hive.tableLocation(db, tableName) val prefix = filenamePolicy.prefix fs.ls(new Path(loc), true) .map(_.getPath.getName) .collect { case CommittedFileName(`prefix`, topic, partition, _, end) => TopicPartitionOffset(topic, partition, end) } .toSeq .groupBy(_.toTopicPartition) .map { case (tp, tpo) => tp.withOffset(tpo.maxBy(_.offset.value).offset) }.toSet } else { Set.empty } } catch { case NonFatal(e) => logger.error(s"Error seeking table ${db.value}.${tableName.value}") throw e } } }
Example 183
Source File: CommitPolicy.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.TopicPartitionOffset import org.apache.hadoop.fs.{FileSystem, Path} import scala.concurrent.duration.FiniteDuration case class DefaultCommitPolicy(fileSize: Option[Long], interval: Option[FiniteDuration], fileCount: Option[Long]) extends CommitPolicy { require(fileSize.isDefined || interval.isDefined || fileCount.isDefined) override def shouldFlush(context: CommitContext) (implicit fs: FileSystem): Boolean = { val open_time = System.currentTimeMillis() - context.createdTimestamp fileSize.exists(_ <= context.fileSize) || interval.exists(_.toMillis <= open_time) || fileCount.exists(_ <= context.count) } }
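DefaultCommitPolicy only needs a CommitContext built from the live FileStatus, which is exactly how DefaultCommitPolicyTest further down drives it. A usage sketch against the local filesystem; it assumes the stream-reactor classes shown on this page (Topic, Offset, TopicPartitionOffset, CommitContext) are on the classpath, and the staged path and record count are illustrative:

import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset}
import com.landoop.streamreactor.connect.hive.sink.staging.{CommitContext, DefaultCommitPolicy} // assumed package, per the snippets above
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import scala.concurrent.duration._

// flush once a staged file exceeds 10 MB or has been open for five minutes
implicit val fs = FileSystem.getLocal(new Configuration())
val policy = DefaultCommitPolicy(Some(10L * 1024 * 1024), Some(5.minutes), None)

val tpo    = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100))
val staged = new Path("staging-file")       // hypothetical staged file
val status = fs.getFileStatus(staged)

// CommitContext is populated the same way DefaultCommitPolicyTest does further down
val flush = policy.shouldFlush(CommitContext(tpo, staged, 1000L, status.getLen, status.getModificationTime))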
Example 184
Source File: HiveWriterManager.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.streamreactor.connect.hive.{Offset, TopicPartition, TopicPartitionOffset} import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveWriter} import com.landoop.streamreactor.connect.hive.sink.staging.StageManager import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.Schema def flush(offsets: Map[TopicPartition, Offset]): Unit = { logger.info(s"Flushing offsets $offsets") // we may not have an offset for a given topic/partition if no data was written to that TP writers.foreach { case (key, writer) => writer.close() offsets.get(key.tp).foreach { offset => stageManager.commit(writer.file, key.tp.withOffset(offset)) } writers.remove(key) } } def getWriters: Seq[OpenWriter] = writers.map { case (key, writer) => OpenWriter(key.tp, key.dir, writer) }.toList }
Example 185
Source File: TableFileScanner.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.source import com.landoop.streamreactor.connect.hive import com.landoop.streamreactor.connect.hive.HdfsUtils._ import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient // for a table that is not partitioned, the files will all reside directly in the table location directory // otherwise, the files will each live in the particular partition folder (which technically, could be anywhere) object TableFileScanner { def scan(db: DatabaseName, tableName: TableName) (implicit fs: FileSystem, client: IMetaStoreClient): Seq[(Path, Option[Partition])] = { // the partitions from the metastore which each contain a pointer to the partition location hive.partitionPlan(db, tableName) match { case Some(plan) => hive.partitions(db, tableName).flatMap { case partition@Partition(entries, Some(location)) => val files = fs.listFiles(location, false) files.map(_.getPath).toVector.map(_ -> Some(partition)) } case None => val table = client.getTable(db.value, tableName.value) val files = fs.listFiles(new Path(table.getSd.getLocation), false) files.map(_.getPath).toVector.map(_ -> None) } } }
Example 186
Source File: HiveSource.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.source import com.landoop.streamreactor.connect.hive import com.landoop.streamreactor.connect.hive._ import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveReader, Record} import com.landoop.streamreactor.connect.hive.source.config.HiveSourceConfig import com.landoop.streamreactor.connect.hive.source.mapper.{PartitionValueMapper, ProjectionMapper} import com.landoop.streamreactor.connect.hive.source.offset.HiveSourceOffsetStorageReader import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.kafka.connect.data.Struct import org.apache.kafka.connect.source.SourceRecord import scala.collection.JavaConverters._ class HiveSource(db: DatabaseName, tableName: TableName, topic: Topic, offsetReader: HiveSourceOffsetStorageReader, config: HiveSourceConfig) (implicit client: IMetaStoreClient, fs: FileSystem) extends Iterator[SourceRecord] { val tableConfig = config.tableOptions.filter(_.tableName == tableName).find(_.topic == topic) .getOrElse(sys.error(s"Cannot find table configuration for ${db.value}.${tableName.value} => ${topic.value}")) private val table = client.getTable(db.value, tableName.value) private val format = HiveFormat(hive.serde(table)) private val metastoreSchema = HiveSchemas.toKafka(table) private val parts = TableFileScanner.scan(db, tableName) private val readers = parts.map { case (path, partition) => val fns: Seq[Struct => Struct] = Seq( partition.map(new PartitionValueMapper(_).map _), tableConfig.projection.map(new ProjectionMapper(_).map _) ).flatten val mapper: Struct => Struct = Function.chain(fns) val sourceOffset = offsetReader.offset(SourcePartition(db, tableName, topic, path)).getOrElse(SourceOffset(0)) new HiveReader { lazy val reader = format.reader(path, sourceOffset.rowNumber, metastoreSchema) override def iterator: Iterator[Record] = reader.iterator.map { record => Record(mapper(record.struct), record.path, record.offset) } override def close(): Unit = reader.close() } } private val iterator: Iterator[Record] = readers.map(_.iterator).reduce(_ ++ _).take(tableConfig.limit) override def hasNext: Boolean = iterator.hasNext override def next(): SourceRecord = { val record = iterator.next val sourcePartition = SourcePartition(db, tableName, topic, record.path) val offset = SourceOffset(record.offset) new SourceRecord( fromSourcePartition(sourcePartition).asJava, fromSourceOffset(offset).asJava, topic.value, record.struct.schema, record.struct ) } def close(): Unit = { readers.foreach(_.close()) } }
Example 187
Source File: package.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter} package object parquet { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = { if (fs.isDirectory(path)) { logger.debug(s"$path is a directory, reading constituent files") val remote = fs.listFiles(path, false) new Iterator[Path] { override def hasNext: Boolean = remote.hasNext override def next(): Path = remote.next().getPath }.toList } else { logger.debug(s"Reading $path as a single file") List(path) } } def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = { ParquetReader.builder(new StructReadSupport, file) .withConf(fs.getConf) .build() } def parquetWriter(path: Path, schema: Schema, config: ParquetSinkConfig): ParquetWriter[Struct] = { new StructParquetWriterBuilder(path, schema) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(config.enableDictionary) .withValidation(config.validation) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withWriteMode(if (config.overwrite) { ParquetFileWriter.Mode.OVERWRITE } else { ParquetFileWriter.Mode.CREATE }).build() } }
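The listFiles helper is handy on its own for expanding a table directory before handing the files to parquetReader. A short sketch against the local filesystem; the directory name is hypothetical:

import com.landoop.streamreactor.connect.hive.parquet.listFiles
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

implicit val fs = FileSystem.getLocal(new Configuration())

// a directory yields its constituent files, a plain file yields itself
val files = listFiles(new Path("/warehouse/mydb.db/mytable"))
files.foreach(println)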
Example 188
Source File: StageManagerTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartition, TopicPartitionOffset} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StageManagerTest extends AnyWordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(conf) val dir = new Path("stageman") fs.mkdirs(dir) val manager = new StageManager(DefaultFilenamePolicy) "StageManager" should { "stage file as hidden" in { val stagePath = manager.stage(dir, TopicPartition(Topic("mytopic"), 1)) stagePath.getName.startsWith(".") shouldBe true } "delete existing file" in { val stagePath = manager.stage(dir, TopicPartition(Topic("mytopic"), 1)) fs.create(stagePath) manager.stage(dir, TopicPartition(Topic("mytopic"), 1)) fs.exists(stagePath) shouldBe false } "commit file using offset" in { val stagePath = manager.stage(dir, TopicPartition(Topic("mytopic"), 1)) fs.create(stagePath) val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100)) val finalPath = manager.commit(stagePath, tpo) finalPath.getName shouldBe "streamreactor_mytopic_1_100" } } }
Example 189
Source File: DefaultCommitPolicyTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec import scala.concurrent.duration._ class DefaultCommitPolicyTest extends AnyWordSpec with Matchers { val schema: Schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .build() val struct = new Struct(schema) implicit val conf: Configuration = new Configuration() implicit val fs: LocalFileSystem = FileSystem.getLocal(conf) val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100)) private def shouldFlush(policy: CommitPolicy, path: Path, count: Long) = { val status = fs.getFileStatus(path) policy.shouldFlush(CommitContext(tpo, path, count, status.getLen, status.getModificationTime)) } "DefaultCommitPolicy" should { "roll over after interval" in { val policy = DefaultCommitPolicy(None, Option(2.seconds), None) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 10) shouldBe false Thread.sleep(2000) shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file count" in { val policy = DefaultCommitPolicy(None, None, Some(9)) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 7) shouldBe false shouldFlush(policy, path, 8) shouldBe false shouldFlush(policy, path, 9) shouldBe true shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file size" in { val policy = DefaultCommitPolicy(Some(10), None, None) val path = new Path("foo") val out = fs.create(path) shouldFlush(policy, path, 7) shouldBe false out.writeBytes("wibble wobble wabble wubble") out.close() shouldFlush(policy, path, 9) shouldBe true fs.delete(path, false) } } }
Example 190
Source File: ParquetWriterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive.StructUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class ParquetWriterTest extends AnyWordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(conf) "ParquetWriter" should { "write parquet files" in { val schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .field("title", SchemaBuilder.string().optional().build()) .field("salary", SchemaBuilder.float64().optional().build()) .build() val users = List( new Struct(schema).put("name", "sam").put("title", "mr").put("salary", 100.43), new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06) ) val path = new Path("sinktest.parquet") val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) users.foreach(writer.write) writer.close() val reader = parquetReader(path) val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList reader.close() actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues) fs.delete(path, false) } "support writing nulls" in { val schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .field("title", SchemaBuilder.string().optional().build()) .field("salary", SchemaBuilder.float64().optional().build()) .build() val users = List( new Struct(schema).put("name", "sam").put("title", null).put("salary", 100.43), new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06) ) val path = new Path("sinktest.parquet") val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) users.foreach(writer.write) writer.close() val reader = parquetReader(path) val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList reader.close() actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues) fs.delete(path, false) } } }
Example 191
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats import com.landoop.streamreactor.connect.hive.Serde import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.ParquetWriter import scala.util.Try object ParquetHiveFormat extends HiveFormat { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def serde = Serde( "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", Map("serialization.format" -> "1") ) override def writer(path: Path, schema: Schema) (implicit fs: FileSystem): HiveWriter = new HiveWriter { logger.debug(s"Creating parquet writer at $path") val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx"))) val createdTimestamp: Long = System.currentTimeMillis() var lastKnownFileSize:Long = fs.getFileStatus(path).getLen var readFileSize = false var count = 0 override def write(struct: Struct): Long = { writer.write(struct) count = count + 1 readFileSize = true count } override def close(): Unit = { logger.debug(s"Closing writer at path $path") writer.close() } override def currentCount: Long = count override def file: Path = path override def createdTime: Long = createdTimestamp override def fileSize: Long = { if (readFileSize) { lastKnownFileSize = fs.getFileStatus(path).getLen readFileSize = false } lastKnownFileSize } } override def reader(path: Path, startAt: Int, schema: Schema) (implicit fs: FileSystem): HiveReader = new HiveReader { logger.debug(s"Creating parquet reader for $path with offset $startAt") val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path) var offset = startAt override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct => val record = Record(struct, path, offset) offset = offset + 1 record } override def close(): Unit = reader.close() } }
Example 192
Source File: OrcHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig, Serde} import com.landoop.streamreactor.connect.hive.orc.OrcSink import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kafka.connect.data.{Schema, Struct} import scala.util.Try object OrcHiveFormat extends HiveFormat { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def serde = Serde( "org.apache.hadoop.hive.ql.io.orc.OrcSerde", "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", Map("org.apache.hadoop.hive.ql.io.orc.OrcSerde" -> "1") ) override def writer(path: Path, schema: Schema) (implicit fs: FileSystem): HiveWriter = new HiveWriter { logger.debug(s"Creating orc writer at $path") val sink: OrcSink = com.landoop.streamreactor.connect.hive.orc.sink(path, schema, OrcSinkConfig(overwrite = true)) Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx"))) val cretedTimestamp: Long = System.currentTimeMillis() var lastKnownFileSize:Long = fs.getFileStatus(path).getLen var readFileSize = false var count = 0 override def write(struct: Struct): Long = { sink.write(struct) count = count + 1 readFileSize = true count } override def close(): Unit = { logger.debug(s"Closing orc writer at path $path") sink.close() } override def file: Path = path override def currentCount: Long = count override def createdTime: Long = cretedTimestamp override def fileSize: Long = { if (readFileSize) { lastKnownFileSize = fs.getFileStatus(path).getLen readFileSize = false } lastKnownFileSize } } override def reader(path: Path, startAt: Int, schema: Schema) (implicit fs: FileSystem): HiveReader = new HiveReader { logger.debug(s"Creating orc reader for $path with offset $startAt") val reader = com.landoop.streamreactor.connect.hive.orc.source(path, OrcSourceConfig()) var offset = startAt override def iterator: Iterator[Record] = reader.iterator.map { struct => val record = Record(struct, path, offset) offset = offset + 1 record } override def close(): Unit = reader.close() } }
Example 193
Source File: OrcSource.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.orc import com.landoop.streamreactor.connect.hive.OrcSourceConfig import com.landoop.streamreactor.connect.hive.orc.vectors.OrcVectorReader.fromSchema import com.landoop.streamreactor.connect.hive.orc.vectors.StructVectorReader import com.typesafe.scalalogging.StrictLogging import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.exec.vector.{StructColumnVector, VectorizedRowBatch} import org.apache.kafka.connect.data.Struct import org.apache.orc.OrcFile.ReaderOptions import org.apache.orc.{OrcFile, Reader} import scala.collection.JavaConverters._ class OrcSource(path: Path, config: OrcSourceConfig)(implicit fs: FileSystem) extends StrictLogging { private val reader = OrcFile.createReader(path, new ReaderOptions(fs.getConf)) private val typeDescription = reader.getSchema private val schema = OrcSchemas.toKafka(typeDescription) private val readers = typeDescription.getChildren.asScala.map(fromSchema) private val vectorReader = new StructVectorReader(readers.toIndexedSeq, typeDescription) private val batch = typeDescription.createRowBatch() private val recordReader = reader.rows(new Reader.Options()) def close(): Unit = { recordReader.close() } def iterator: Iterator[Struct] = new Iterator[Struct] { var iter = new BatchIterator(batch) override def hasNext: Boolean = iter.hasNext || { batch.reset() recordReader.nextBatch(batch) iter = new BatchIterator(batch) !batch.endOfFile && batch.size > 0 && iter.hasNext } override def next(): Struct = iter.next() } // iterates over a batch, be careful not to mutate the batch while it is being iterated class BatchIterator(batch: VectorizedRowBatch) extends Iterator[Struct] { var offset = 0 val vector = new StructColumnVector(batch.numCols, batch.cols: _*) override def hasNext: Boolean = offset < batch.size override def next(): Struct = { val struct = vectorReader.read(offset, vector) offset = offset + 1 struct.orNull } } }
Example 194
Source File: OrcSink.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.orc import com.landoop.streamreactor.connect.hive.orc.vectors.{OrcVectorWriter, StructVectorWriter} import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, StructUtils} import com.typesafe.scalalogging.StrictLogging import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector import org.apache.kafka.connect.data.{Schema, Struct} import scala.collection.JavaConverters._ class OrcSink(path: Path, schema: Schema, config: OrcSinkConfig)(implicit fs: FileSystem) extends StrictLogging { private val typeDescription = OrcSchemas.toOrc(schema) private val structWriter = new StructVectorWriter(typeDescription.getChildren.asScala.map(OrcVectorWriter.fromSchema)) private val batch = typeDescription.createRowBatch(config.batchSize) private val vector = new StructColumnVector(batch.numCols, batch.cols: _*) private val orcWriter = createOrcWriter(path, typeDescription, config) private var n = 0 def flush(): Unit = { logger.debug(s"Writing orc batch [size=$n, path=$path]") batch.size = n orcWriter.addRowBatch(batch) orcWriter.writeIntermediateFooter batch.reset() n = 0 } def write(struct: Struct): Unit = { structWriter.write(vector, n, Some(StructUtils.extractValues(struct))) n = n + 1 if (n == config.batchSize) flush() } def close(): Unit = { if (n > 0) flush() orcWriter.close() } }
Example 195
Source File: package.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.Schema import org.apache.orc.OrcFile.EncodingStrategy import org.apache.orc._ package object orc { def createOrcWriter(path: Path, schema: TypeDescription, config: OrcSinkConfig) (implicit fs: FileSystem): Writer = { val options = OrcFile.writerOptions(null, fs.getConf).setSchema(schema) options.compress(config.compressionKind) options.encodingStrategy(config.encodingStrategy) options.blockPadding(config.blockPadding) options.version(OrcFile.Version.V_0_12) config.bloomFilterColumns.map(_.mkString(",")).foreach(options.bloomFilterColumns) config.rowIndexStride.foreach(options.rowIndexStride) config.blockSize.foreach(options.blockSize) config.stripeSize.foreach(options.stripeSize) if (config.overwrite && fs.exists(path)) fs.delete(path, false) OrcFile.createWriter(path, options) } def source(path: Path, config: OrcSourceConfig) (implicit fs: FileSystem) = new OrcSource(path, config) def sink(path: Path, schema: Schema, config: OrcSinkConfig) (implicit fs: FileSystem) = new OrcSink(path, schema, config) } case class OrcSourceConfig() case class OrcSinkConfig(overwrite: Boolean = false, batchSize: Int = 1024, // orc default is 1024 encodingStrategy: EncodingStrategy = EncodingStrategy.COMPRESSION, compressionKind: CompressionKind = CompressionKind.SNAPPY, blockPadding: Boolean = true, blockSize: Option[Long] = None, stripeSize: Option[Long] = None, bloomFilterColumns: Seq[String] = Nil, rowIndexStride: Option[Int] = None)
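Put together, the sink and source factories above give a small write-then-read round trip. A sketch against the local filesystem; the schema, record and output path are illustrative:

import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig}
import com.landoop.streamreactor.connect.hive.orc.{sink, source}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}

implicit val fs = FileSystem.getLocal(new Configuration())

val schema = SchemaBuilder.struct()
  .field("name", SchemaBuilder.string().required().build())
  .build()

val path    = new Path("people.orc")                         // hypothetical output file
val orcSink = sink(path, schema, OrcSinkConfig(overwrite = true))
orcSink.write(new Struct(schema).put("name", "sam"))
orcSink.close()

val orcSource = source(path, OrcSourceConfig())
orcSource.iterator.foreach(struct => println(struct.get("name")))
orcSource.close()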