org.apache.hadoop.conf.Configuration Scala Examples
The following examples show how to use org.apache.hadoop.conf.Configuration in Scala. Each snippet is taken from an open-source project; the originating project and license are noted above each example.
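Before the project examples, here is a minimal, self-contained sketch of the core Configuration API that all of the snippets below build on: creating a Configuration, loading a site file, and reading typed properties. The file path and property values are placeholders rather than settings taken from any of the projects.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object ConfigurationBasics {
  def main(args: Array[String]): Unit = {
    // Loads core-default.xml/core-site.xml from the classpath; new Configuration(false) would skip that.
    val conf = new Configuration()

    // Optionally add an explicit site file (placeholder path).
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"))

    // Set and read properties; the second argument of each getter is the default value.
    conf.set("fs.defaultFS", "hdfs://localhost:9000")
    val fsUri = conf.get("fs.defaultFS", "file:///")
    val compress = conf.getBoolean("mapreduce.output.fileoutputformat.compress", false)

    // Most Hadoop clients are constructed from a Configuration, e.g. a FileSystem handle.
    val fs = FileSystem.get(conf)
    println(s"fs.defaultFS = $fsUri, compress = $compress, scheme = ${fs.getUri.getScheme}")
  }
}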
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
Example 2
Source File: DirectOutputCommitter.scala From spark-snowflake with Apache License 2.0

package net.snowflake.spark.snowflake

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred._
import org.apache.hadoop.mapreduce.lib.output.{
  FileOutputCommitter,
  FileOutputFormat
}

class DirectOutputCommitter extends OutputCommitter {
  override def setupJob(jobContext: JobContext): Unit = {}

  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = {
    // We return true here to guard against implementations that do not handle false correctly.
    // The meaning of returning false is not entirely clear, so it's possible to be interpreted
    // as an error. Returning true just means that commitTask() will be called, which is a no-op.
    true
  }

  override def commitTask(taskContext: TaskAttemptContext): Unit = {}

  override def abortTask(taskContext: TaskAttemptContext): Unit = {}

  private def shouldCreateSuccessFile(conf: Configuration): Boolean = {
    conf.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)
  }
}
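How a committer like this gets wired in is not shown in the snippet above. The sketch below is one plausible way to do it for the old mapred API, by pointing the Hadoop property mapred.output.committer.class at the class; treat both the property name and the Spark wiring as assumptions to verify against the connector's documentation.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("direct-committer-demo").getOrCreate()

// Assumption: jobs using the old mapred API read the committer class from this property.
spark.sparkContext.hadoopConfiguration.set(
  "mapred.output.committer.class",
  "net.snowflake.spark.snowflake.DirectOutputCommitter")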
Example 3
Source File: OrcFileOperator.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.hive.orc

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader}
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.types.StructType

private[orc] object OrcFileOperator extends Logging {

  def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = {
    def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = {
      reader.getObjectInspector match {
        case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 =>
          logInfo(
            s"ORC file $path has empty schema, it probably contains no rows. " +
              "Trying to read another ORC file to figure out the schema.")
          false
        case _ => true
      }
    }

    val conf = config.getOrElse(new Configuration)
    val fs = {
      val hdfsPath = new Path(basePath)
      hdfsPath.getFileSystem(conf)
    }

    listOrcFiles(basePath, conf).iterator.map { path =>
      path -> OrcFile.createReader(fs, path)
    }.collectFirst {
      case (path, reader) if isWithNonEmptySchema(path, reader) => reader
    }
  }

  def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = {
    // Take the first file where we can open a valid reader if we can find one. Otherwise just
    // return None to indicate we can't infer the schema.
    paths.flatMap(getFileReader(_, conf)).headOption.map { reader =>
      val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
      val schema = readerInspector.getTypeName
      logDebug(s"Reading schema from file $paths, got Hive schema string: $schema")
      CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType]
    }
  }

  def getObjectInspector(
      path: String,
      conf: Option[Configuration]): Option[StructObjectInspector] = {
    getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector])
  }

  def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = {
    // TODO: Check if the paths coming in are already qualified and simplify.
    val origPath = new Path(pathStr)
    val fs = origPath.getFileSystem(conf)
    val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath)
      .filterNot(_.isDirectory)
      .map(_.getPath)
      .filterNot(_.getName.startsWith("_"))
      .filterNot(_.getName.startsWith("."))
    paths
  }
}
Example 4
Source File: HDFSCredentialProvider.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).map { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val t = creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .head val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 5
Source File: HBase.scala From AI with Apache License 2.0

package com.bigchange.hbase

import com.bigchange.util.HBaseUtil._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Result, _}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.spark.SparkContext

  def existRowKey(row: String, table: Table): Boolean = {
    val get = new Get(row.getBytes())
    val result = table.get(get)
    if (result.isEmpty) {
      warn("hbase table don't have this data,execute insert")
      return false
    }
    true
  }

  def getConfiguration = if (hBaseConfiguration == null) {
    warn("hbase setDefaultConfiguration....")
    setDefaultConfiguration
  } else hBaseConfiguration

  def setDefaultConfiguration = {
    hBaseConfiguration = HBaseConfiguration.create
    // Options that must be set for local testing; on a cluster they are picked up
    // automatically from the corresponding configuration files.
    hBaseConfiguration.set("fs.defaultFS", "hdfs://ns1"); // nameservice path
    hBaseConfiguration.set("dfs.nameservices", "ns1");
    hBaseConfiguration.set("dfs.ha.namenodes.ns1", "nn1,nn2"); // namenode names
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn1", "server3:9000"); // namenode RPC address
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn2", "server4:9000"); // namenode RPC address
    // Implementation class for automatic namenode failover
    hBaseConfiguration.set("dfs.client.failover.proxy.provider.ns1",
      "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider")
    hBaseConfiguration.set("hbase.rootdir", "hdfs://ns1/hbase")
    hBaseConfiguration.set("hbase.zookeeper.quorum", "server0,server1,server2")
    hBaseConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
    hBaseConfiguration
  }
}
Example 6
Source File: AvroParquetSourceTest.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet import java.nio.file.Paths import io.eels.component.parquet.avro.AvroParquetSource import io.eels.component.parquet.util.ParquetLogMute import io.eels.schema._ import org.apache.avro.SchemaBuilder import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.avro.AvroParquetWriter import org.scalatest.{Matchers, WordSpec} class AvroParquetSourceTest extends WordSpec with Matchers { ParquetLogMute() private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(conf) private val personFile = Paths.get(getClass.getResource("/io/eels/component/parquet/person.avro.pq").toURI) private val resourcesDir = personFile.getParent "AvroParquetSource" should { "read schema" in { val people = AvroParquetSource(personFile) people.schema shouldBe StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) } "read parquet files" in { val people = AvroParquetSource(personFile.toAbsolutePath()).toDataStream().toSet.map(_.values) people shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) } "read multiple parquet files using file expansion" in { import io.eels.FilePattern._ val people = AvroParquetSource(s"${resourcesDir.toUri.toString}/*.pq").toDataStream().toSet.map(_.values) people shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner"), Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) } // todo add merge to parquet source "merge schemas" ignore { try { fs.delete(new Path("merge1.pq"), false) } catch { case t: Throwable => } try { fs.delete(new Path("merge2.pq"), false) } catch { case t: Throwable => } val schema1 = SchemaBuilder.builder().record("schema1").fields().requiredString("a").requiredDouble("b").endRecord() val schema2 = SchemaBuilder.builder().record("schema2").fields().requiredInt("a").requiredBoolean("c").endRecord() val writer1 = AvroParquetWriter.builder[GenericRecord](new Path("merge1.pq")).withSchema(schema1).build() val record1 = new GenericData.Record(schema1) record1.put("a", "aaaaa") record1.put("b", 124.3) writer1.write(record1) writer1.close() val writer2 = AvroParquetWriter.builder[GenericRecord](new Path("merge2.pq")).withSchema(schema2).build() val record2 = new GenericData.Record(schema2) record2.put("a", 111) record2.put("c", true) writer2.write(record2) writer2.close() ParquetSource(new Path("merge*")).schema shouldBe StructType( Field("a", StringType, nullable = false), Field("b", DoubleType, nullable = false), Field("c", BooleanType, nullable = false) ) fs.delete(new Path(".merge1.pq.crc"), false) fs.delete(new Path(".merge2.pq.crc"), false) fs.delete(new Path("merge1.pq"), false) fs.delete(new Path("merge2.pq"), false) } } }
Example 7
Source File: CompressionCodecs.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.catalyst.util

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.SequenceFile.CompressionType
import org.apache.hadoop.io.compress._

import org.apache.spark.util.Utils

object CompressionCodecs {
  private val shortCompressionCodecNames = Map(
    "none" -> null,
    "uncompressed" -> null,
    "bzip2" -> classOf[BZip2Codec].getName,
    "deflate" -> classOf[DeflateCodec].getName,
    "gzip" -> classOf[GzipCodec].getName,
    "lz4" -> classOf[Lz4Codec].getName,
    "snappy" -> classOf[SnappyCodec].getName)

  def setCodecConfiguration(conf: Configuration, codec: String): Unit = {
    if (codec != null) {
      conf.set("mapreduce.output.fileoutputformat.compress", "true")
      conf.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.toString)
      conf.set("mapreduce.output.fileoutputformat.compress.codec", codec)
      conf.set("mapreduce.map.output.compress", "true")
      conf.set("mapreduce.map.output.compress.codec", codec)
    } else {
      // This infers the option `compression` is set to `uncompressed` or `none`.
      conf.set("mapreduce.output.fileoutputformat.compress", "false")
      conf.set("mapreduce.map.output.compress", "false")
    }
  }
}
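A small usage sketch, assuming the caller already holds a fully qualified codec class name (the map above resolves short names such as "gzip" to exactly that): the helper simply flips the standard MapReduce compression properties on the given Configuration.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.GzipCodec

val hadoopConf = new Configuration()
CompressionCodecs.setCodecConfiguration(hadoopConf, classOf[GzipCodec].getName)

// The job that picks up hadoopConf will now write block-compressed gzip output.
assert(hadoopConf.get("mapreduce.output.fileoutputformat.compress") == "true")
assert(hadoopConf.get("mapreduce.output.fileoutputformat.compress.codec") == classOf[GzipCodec].getName)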
Example 8
Source File: HadoopFileLinesReader.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 9
Source File: HBaseCredentialProvider.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.deploy.yarn.security import scala.reflect.runtime.universe import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging private[security] class HBaseCredentialProvider extends ServiceCredentialProvider with Logging { override def serviceName: String = "hbase" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val obtainToken = mirror.classLoader. loadClass("org.apache.hadoop.hbase.security.token.TokenUtil"). getMethod("obtainToken", classOf[Configuration]) logDebug("Attempting to fetch HBase security token.") val token = obtainToken.invoke(null, hbaseConf(hadoopConf)) .asInstanceOf[Token[_ <: TokenIdentifier]] logInfo(s"Get token from HBase: ${token.toString}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => logDebug(s"Failed to get token from service $serviceName", e) } None } override def credentialsRequired(hadoopConf: Configuration): Boolean = { hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos" } private def hbaseConf(conf: Configuration): Configuration = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val confCreate = mirror.classLoader. loadClass("org.apache.hadoop.hbase.HBaseConfiguration"). getMethod("create", classOf[Configuration]) confCreate.invoke(null, conf).asInstanceOf[Configuration] } catch { case NonFatal(e) => logDebug("Fail to invoke HBaseConfiguration", e) conf } } }
Example 10
Source File: YarnRMClient.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.deploy.yarn

import java.util.{List => JList}

import scala.collection.JavaConverters._
import scala.util.Try

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.yarn.api.records._
import org.apache.hadoop.yarn.client.api.AMRMClient
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.hadoop.yarn.webapp.util.WebAppUtils

import org.apache.spark.{SecurityManager, SparkConf}
import org.apache.spark.deploy.yarn.config._
import org.apache.spark.internal.Logging
import org.apache.spark.rpc.RpcEndpointRef
import org.apache.spark.util.Utils

  def getMaxRegAttempts(sparkConf: SparkConf, yarnConf: YarnConfiguration): Int = {
    val sparkMaxAttempts = sparkConf.get(MAX_APP_ATTEMPTS).map(_.toInt)
    val yarnMaxAttempts = yarnConf.getInt(
      YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)
    val retval: Int = sparkMaxAttempts match {
      case Some(x) => if (x <= yarnMaxAttempts) x else yarnMaxAttempts
      case None => yarnMaxAttempts
    }

    retval
  }
}
Example 11
Source File: HDFSCredentialProviderSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.deploy.yarn.security import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, PrivateMethodTester} import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} class HDFSCredentialProviderSuite extends SparkFunSuite with PrivateMethodTester with Matchers { private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer) private def getTokenRenewer( hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = { hdfsCredentialProvider invokePrivate _getTokenRenewer(conf) } private var hdfsCredentialProvider: HDFSCredentialProvider = null override def beforeAll() { super.beforeAll() if (hdfsCredentialProvider == null) { hdfsCredentialProvider = new HDFSCredentialProvider() } } override def afterAll() { if (hdfsCredentialProvider != null) { hdfsCredentialProvider = null } super.afterAll() } test("check token renewer") { val hadoopConf = new Configuration() hadoopConf.set("yarn.resourcemanager.address", "myrm:8033") hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:[email protected]") val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf) renewer should be ("yarn/myrm:[email protected]") } test("check token renewer default") { val hadoopConf = new Configuration() val caught = intercept[SparkException] { getTokenRenewer(hdfsCredentialProvider, hadoopConf) } assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer") } }
Example 12
Source File: FileBasedWriteAheadLogRandomReader.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.streaming.util

import java.io.Closeable
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration

private[streaming] class FileBasedWriteAheadLogRandomReader(path: String, conf: Configuration)
  extends Closeable {

  private val instream = HdfsUtils.getInputStream(path, conf)
  private var closed = (instream == null) // the file may be deleted as we're opening the stream

  def read(segment: FileBasedWriteAheadLogSegment): ByteBuffer = synchronized {
    assertOpen()
    instream.seek(segment.offset)
    val nextLength = instream.readInt()
    HdfsUtils.checkState(nextLength == segment.length,
      s"Expected message length to be ${segment.length}, but was $nextLength")
    val buffer = new Array[Byte](nextLength)
    instream.readFully(buffer)
    ByteBuffer.wrap(buffer)
  }

  override def close(): Unit = synchronized {
    closed = true
    instream.close()
  }

  private def assertOpen() {
    HdfsUtils.checkState(!closed, "Stream is closed. Create a new Reader to read from the file.")
  }
}
Example 13
Source File: HdfsUtils.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.streaming.util

import java.io.{FileNotFoundException, IOException}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._

private[streaming] object HdfsUtils {

  def getOutputStream(path: String, conf: Configuration): FSDataOutputStream = {
    val dfsPath = new Path(path)
    val dfs = getFileSystemForPath(dfsPath, conf)
    // If the file exists and we have append support, append instead of creating a new file
    val stream: FSDataOutputStream = {
      if (dfs.isFile(dfsPath)) {
        if (conf.getBoolean("hdfs.append.support", false) || dfs.isInstanceOf[RawLocalFileSystem]) {
          dfs.append(dfsPath)
        } else {
          throw new IllegalStateException("File exists and there is no append support!")
        }
      } else {
        dfs.create(dfsPath)
      }
    }
    stream
  }

  def getInputStream(path: String, conf: Configuration): FSDataInputStream = {
    val dfsPath = new Path(path)
    val dfs = getFileSystemForPath(dfsPath, conf)
    try {
      dfs.open(dfsPath)
    } catch {
      case _: FileNotFoundException =>
        null
      case e: IOException =>
        // If we are really unlucky, the file may be deleted as we're opening the stream.
        // This can happen as clean up is performed by daemon threads that may be left over from
        // previous runs.
        if (!dfs.isFile(dfsPath)) null else throw e
    }
  }

  def checkState(state: Boolean, errorMsg: => String) {
    if (!state) {
      throw new IllegalStateException(errorMsg)
    }
  }

  def checkFileExists(path: String, conf: Configuration): Boolean = {
    val hdpPath = new Path(path)
    val fs = getFileSystemForPath(hdpPath, conf)
    fs.isFile(hdpPath)
  }
}
Example 14
Source File: FileBasedWriteAheadLogWriter.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.streaming.util

import java.io._
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration

import org.apache.spark.util.Utils

  def write(data: ByteBuffer): FileBasedWriteAheadLogSegment = synchronized {
    assertOpen()
    data.rewind() // Rewind to ensure all data in the buffer is retrieved
    val lengthToWrite = data.remaining()
    val segment = new FileBasedWriteAheadLogSegment(path, nextOffset, lengthToWrite)
    stream.writeInt(lengthToWrite)
    Utils.writeByteBuffer(data, stream: OutputStream)
    flush()
    nextOffset = stream.getPos()
    segment
  }

  override def close(): Unit = synchronized {
    closed = true
    stream.close()
  }

  private def flush() {
    stream.hflush()
    // Useful for local file system where hflush/sync does not work (HADOOP-7844)
    stream.getWrappedStream.flush()
  }

  private def assertOpen() {
    HdfsUtils.checkState(!closed, "Stream is closed. Create a new Writer to write to file.")
  }
}
Example 15
Source File: FileBasedWriteAheadLogReader.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.streaming.util import java.io.{Closeable, EOFException, IOException} import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.internal.Logging private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration) extends Iterator[ByteBuffer] with Closeable with Logging { private val instream = HdfsUtils.getInputStream(path, conf) private var closed = (instream == null) // the file may be deleted as we're opening the stream private var nextItem: Option[ByteBuffer] = None override def hasNext: Boolean = synchronized { if (closed) { return false } if (nextItem.isDefined) { // handle the case where hasNext is called without calling next true } else { try { val length = instream.readInt() val buffer = new Array[Byte](length) instream.readFully(buffer) nextItem = Some(ByteBuffer.wrap(buffer)) logTrace("Read next item " + nextItem.get) true } catch { case e: EOFException => logDebug("Error reading next item, EOF reached", e) close() false case e: IOException => logWarning("Error while trying to read data. If the file was deleted, " + "this should be okay.", e) close() if (HdfsUtils.checkFileExists(path, conf)) { // If file exists, this could be a legitimate error throw e } else { // File was deleted. This can occur when the daemon cleanup thread takes time to // delete the file during recovery. false } case e: Exception => logWarning("Error while trying to read data from HDFS.", e) close() throw e } } } override def next(): ByteBuffer = synchronized { val data = nextItem.getOrElse { close() throw new IllegalStateException( "next called without calling hasNext or after hasNext returned false") } nextItem = None // Ensure the next hasNext call loads new data. data } override def close(): Unit = synchronized { if (!closed) { instream.close() } closed = true } }
Example 16
Source File: SerializableWritable.scala From drizzle-spark with Apache License 2.0

package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {

  def value: T = t

  override def toString: String = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration(false))
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
}
Example 17
Source File: SerializableConfiguration.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration

private[spark] class SerializableConfiguration(@transient var value: Configuration)
  extends Serializable {

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }
}
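A sketch of the usual pattern around this wrapper: the driver wraps its Hadoop configuration, broadcasts it, and executors rebuild FileSystem handles from broadcastConf.value.value. Note that the class above is private[spark], so code outside Spark's own packages would need its own copy of such a wrapper; the example below is illustrative only.

import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.SparkSession
import org.apache.spark.util.SerializableConfiguration

val spark = SparkSession.builder().appName("conf-broadcast").getOrCreate()

// Wrap the driver-side Hadoop configuration once and broadcast it to the executors.
val wrapped = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
val broadcastConf = spark.sparkContext.broadcast(wrapped)

spark.sparkContext.parallelize(1 to 4).foreachPartition { _ =>
  // Recreate a FileSystem handle on the executor from the shipped configuration.
  val fs = FileSystem.get(broadcastConf.value.value)
  // ... read or write files with fs ...
}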
Example 18
Source File: PortableDataStream.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.input

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

import scala.collection.JavaConverters._

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit}

  def toArray(): Array[Byte] = {
    val stream = open()
    try {
      ByteStreams.toByteArray(stream)
    } finally {
      Closeables.close(stream, true)
    }
  }

  def getPath(): String = path
}
Example 19
Source File: WholeTextFileRDD.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 20
Source File: BinaryFileRDD.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.StreamFileInputFormat

private[spark] class BinaryFileRDD[T](
    sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 21
Source File: SentenceTokenizer.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.dataset.text import java.io.FileInputStream import java.net.{URI, URL} import com.intel.analytics.bigdl.dataset.Transformer import scala.collection.Iterator import opennlp.tools.tokenize.{SimpleTokenizer, Tokenizer, TokenizerME, TokenizerModel} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} class SentenceTokenizer(tokenFile: Option[String] = None) extends Transformer[String, Array[String]] { var modelIn: FileInputStream = _ var model: TokenizerModel = _ var tokenizer: Tokenizer = _ def this(tokenFile: URL) { this(Some(tokenFile.getPath)) } def close(): Unit = { if (modelIn != null) { modelIn.close() } } override def apply(prev: Iterator[String]): Iterator[Array[String]] = prev.map(x => { if (tokenizer == null) { if (!tokenFile.isDefined) { tokenizer = SimpleTokenizer.INSTANCE } else { val src: Path = new Path(tokenFile.get) val fs = src.getFileSystem(new Configuration()) val in = fs.open(src) model = new TokenizerModel(in) tokenizer = new TokenizerME(model) } } val words = tokenizer.tokenize(x) words }) } object SentenceTokenizer { def apply(tokenFile: Option[String] = None): SentenceTokenizer = new SentenceTokenizer(tokenFile) def apply(tokenFile: URL): SentenceTokenizer = new SentenceTokenizer(tokenFile) }
Example 22
Source File: SentenceSplitter.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.dataset.text import java.io.FileInputStream import java.net.{URI, URL} import com.intel.analytics.bigdl.dataset.Transformer import opennlp.tools.sentdetect.{SentenceDetector, SentenceDetectorME, SentenceModel} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.collection.Iterator class SentenceSplitter(sentFile: Option[String] = None) extends Transformer[String, Array[String]] { var modelIn: FileInputStream = _ var model: SentenceModel = _ var sentenceDetector: SentenceDetector = _ def this(sentFileURL: URL) { this(Some(sentFileURL.getPath)) } def this(sentFile: String) { this(Some(sentFile)) } def close(): Unit = { if (modelIn != null) { modelIn.close() } } override def apply(prev: Iterator[String]): Iterator[Array[String]] = prev.map(x => { if (!sentFile.isDefined) { x.split('.') } else { if (sentenceDetector == null) { val src: Path = new Path(sentFile.get) val fs = src.getFileSystem(new Configuration()) val in = fs.open(src) model = new SentenceModel(in) sentenceDetector = new SentenceDetectorME(model) } sentenceDetector.sentDetect(x) } }) } object SentenceSplitter { def apply(sentFile: Option[String] = None): SentenceSplitter = new SentenceSplitter(sentFile) def apply(sentFileURL: URL): SentenceSplitter = new SentenceSplitter(sentFileURL) def apply(sentFile: String): SentenceSplitter = new SentenceSplitter(sentFile) }
Example 23
Source File: LocalSeqFileToBytes.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.dataset.image import com.intel.analytics.bigdl.dataset.DataSet.SeqFileFolder import com.intel.analytics.bigdl.dataset.{ByteRecord, LocalSeqFilePath, Transformer} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.SequenceFile.Reader import org.apache.hadoop.io.{SequenceFile, Text} import scala.collection.Iterator object LocalSeqFileToBytes { def apply(): LocalSeqFileToBytes = new LocalSeqFileToBytes() } class LocalSeqFileToBytes extends Transformer[LocalSeqFilePath, ByteRecord] { import org.apache.hadoop.fs.{Path => hPath} @transient private var key: Text = null @transient private var value: Text = null @transient private var reader: SequenceFile.Reader = null @transient private var oneRecordBuffer: ByteRecord = null override def apply(prev: Iterator[LocalSeqFilePath]): Iterator[ByteRecord] = { new Iterator[ByteRecord] { override def next(): ByteRecord = { if (oneRecordBuffer != null) { val res = oneRecordBuffer oneRecordBuffer = null return res } if (key == null) { key = new Text() } if (value == null) { value = new Text } if (reader == null || !reader.next(key, value)) { if (reader != null) { reader.close() } reader = new SequenceFile.Reader(new Configuration, Reader.file(new hPath(prev.next().path.toAbsolutePath.toString))) reader.next(key, value) } ByteRecord(value.copyBytes(), SeqFileFolder.readLabel(key).toFloat) } override def hasNext: Boolean = { if (oneRecordBuffer != null) { true } else if (reader == null) { prev.hasNext } else { if (reader.next(key, value)) { oneRecordBuffer = ByteRecord(value.copyBytes(), SeqFileFolder.readLabel(key).toFloat) return true } else { prev.hasNext } } } } } }
Example 24
Source File: BGRImgToLocalSeqFile.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.dataset.image import java.nio.ByteBuffer import java.nio.file.Path import com.intel.analytics.bigdl.dataset.Transformer import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path => hadoopPath} import org.apache.hadoop.io.{SequenceFile, Text} import scala.collection.Iterator object BGRImgToLocalSeqFile { def apply(blockSize: Int, baseFileName: Path, hasName: Boolean = false): BGRImgToLocalSeqFile = { new BGRImgToLocalSeqFile(blockSize, baseFileName, hasName) } } class BGRImgToLocalSeqFile(blockSize: Int, baseFileName: Path, hasName: Boolean = false) extends Transformer[(LabeledBGRImage, String), String] { private val conf: Configuration = new Configuration private var index = 0 private val preBuffer: ByteBuffer = ByteBuffer.allocate(4 * 2) override def apply(prev: Iterator[(LabeledBGRImage, String)]): Iterator[String] = { new Iterator[String] { override def hasNext: Boolean = prev.hasNext override def next(): String = { val fileName = baseFileName + s"_$index.seq" val path = new hadoopPath(fileName) val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(path), SequenceFile.Writer.keyClass(classOf[Text]), SequenceFile.Writer.valueClass(classOf[Text])) var i = 0 while (i < blockSize && prev.hasNext) { val (image, imageName) = prev.next() preBuffer.putInt(image.width()) preBuffer.putInt(image.height()) val imageByteData = image.convertToByte() val data: Array[Byte] = new Array[Byte](preBuffer.capacity + imageByteData.length) System.arraycopy(preBuffer.array, 0, data, 0, preBuffer.capacity) System.arraycopy(imageByteData, 0, data, preBuffer.capacity, imageByteData.length) preBuffer.clear val imageKey = if (hasName) s"${imageName}\n${image.label().toInt}" else s"${image.label().toInt}" writer.append(new Text(imageKey), new Text(data)) i += 1 } writer.close() index += 1 fileName } } } }
Example 25
Source File: COCOSeqFileGenerator.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.models.utils import com.intel.analytics.bigdl.dataset.segmentation.{COCODataset, COCOSerializeContext} import java.io.File import java.nio.file.{Files, Paths} import java.util.concurrent.atomic.AtomicInteger import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.SequenceFile.Writer import org.apache.hadoop.io.compress.BZip2Codec import org.apache.hadoop.io.{BytesWritable, SequenceFile} import scala.collection.parallel.ForkJoinTaskSupport import scopt.OptionParser object COCOSeqFileGenerator { case class COCOSeqFileGeneratorParams( folder: String = ".", metaPath: String = "instances_val2014.json", output: String = ".", parallel: Int = 1, blockSize: Int = 12800 ) private val parser = new OptionParser[COCOSeqFileGeneratorParams]("BigDL COCO " + "Sequence File Generator") { head("BigDL COCO Sequence File Generator") opt[String]('f', "folder") .text("where you put the COCO image files") .action((x, c) => c.copy(folder = x)) opt[String]('o', "output folder") .text("where you put the generated seq files") .action((x, c) => c.copy(output = x)) opt[Int]('p', "parallel") .text("parallel num") .action((x, c) => c.copy(parallel = x)) opt[Int]('b', "blockSize") .text("block size") .action((x, c) => c.copy(blockSize = x)) opt[String]('m', "metaPath") .text("metadata json file path") .action((x, c) => c.copy(metaPath = x)) } def main(args: Array[String]): Unit = { parser.parse(args, COCOSeqFileGeneratorParams()).foreach { param => println("Loading COCO metadata") val meta = COCODataset.load(param.metaPath, param.folder) println("Metadata loaded") val conf: Configuration = new Configuration val doneCount = new AtomicInteger(0) val tasks = meta.images.filter(img => { val path = img.path val valid = Files.exists(path) && !Files.isDirectory(path) if (!valid) { System.err.print(s"[Warning] The image file ${path.getFileName} does not exist.\n") } valid }).grouped(param.blockSize).zipWithIndex.toArray.par tasks.tasksupport = new ForkJoinTaskSupport( new scala.concurrent.forkjoin.ForkJoinPool(param.parallel)) tasks.foreach { case (imgs, blkId) => val outFile = new Path(param.output, s"coco-seq-$blkId.seq") val key = new BytesWritable val value = new BytesWritable val writer = SequenceFile.createWriter(conf, Writer.file(outFile), Writer.keyClass(key .getClass), Writer.valueClass(value.getClass), Writer.compression(SequenceFile .CompressionType.BLOCK, new BZip2Codec)) val context = new COCOSerializeContext imgs.foreach { img => context.clear() context.dump(img.fileName) img.dumpTo(context) context.dump(COCODataset.MAGIC_NUM) val keyBytes = context.toByteArray key.set(keyBytes, 0, keyBytes.length) val bytes = img.data value.set(bytes, 0, bytes.length) writer.append(key, value) val cnt = doneCount.incrementAndGet() if (cnt % 500 == 0) { System.err.print(s"\r$cnt / ${meta.images.length} = ${cnt.toFloat/meta.images.length}") } } writer.close() } System.err.print("\n") } } }
Example 26
Source File: FileReader.scala From BigDL with Apache License 2.0

package com.intel.analytics.bigdl.visualization.tensorboard

import java.io.{BufferedInputStream}
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.tensorflow.util.Event

import scala.collection.mutable.ArrayBuffer
import scala.util.matching.Regex

private[bigdl] object FileReader {
  val fileNameRegex = """bigdl.tfevents.*""".r

  def readScalar(file: Path, tag: String, fs: FileSystem): Array[(Long, Float, Double)] = {
    require(fs.isFile(file), s"FileReader: ${file} should be a file")
    val bis = new BufferedInputStream(fs.open(file))
    val longBuffer = new Array[Byte](8)
    val crcBuffer = new Array[Byte](4)
    val bf = new ArrayBuffer[(Long, Float, Double)]
    while (bis.read(longBuffer) > 0) {
      val l = ByteBuffer.wrap(longBuffer.reverse).getLong()
      bis.read(crcBuffer)
      // TODO: checksum
      // val crc1 = ByteBuffer.wrap(crcBuffer.reverse).getInt()
      val eventBuffer = new Array[Byte](l.toInt)
      bis.read(eventBuffer)
      val e = Event.parseFrom(eventBuffer)
      if (e.getSummary.getValueCount == 1 &&
        tag.equals(e.getSummary.getValue(0).getTag())) {
        bf.append((e.getStep, e.getSummary.getValue(0).getSimpleValue, e.getWallTime))
      }
      bis.read(crcBuffer)
      // val crc2 = ByteBuffer.wrap(crcBuffer.reverse).getInt()
    }
    bis.close()
    bf.toArray.sortWith(_._1 < _._1)
  }
}
Example 27
Source File: 2-CommonFunctions.scala From Azure-Databricks-NYC-Taxi-Workshop with MIT License
// Databricks notebook source import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.hadoop.conf.Configuration // COMMAND ---------- val prqShrinkageFactor = 0.19 //We found a saving in space of 81% with Parquet // COMMAND ---------- def analyzeTables(databaseAndTable: String) { println("Table: " + databaseAndTable) println("....refresh table") sql("REFRESH TABLE " + databaseAndTable) println("....analyze table") sql("ANALYZE TABLE " + databaseAndTable + " COMPUTE STATISTICS") println("....done") } // COMMAND ---------- def calcOutputFileCountTxtToPrq(srcDataFile: String, targetedFileSizeMB: Int): Int = { val fs = FileSystem.get(new Configuration()) val estFileCount: Int = Math.floor((fs.getContentSummary(new Path(srcDataFile)).getLength * prqShrinkageFactor) / (targetedFileSizeMB * 1024 * 1024)).toInt if(estFileCount == 0) 1 else estFileCount } // COMMAND ---------- // Get recursive file collection you can iterate on def getRecursiveFileCollection(directoryPath: String): Seq[String] = dbutils.fs.ls(directoryPath).map(directoryItem => { // Work around double encoding bug val directoryItemPath = directoryItem.path.replace("%25", "%").replace("%25", "%") if (directoryItem.isDir) getRecursiveFileCollection(directoryItemPath) else Seq[String](directoryItemPath) }).reduce(_ ++ _) // COMMAND ---------- //Delete residual files from job operation (_SUCCESS, _start*, _committed*) def recursivelyDeleteSparkJobFlagFiles(directoryPath: String) { getRecursiveFileCollection(directoryPath).foreach(directoryItemPath => { if (directoryItemPath.indexOf("parquet") == -1) { println("Deleting...." + directoryItemPath) dbutils.fs.rm(directoryItemPath) }}) } // COMMAND ---------- dbutils.notebook.exit("Pass")
Example 28
Source File: PostUrl.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.http import java.io.{BufferedReader, InputStreamReader} import java.net.URI import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.commons.httpclient.HttpClient import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.http.client.methods.HttpPost import org.apache.http.entity.StringEntity import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.sql.SparkSession class PostUrl extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) override val description: String = "Send a post request to the specified http" var url : String= _ var jsonPath : String = _ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val spark = pec.get[SparkSession]() //read json from hdfs val conf = new Configuration() val fs = FileSystem.get(URI.create(jsonPath),conf) val stream: FSDataInputStream = fs.open(new Path(jsonPath)) val bufferReader = new BufferedReader(new InputStreamReader(stream)) var lineTxt = bufferReader.readLine() val buffer = new StringBuffer() while (lineTxt != null ){ buffer.append(lineTxt.mkString) lineTxt=bufferReader.readLine() } // post val client = HttpClients.createDefault() val httpClient = new HttpClient() httpClient.getParams().setContentCharset("utf-8") val post = new HttpPost(url) post.addHeader("content-Type","application/json") post.setEntity(new StringEntity(buffer.toString)) val response = client.execute(post) val entity = response.getEntity val str = EntityUtils.toString(entity,"UTF-8") println("Code is " + str) } override def setProperties(map: Map[String, Any]): Unit = { url = MapUtil.get(map,key="url").asInstanceOf[String] jsonPath = MapUtil.get(map,key="jsonPath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val url = new PropertyDescriptor() .name("url") .displayName("Url") .defaultValue("") .description("http request address") .required(true) .example("http://master:8002/flow/start") val jsonPath = new PropertyDescriptor() .name("jsonPath") .displayName("JsonPath") .defaultValue("") .description("json parameter path for post request") .required(true) .example("hdfs://master:9000/work/flow.json") descriptor = url :: descriptor descriptor = jsonPath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/http/PostUrl.png") } override def getGroup(): List[String] = { List(StopGroup.HttpGroup.toString) } override def initialize(ctx: ProcessContext): Unit = { } }
Example 29
Source File: Pathway.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism import java.io.{BufferedReader, InputStreamReader, OutputStreamWriter} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.json.JSONObject class Pathway extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse Pathway data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/pathway").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/Pathway.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val inDf: DataFrame = in.read() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val configuration: Configuration = new Configuration() val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/pathwayCache/pathwayCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) var fdis: FSDataInputStream = null var br: BufferedReader = null var doc: JSONObject = null var hasAnotherSequence:Boolean = true inDf.collect().foreach(row => { pathStr = row.get(0).asInstanceOf[String] fdis = fs.open(new Path(pathStr)) br = new BufferedReader(new InputStreamReader(fdis)) var count = 0 while (hasAnotherSequence) { count += 1 doc = new JSONObject hasAnotherSequence = util.KeggPathway.process(br, doc) doc.write(hdfsWriter) hdfsWriter.write("\n") } br.close() fdis.close() }) hdfsWriter.close() val df: DataFrame = pec.get[SparkSession]().read.json(hdfsPathTemporary) df.schema.printTreeString() println(df.count) out.write(df) } }
Example 30
Source File: PDBData.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism import java.io._ import cn.piflow.bundle.microorganism.util.PDB import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator} import org.json.JSONObject class PDBData extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse PDB data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session = pec.get[SparkSession]() val inDf: DataFrame = in.read() val configuration: Configuration = new Configuration() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/PDBCache/PDBCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) var doc: JSONObject = null var pdb: PDB = null var count:Int=0 inDf.collect().foreach(row => { count += 1 pathStr = row.get(0).asInstanceOf[String] pdb = new PDB(pathStr,fs) doc = pdb.getDoc doc.write(hdfsWriter) hdfsWriter.write("\n") doc = null }) hdfsWriter.close() val df: DataFrame = session.read.json(hdfsPathTemporary) out.write(df) } def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/PDB").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/PDBData.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } }
Example 31
Source File: Ensembl.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism import java.io._ import cn.piflow.bundle.microorganism.util.ParserGff3Data import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator} import org.json.JSONObject class Ensembl extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse ensembl data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/ensembl").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/Ensembl.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session = pec.get[SparkSession]() val inDf: DataFrame = in.read() val configuration: Configuration = new Configuration() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/ensemblCache/ensemblCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) val parser: ParserGff3Data = new ParserGff3Data var fdis: FSDataInputStream =null var br: BufferedReader = null var doc: JSONObject = null var count:Int = 0 inDf.collect().foreach(row => { pathStr = row.get(0).asInstanceOf[String] fdis = fs.open(new Path(pathStr)) br = new BufferedReader(new InputStreamReader(fdis)) var eachStr:String=null while((eachStr = br.readLine()) != null && eachStr != null ){ doc = parser.parserGff3(eachStr) if(doc.toString.length > 2){ count += 1 doc.write(hdfsWriter) hdfsWriter.write("\n") } } br.close() fdis.close() }) hdfsWriter.close() out.write(session.read.json(hdfsPathTemporary)) } }
Example 32
Source File: MergeStrategySpec.scala From daf with BSD 3-Clause "New" or "Revised" License
package daf.filesystem import java.io.{ Closeable, InputStream } import java.util.Scanner import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FSDataInputStream, FSDataOutputStream, FileSystem, Path } import org.scalatest.{ BeforeAndAfterAll, Matchers, WordSpec } import scala.collection.convert.decorateAsScala._ import scala.util.{ Random, Try } class MergeStrategySpec extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val fileSystem = FileSystem.getLocal(new Configuration) private val numFiles = 10 private val baseDir = "test-dir".asHadoop private val workingDir = baseDir / f"merge-strategy-spec-${Random.nextInt(10000)}%05d" private def safely[A <: Closeable, U](f: A => U) = { stream: A => val attempt = Try { f(stream) } stream.close() attempt } private def readFile(path: Path) = safely[FSDataInputStream, Seq[String]] { _.scanner.asScala.toSeq } apply fileSystem.open(path) private def readFiles = Try { fileSystem.listStatus(workingDir).toSeq.flatMap { status => readFile(status.getPath).get } } private def openFiles = Try { fileSystem.listStatus(workingDir).toSeq.map { status => fileSystem.open(status.getPath) } } private def createFile(fileName: String) = safely[FSDataOutputStream, Unit] { stream => Random.alphanumeric.grouped(200).take(10).map { randomSplits(_) }.foreach { row => stream.writeUTF { row.mkString("", ",", "\n") } } } apply fileSystem.create { workingDir / fileName } private def randomSplits(chars: Stream[Char], strings: Seq[String] = Seq.empty): Seq[String] = chars.splitAt { Random.nextInt(10) + 5 } match { case (head, tail) if tail.isEmpty => head.drop(1).mkString +: strings case (head, tail) => randomSplits(tail, head.mkString +: strings) } private def createWorkingDir = Try { fileSystem.mkdirs(workingDir) } private def createFiles = Try { 0 until numFiles foreach { index => createFile(s"test-file-$index").get } // this is relatively nasty, and should be handled in a `traverse` } private def prepareData = for { _ <- createWorkingDir _ <- createFiles } yield () private def purgeData = Try { fileSystem.delete(workingDir, true) } override def beforeAll() = prepareData.get override def afterAll() = purgeData.get "MergeStrategies info" when { "given compressed format files" must { "throw an exception" in { an[IllegalArgumentException] must be thrownBy MergeStrategies.find { FileInfo(workingDir / "test-file-0", 0, FileDataFormats.raw, FileCompressionFormats.gzip) } } } "given data as csv" must { "drop one line and merge the rest" in { safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt => for { merged <- attempt expected <- readFiles } merged.size should be { expected.size - numFiles + 1 } } apply MergeStrategies.csv.merge { openFiles.get } } } "given data as json" must { "just merge the files into one" in { safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt => for { merged <- attempt expected <- readFiles } merged.size should be { expected.size } } apply MergeStrategies.json.merge { openFiles.get } } } } }
Example 33
Source File: HiveUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.hive.common import java.io.File import java.nio.file.Paths import com.webank.wedatasphere.linkis.common.conf.{Configuration => CommonConfiguration} import com.webank.wedatasphere.linkis.engine.hive.exception.HadoopConfSetFailedException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.conf import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.Driver object HiveUtils { def jarOfClass(cls: Class[_]):Option[String] = { val uri = cls.getResource("/" + cls.getName.replace('.', '/') + ".class") if (uri != null) { val uriStr = uri.toString if (uriStr.startsWith("jar:file:")) { Some(uriStr.substring("jar:file:".length, uriStr.indexOf("!"))) } else { None } } else { None } } def getHiveConf:HiveConf = { val confDir:File = new File(CommonConfiguration.hadoopConfDir) if (!confDir.exists() || confDir.isFile){ throw HadoopConfSetFailedException(41001, "hadoop conf set failed, reason: conf dir does not exist") } val hadoopConf:Configuration = new Configuration() hadoopConf.addResource(new Path(Paths.get(CommonConfiguration.hadoopConfDir, "core-site.xml").toAbsolutePath.toFile.getAbsolutePath)) hadoopConf.addResource(new Path(Paths.get(CommonConfiguration.hadoopConfDir, "hdfs-site.xml").toAbsolutePath.toFile.getAbsolutePath)) hadoopConf.addResource(new Path(Paths.get(CommonConfiguration.hadoopConfDir, "yarn-site.xml").toAbsolutePath.toFile.getAbsolutePath)) new conf.HiveConf(hadoopConf, classOf[Driver]) } def msDurationToString(ms: Long): String = { val second = 1000 val minute = 60 * second val hour = 60 * minute ms match { case t if t < second => "%d ms".format(t) case t if t < minute => "%.1f s".format(t.toFloat / second) case t if t < hour => "%.1f m".format(t.toFloat / minute) case t => "%.2f h".format(t.toFloat / hour) } } def main(args: Array[String]): Unit = { jarOfClass(classOf[Driver]).foreach(println) } }
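A minimal sketch showing how the two self-contained helpers above might be called; the only assumption is that the Hive Driver class is on the classpath, as in the listing's own main method.

import com.webank.wedatasphere.linkis.engine.hive.common.HiveUtils
import org.apache.hadoop.hive.ql.Driver

object HiveUtilsExample {
  def main(args: Array[String]): Unit = {
    // Locate the jar that contains the Hive Driver class, if it was loaded from a jar.
    HiveUtils.jarOfClass(classOf[Driver]).foreach(jar => println(s"hive-exec jar: $jar"))
    // Render a few durations with the humanised format of msDurationToString.
    Seq(250L, 12500L, 250000L, 7200000L).foreach(ms => println(HiveUtils.msDurationToString(ms)))
  }
}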
Example 34
Source File: HDFSUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.hadoop.common.utils import java.io.File import java.nio.file.Paths import java.security.PrivilegedExceptionAction import com.webank.wedatasphere.linkis.common.conf.Configuration.hadoopConfDir import com.webank.wedatasphere.linkis.hadoop.common.conf.HadoopConf._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.UserGroupInformation object HDFSUtils { def getConfiguration(user: String): Configuration = getConfiguration(user, hadoopConfDir) def getConfiguration(user: String, hadoopConfDir: String): Configuration = { val confPath = new File(hadoopConfDir) if(!confPath.exists() || confPath.isFile) { throw new RuntimeException(s"Create hadoop configuration failed, path $hadoopConfDir not exists.") } val conf = new Configuration() conf.addResource(new Path(Paths.get(hadoopConfDir, "core-site.xml").toAbsolutePath.toFile.getAbsolutePath)) conf.addResource(new Path(Paths.get(hadoopConfDir, "hdfs-site.xml").toAbsolutePath.toFile.getAbsolutePath)) conf.addResource(new Path(Paths.get(hadoopConfDir, "yarn-site.xml").toAbsolutePath.toFile.getAbsolutePath)) conf } def getHDFSRootUserFileSystem: FileSystem = getHDFSRootUserFileSystem(getConfiguration(HADOOP_ROOT_USER.getValue)) def getHDFSRootUserFileSystem(conf: org.apache.hadoop.conf.Configuration): FileSystem = getHDFSUserFileSystem(HADOOP_ROOT_USER.getValue, conf) def getHDFSUserFileSystem(userName: String): FileSystem = getHDFSUserFileSystem(userName, getConfiguration(userName)) def getHDFSUserFileSystem(userName: String, conf: org.apache.hadoop.conf.Configuration): FileSystem = getUserGroupInformation(userName) .doAs(new PrivilegedExceptionAction[FileSystem]{ def run = FileSystem.get(conf) }) def getUserGroupInformation(userName: String): UserGroupInformation ={ if(KERBEROS_ENABLE.getValue) { val path = new File(KEYTAB_FILE.getValue , userName + ".keytab").getPath val user = getKerberosUser(userName) UserGroupInformation.setConfiguration(getConfiguration(userName)) UserGroupInformation.loginUserFromKeytabAndReturnUGI(user, path) } else { UserGroupInformation.createRemoteUser(userName) } } def getKerberosUser(userName: String): String = { var user = userName if(KEYTAB_HOST_ENABLED.getValue){ user = user+ "/" + KEYTAB_HOST.getValue } user } }
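A hedged usage sketch for HDFSUtils: it assumes a Linkis deployment with a valid hadoopConfDir and a hypothetical proxy user named "hadoop", and it only calls the methods shown above.

import com.webank.wedatasphere.linkis.hadoop.common.utils.HDFSUtils
import org.apache.hadoop.fs.Path

object HDFSUtilsExample {
  def main(args: Array[String]): Unit = {
    // Build a FileSystem that impersonates the (hypothetical) "hadoop" user.
    val fs = HDFSUtils.getHDFSUserFileSystem("hadoop")
    // List the user's home directory and print each entry's path.
    fs.listStatus(new Path("/user/hadoop")).foreach(status => println(status.getPath))
    fs.close()
  }
}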
Example 35
Source File: Credentials.scala From spark-select with Apache License 2.0 | 5 votes |
package io.minio.spark.select import java.net.URI // For BasicAWSCredentials import com.amazonaws.auth.AWSCredentials import com.amazonaws.auth.AWSCredentialsProvider import com.amazonaws.auth.BasicAWSCredentials import com.amazonaws.auth.BasicSessionCredentials import com.amazonaws.auth.DefaultAWSCredentialsProviderChain import org.apache.hadoop.conf.Configuration private[spark] object Credentials { private def staticCredentialsProvider(credentials: AWSCredentials): AWSCredentialsProvider = { new AWSCredentialsProvider { override def getCredentials: AWSCredentials = credentials override def refresh(): Unit = {} } } def load(location: Option[String], hadoopConfiguration: Configuration): AWSCredentialsProvider = { val uri = new URI(location.getOrElse("")) val uriScheme = uri.getScheme uriScheme match { case "s3" | "s3a" => // This matches what S3A does, with one exception: we don't // support anonymous credentials. First, try to parse from URI: Option(uri.getUserInfo).flatMap { userInfo => if (userInfo.contains(":")) { val Array(accessKey, secretKey) = userInfo.split(":") Some(staticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey))) } else { None } }.orElse { val accessKey = hadoopConfiguration.get(s"fs.s3a.access.key", null) val secretKey = hadoopConfiguration.get(s"fs.s3a.secret.key", null) val sessionToken = hadoopConfiguration.get(s"fs.s3a.session.token", null) if (accessKey != null && secretKey != null) { if (sessionToken != null) { Some(staticCredentialsProvider(new BasicSessionCredentials(accessKey, secretKey, sessionToken))) } else { Some(staticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey))) } } else { None } }.getOrElse { // Finally, fall back on the instance profile provider new DefaultAWSCredentialsProviderChain() } case other => throw new IllegalArgumentException(s"Unrecognized scheme $other; expected s3, or s3a") } } }
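A small sketch of how Credentials.load might be driven from configuration; because the object is private[spark], the sketch sits in the same package, and the keys and bucket name are placeholders.

package io.minio.spark.select

import org.apache.hadoop.conf.Configuration

object CredentialsExample {
  def main(args: Array[String]): Unit = {
    val hadoopConf = new Configuration()
    hadoopConf.set("fs.s3a.access.key", "MY_ACCESS_KEY") // placeholder
    hadoopConf.set("fs.s3a.secret.key", "MY_SECRET_KEY") // placeholder
    // Resolve a provider for a hypothetical bucket; falls back to the default chain if no keys are found.
    val provider = Credentials.load(Some("s3a://my-bucket/data"), hadoopConf)
    println(provider.getCredentials.getAWSAccessKeyId)
  }
}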
Example 36
Source File: HiveTezSuite.scala From connectors with Apache License 2.0 | 5 votes |
package io.delta.hive import java.io.{Closeable, File} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapreduce.MRJobConfig import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.tez.dag.api.TezConfiguration import org.apache.tez.runtime.library.api.TezRuntimeConfiguration import org.apache.tez.test.MiniTezCluster class HiveTezSuite extends HiveConnectorTest { override val engine: String = "tez" private var tezConf: Configuration = _ // scalastyle:off // scalastyle:on override def setupConfiguration(conf: Configuration): Unit = { tezConf.asScala.foreach { e => conf.set(e.getKey, e.getValue) } // Overrides values from the hive/tez-site. conf.setInt("hive.tez.container.size", 256) conf.setInt(TezConfiguration.TEZ_AM_RESOURCE_MEMORY_MB, 256) conf.setInt(TezConfiguration.TEZ_TASK_RESOURCE_MEMORY_MB, 256) conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 24) conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_UNORDERED_OUTPUT_BUFFER_SIZE_MB, 10) conf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 0.4f) conf.setBoolean(TezConfiguration.TEZ_IGNORE_LIB_URIS, true) } }
Example 37
Source File: HiveMRSuite.scala From connectors with Apache License 2.0 | 5 votes |
package io.delta.hive import java.io.{Closeable, File} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapred.{JobConf, MiniMRCluster} import org.apache.hadoop.mapreduce.MRJobConfig import org.apache.hadoop.yarn.conf.YarnConfiguration class HiveMRSuite extends HiveConnectorTest { override val engine: String = "mr" override def createCluster(namenode: String, conf: Configuration, tempPath: File): Closeable = { val jConf = new JobConf(conf); jConf.set("yarn.scheduler.capacity.root.queues", "default"); jConf.set("yarn.scheduler.capacity.root.default.capacity", "100"); jConf.setInt(MRJobConfig.MAP_MEMORY_MB, 512); jConf.setInt(MRJobConfig.REDUCE_MEMORY_MB, 512); jConf.setInt(MRJobConfig.MR_AM_VMEM_MB, 128); jConf.setInt(YarnConfiguration.YARN_MINICLUSTER_NM_PMEM_MB, 512); jConf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 128); jConf.setInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB, 512); val mr = new MiniMRCluster(2, namenode, 1, null, null, jConf) new Closeable { override def close(): Unit = { mr.shutdown() } } } }
Example 38
Source File: ModelLoader.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.spark.ml.Transformer

trait ModelLoader[T <: Transformer] {

  def load(source: ModelSource): T

  final def load(path: String): T = {
    val source = if (path.startsWith("hdfs://")) {
      val uri = new URI(path)
      val p = uri.getPath
      ModelSource.hadoop(p, new Configuration())
    } else {
      ModelSource.local(path)
    }
    load(source)
  }
}
Example 39
Source File: ModelSource.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common import java.io.{InputStreamReader, BufferedReader} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path, FileSystem} case class ModelSource( root: String, fs: FileSystem ) { def readFile(path: String): String = { val fsPath = filePath(path) val reader = new BufferedReader(new InputStreamReader(fs.open(fsPath))) val builder = new StringBuilder() var line: String = null while ({ line = reader.readLine(); line != null }) { builder.append(line + "\n") } builder.mkString } def findFile(dir: String, recursive: Boolean, f: String => Boolean): Option[Path] = { val dirPath = filePath(dir) if (fs.exists(dirPath) & fs.isDirectory(dirPath)) { val iter = fs.listFiles(dirPath, recursive) while (iter.hasNext) { val st = iter.next() if (st.isFile && f(st.getPath.getName)) return Some(st.getPath) } None } else { None } } def filePath(path: String): Path = { new Path(s"$root/$path") } } object ModelSource { def local(path: String): ModelSource = { ModelSource(path, FileSystem.getLocal(new Configuration())) } def hadoop(path: String, conf: Configuration): ModelSource = { val fs = FileSystem.get(conf) ModelSource(path, fs) } }
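A brief, hedged sketch of ModelSource in use; the model directory and the metadata file name are hypothetical, and only the local/findFile/readFile methods defined above are called.

import io.hydrosphere.spark_ml_serving.common.ModelSource

object ModelSourceExample {
  def main(args: Array[String]): Unit = {
    // Open a model directory on the local file system (hypothetical path).
    val source = ModelSource.local("/tmp/my-model")
    // Find the first parquet part file under "data", if any, and print its name.
    source.findFile("data", recursive = true, _.endsWith(".parquet")).foreach(p => println(p.getName))
    // Read a small text file from the model directory (hypothetical layout).
    println(source.readFile("metadata/part-00000"))
  }
}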
Example 40
Source File: ModelDataReader.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common import io.hydrosphere.spark_ml_serving.common.reader._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import parquet.format.converter.ParquetMetadataConverter.NO_FILTER import parquet.hadoop.{ParquetFileReader, ParquetReader} import parquet.schema.MessageType import scala.collection.immutable.HashMap import scala.collection.mutable object ModelDataReader { def parse(source: ModelSource, path: String): LocalData = { source.findFile(path, recursive = true, _.endsWith(".parquet")) match { case Some(p) => readData(p) case None => LocalData.empty } } private def readData(p: Path): LocalData = { val conf: Configuration = new Configuration() val metaData = ParquetFileReader.readFooter(conf, p, NO_FILTER) val schema: MessageType = metaData.getFileMetaData.getSchema val reader = ParquetReader.builder[SimpleRecord](new SimpleReadSupport(), p.getParent).build() var result = LocalData.empty try { var value = reader.read() while (value != null) { val valMap = value.struct(HashMap.empty[String, Any], schema) result = mergeMaps(result, valMap) value = reader.read() } result } finally { if (reader != null) { reader.close() } } } private def mergeMaps(acc: LocalData, map: HashMap[String, Any]) = { var result = acc map.foreach { case (k, v) => result = result.appendToColumn(k, List(v)) } result } }
Example 41
Source File: SimpleReadSupport.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common.reader import java.util import org.apache.hadoop.conf.Configuration import parquet.hadoop.api.ReadSupport.ReadContext import parquet.hadoop.api.{InitContext, ReadSupport} import parquet.io.api.RecordMaterializer import parquet.schema.MessageType class SimpleReadSupport extends ReadSupport[SimpleRecord] { override def prepareForRead( configuration: Configuration, map: util.Map[String, String], messageType: MessageType, readContext: ReadContext ): RecordMaterializer[SimpleRecord] = { new SimpleRecordMaterializer(messageType) } override def init(context: InitContext): ReadContext = { new ReadContext(context.getFileSchema) } }
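A sketch that wires SimpleReadSupport into a ParquetReader in the same way ModelDataReader does above; the parquet directory is hypothetical.

import io.hydrosphere.spark_ml_serving.common.reader.{SimpleReadSupport, SimpleRecord}
import org.apache.hadoop.fs.Path
import parquet.hadoop.ParquetReader

object SimpleReadSupportExample {
  def main(args: Array[String]): Unit = {
    // Build a reader over a hypothetical parquet directory using the read support above.
    val reader = ParquetReader.builder[SimpleRecord](new SimpleReadSupport(), new Path("/tmp/model/data")).build()
    try {
      var record = reader.read()
      while (record != null) {
        println(record)
        record = reader.read()
      }
    } finally reader.close()
  }
}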
Example 42
Source File: HBaseGlobalValues.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.server.hbase

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory}

object HBaseGlobalValues {
  var appEventTableName = "app-event"
  var numberOfSalts = 10000
  var connection: Connection = null

  def init(conf: Configuration, numberOfSalts: Int, appEventTableName: String): Unit = {
    connection = ConnectionFactory.createConnection(conf)
    this.numberOfSalts = numberOfSalts
    this.appEventTableName = appEventTableName
  }
}
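A minimal sketch of the intended initialise-once pattern, assuming a reachable HBase cluster; the salt count and table name simply reuse the defaults above.

import com.hadooparchitecturebook.taxi360.server.hbase.HBaseGlobalValues
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}

object HBaseGlobalValuesExample {
  def main(args: Array[String]): Unit = {
    // Initialise the shared connection once at application startup.
    HBaseGlobalValues.init(HBaseConfiguration.create(), numberOfSalts = 10000, appEventTableName = "app-event")
    // Later code can reuse the singleton connection.
    val table = HBaseGlobalValues.connection.getTable(TableName.valueOf(HBaseGlobalValues.appEventTableName))
    println(table.getName)
    table.close()
    HBaseGlobalValues.connection.close()
  }
}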
Example 43
Source File: Util.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util import java.io._ import java.util.concurrent.atomic.AtomicInteger import java.util.zip.{DeflaterOutputStream, InflaterInputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hbase.HBaseConfiguration object Util { val iteration = new AtomicInteger(0) def getTempFilePath(conf: Configuration, prefix: String): String = { val fileSystem = FileSystem.get(conf) val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}") if (fileSystem.exists(path)) { fileSystem.delete(path, true) } path.getName } def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { val bos = new ByteArrayOutputStream val deflaterOutputStream = new DeflaterOutputStream(bos) val dos = new DataOutputStream(deflaterOutputStream) configuration.write(dos) dos.close() bos.toByteArray } def deserializeHBaseConfiguration(arr: Array[Byte]) = { val conf = HBaseConfiguration.create conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) conf } }
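A short round-trip sketch for the serialize/deserialize helpers above; the quorum value is a placeholder.

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.sql.hbase.util.Util

object UtilExample {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "zk-host") // placeholder
    // Round-trip the configuration through the compressed byte form used above.
    val bytes = Util.serializeHBaseConfiguration(conf)
    val restored = Util.deserializeHBaseConfiguration(bytes)
    println(restored.get("hbase.zookeeper.quorum"))
  }
}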
Example 44
Source File: RecommenderSystem.scala From recommendersystem with Apache License 2.0 | 5 votes |
package com.infosupport.recommendedcontent.core import java.io.Serializable import akka.actor.{Props, Actor, ActorLogging} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.mllib.recommendation.MatrixFactorizationModel private def generateRecommendations(userId: Int, count: Int) = { log.info(s"Generating ${count} recommendations for user with ID ${userId}") // Generate recommendations based on the machine learning model. // When there's no trained model return an empty list instead. val results = model match { case Some(m) => m.recommendProducts(userId,count) .map(rating => Recommendation(rating.product,rating.rating)) .toList case None => Nil } sender ! Recommendations(results) } }
Example 45
Source File: AvroSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.io.File import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} case class AvroSource(path: Path) (implicit conf: Configuration, fs: FileSystem) extends Source with Using { override lazy val schema: StructType = { using(AvroReaderFns.createAvroReader(path)) { reader => val record = reader.next() AvroSchemaFns.fromAvroSchema(record.getSchema) } } override def parts(): Seq[Publisher[Seq[Row]]] = Seq(AvroSourcePublisher(path)) } case class AvroSourcePublisher(path: Path) (implicit conf: Configuration, fs: FileSystem) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { val deserializer = new AvroDeserializer() try { using(AvroReaderFns.createAvroReader(path)) { reader => val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) AvroRecordIterator(reader) .takeWhile(_ => running.get) .map(deserializer.toRow) .grouped(DataStream.DefaultBatchSize) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } } object AvroSource { def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSource = AvroSource(new Path(file.getAbsoluteFile.toString)) def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSource = apply(path.toFile) }
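A hedged sketch reading a hypothetical avro file with the source above; it relies only on toDataStream and collect, which the tests later on this page also use.

import java.io.File

import io.eels.component.avro.AvroSource
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

object AvroSourceExample {
  def main(args: Array[String]): Unit = {
    implicit val conf = new Configuration()
    implicit val fs = FileSystem.get(conf)
    // Read a hypothetical avro file into a DataStream and print every row.
    val source = AvroSource(new File("people.avro"))
    println(source.schema)
    source.toDataStream().collect.foreach(println)
  }
}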
Example 46
Source File: AvroSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.io.File import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} case class AvroSink(path: Path, overwrite: Boolean = false, permission: Option[FsPermission] = None, inheritPermissions: Option[Boolean] = None) (implicit conf: Configuration, fs: FileSystem) extends Sink { def withOverwrite(overwrite: Boolean): AvroSink = copy(overwrite = overwrite) def withPermission(permission: FsPermission): AvroSink = copy(permission = Option(permission)) def withInheritPermission(inheritPermissions: Boolean): AvroSink = copy(inheritPermissions = Option(inheritPermissions)) override def open(schema: StructType): SinkWriter = new SinkWriter { private val writer = new AvroWriter(schema, fs.create(path, overwrite)) override def write(row: Row): Unit = writer.write(row) override def close(): Unit = { writer.close() permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } } object AvroSink { def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSink = AvroSink(new Path(file.getAbsoluteFile.toString)) def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSink = apply(path.toFile) }
Example 47
Source File: SequenceSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import java.io.StringWriter import com.univocity.parsers.csv.{CsvWriter, CsvWriterSettings} import io.eels.{Row, Sink, SinkWriter} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} case class SequenceSink(path: Path)(implicit conf: Configuration) extends Sink { override def open(schema: StructType): SinkWriter = new SequenceSinkWriter(schema, path) class SequenceSinkWriter(schema: StructType, path: Path) extends SinkWriter { val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(path), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[BytesWritable]) ) val key = new IntWritable(0) val headers = valuesToCsv(schema.fieldNames()) writer.append(key, new BytesWritable(headers.getBytes)) override def close(): Unit = writer.close() override def write(row: Row): Unit = { this.synchronized { val csv = valuesToCsv(row.values) writer.append(key, new BytesWritable(csv.getBytes())) key.set(key.get() + 1) } } private def valuesToCsv(values: Seq[Any]): String = { val swriter = new StringWriter() val csv = new CsvWriter(swriter, new CsvWriterSettings()) csv.writeRow(values.map { case null => null case other => other.toString }: _*) csv.close() swriter.toString().trim() } } }
Example 48
Source File: SequenceSupport.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import java.io.StringReader import java.nio.charset.Charset import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels.component.csv.{CsvFormat, CsvSupport} import io.eels.schema.{Field, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} object SequenceSupport extends Logging with Using { def createReader(path: Path)(implicit conf: Configuration): SequenceFile.Reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path)) def toValues(v: BytesWritable): Array[String] = toValues(new String(v.copyBytes(), Charset.forName("UTF8"))) def toValues(str: String): Array[String] = { val parser = CsvSupport.createParser(CsvFormat(), false, false, false, null, null) parser.beginParsing(new StringReader(str)) val record = parser.parseNext() parser.stopParsing() record } def schema(path: Path)(implicit conf: Configuration): StructType = { logger.debug(s"Fetching sequence schema for $path") using(createReader(path)) { it => val k = new IntWritable() val v = new BytesWritable() val fields: Array[Field] = { it.next(k, v) toValues(v).map { it => new Field(it) } } StructType(fields.toList) } } }
Example 49
Source File: SequenceSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} case class SequenceSource(path: Path)(implicit conf: Configuration) extends Source with Logging { logger.debug(s"Creating sequence source from $path") override def schema: StructType = SequenceSupport.schema(path) override def parts(): Seq[Publisher[Seq[Row]]] = List(new SequencePublisher(path)) } object SequenceReaderIterator { def apply(schema: StructType, reader: SequenceFile.Reader): Iterator[Row] = new Iterator[Row] { private val k = new IntWritable() private val v = new BytesWritable() // throw away the header reader.next(k, v) override def next(): Row = Row(schema, SequenceSupport.toValues(v).toVector) override def hasNext(): Boolean = reader.next(k, v) } } class SequencePublisher(val path: Path)(implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { using(SequenceSupport.createReader(path)) { reader => val schema = SequenceSupport.schema(path) val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) SequenceReaderIterator(schema, reader) .takeWhile(_ => running.get) .grouped(DataStream.DefaultBatchSize) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } }
Example 50
Source File: RowParquetReaderFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.api.ReadSupport import org.apache.parquet.hadoop.{ParquetFileReader, ParquetReader} import org.apache.parquet.schema.Type def apply(path: Path, predicate: Option[Predicate], readSchema: Option[Type], dictionaryFiltering: Boolean)(implicit conf: Configuration): ParquetReader[Row] = { logger.debug(s"Opening parquet reader for $path") // The parquet reader can use a projection by setting a projected schema onto the supplied conf object def configuration(): Configuration = { val newconf = new Configuration(conf) readSchema.foreach { it => newconf.set(ReadSupport.PARQUET_READ_SCHEMA, it.toString) } //newconf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, dictionaryFiltering.toString) newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString) newconf } // a filter is set when we have a predicate for the read def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build) .map(FilterCompat.get) .getOrElse(FilterCompat.NOOP) ParquetReader.builder(new RowReadSupport, path) .withConf(configuration()) .withFilter(filter()) .build() } }
Example 51
Source File: ParquetPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.parquet.util.ParquetIterator import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.schema.MessageType class ParquetPublisher(path: Path, predicate: Option[Predicate], projection: Seq[String], caseSensitive: Boolean, dictionaryFiltering: Boolean) (implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using { def readSchema: Option[MessageType] = { if (projection.isEmpty) None else { val fileSchema = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER).getFileMetaData.getSchema val structType = ParquetSchemaFns.fromParquetMessageType(fileSchema) if (caseSensitive) { assert( structType.fieldNames.toSet.size == structType.fieldNames.map(_.toLowerCase).toSet.size, "Cannot use case sensitive = true when this would result in a clash of field names" ) } val projectionSchema = StructType(projection.map { field => structType.field(field, caseSensitive).getOrError(s"Requested field $field does not exist in the parquet schema") }) ParquetSchemaFns.toParquetMessageType(projectionSchema).some } } override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { using(RowParquetReaderFn(path, predicate, readSchema, dictionaryFiltering)) { reader => val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) ParquetIterator(reader) .grouped(DataStream.DefaultBatchSize) .takeWhile(_ => running.get) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } }
Example 52
Source File: AvroParquetSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.avro.{AvroSchemaFns, AvroSchemaMerge} import io.eels.component.parquet._ import io.eels.datastream.Publisher import io.eels.schema.StructType import io.eels.{FilePattern, Predicate, _} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{Footer, ParquetFileReader} import scala.collection.JavaConverters._ object AvroParquetSource { def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(new Path(uri.toString))) def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(path)) } case class AvroParquetSource(pattern: FilePattern, predicate: Option[Predicate] = None) (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using { private lazy val paths = pattern.toPaths() def withPredicate(pred: Predicate): AvroParquetSource = copy(predicate = pred.some) // the schema returned by the parquet source should be a merged version of the // schemas contained in all the files. override def schema: StructType = { val schemas = paths.map { path => using(AvroParquetReaderFn.apply(path, predicate, None)) { reader => val record = Option(reader.read()).getOrElse { sys.error(s"Cannot read $path for schema; file contains no records") } record.getSchema } } val avroSchema = AvroSchemaMerge("record", "namspace", schemas) AvroSchemaFns.fromAvroSchema(avroSchema) } // returns the count of all records in this source, predicate is ignored def countNoPredicate(): Long = statistics().count // returns stats, predicate is ignored def statistics(): Statistics = { if (paths.isEmpty) Statistics.Empty else { paths.foldLeft(Statistics.Empty) { (stats, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.foldLeft(stats) { (stats, block) => stats.copy( count = stats.count + block.getRowCount, compressedSize = stats.compressedSize + block.getCompressedSize, uncompressedSize = stats.uncompressedSize + block.getTotalByteSize ) } } } } override def parts(): Seq[Publisher[Seq[Row]]] = { logger.debug(s"AvroParquetSource source has ${paths.size} files: $paths") paths.map { it => new AvroParquetPublisher(it, predicate) } } def footers(): List[Footer] = { logger.debug(s"AvroParquetSource source will read footers from $paths") paths.flatMap { it => val status = fs.getFileStatus(it) logger.debug(s"status=$status; path=$it") ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala } } }
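A small sketch of AvroParquetSource against a hypothetical parquet file; schema and statistics are the methods defined above, and the file must already exist for the schema read to succeed.

import io.eels.component.parquet.avro.AvroParquetSource
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object AvroParquetSourceExample {
  def main(args: Array[String]): Unit = {
    implicit val conf = new Configuration()
    implicit val fs = FileSystem.get(conf)
    // Point the source at a hypothetical parquet file written with the Avro object model.
    val source = AvroParquetSource(new Path("file:///tmp/people.parquet"))
    println(source.schema)       // merged schema across all matched files
    println(source.statistics()) // row counts and byte sizes taken from the footers
  }
}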
Example 53
Source File: AvroParquetReaderFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import io.eels.Predicate import io.eels.component.parquet.{ParquetPredicateBuilder, ParquetReaderConfig} import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.avro.{AvroParquetReader, AvroReadSupport} import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.hadoop.ParquetReader def apply(path: Path, predicate: Option[Predicate], projectionSchema: Option[Schema])(implicit conf: Configuration): ParquetReader[GenericRecord] = { // The parquet reader can use a projection by setting a projected schema onto a conf object def configuration(): Configuration = { val newconf = new Configuration(conf) projectionSchema.foreach { it => AvroReadSupport.setAvroReadSchema(newconf, it) AvroReadSupport.setRequestedProjection(newconf, it) } //conf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, "true") newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString) newconf } // a filter is set when we have a predicate for the read def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build) .map(FilterCompat.get) .getOrElse(FilterCompat.NOOP) AvroParquetReader.builder[GenericRecord](path) .withCompatibility(false) .withConf(configuration()) .withFilter(filter()) .build() .asInstanceOf[ParquetReader[GenericRecord]] } }
Example 54
Source File: AvroParquetPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels.component.avro.AvroDeserializer import io.eels.component.parquet.util.ParquetIterator import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path class AvroParquetPublisher(path: Path, predicate: Option[Predicate])(implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { val deser = new AvroDeserializer() val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) using(AvroParquetReaderFn(path, predicate, None)) { reader => ParquetIterator(reader) .map(deser.toRow) .grouped(DataStream.DefaultBatchSize) .takeWhile(_ => running.get) .foreach(subscriber.next) } subscriber.completed() } catch { case t: Throwable => subscriber.error(t) } } }
Example 55
Source File: RowWriteSupport.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import io.eels.Row import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext import org.apache.parquet.io.api.RecordConsumer import org.apache.parquet.schema.MessageType import scala.collection.JavaConverters._ import scala.math.BigDecimal.RoundingMode.RoundingMode // implementation of WriteSupport for Row's used by the native ParquetWriter class RowWriteSupport(schema: MessageType, roundingMode: RoundingMode, metadata: Map[String, String]) extends WriteSupport[Row] with Logging { logger.trace(s"Created parquet row write support for schema message type $schema") private var writer: RowWriter = _ override def finalizeWrite(): FinalizedWriteContext = new FinalizedWriteContext(metadata.asJava) def init(configuration: Configuration): WriteSupport.WriteContext = { new WriteSupport.WriteContext(schema, new java.util.HashMap()) } def prepareForWrite(record: RecordConsumer) { writer = new RowWriter(record, roundingMode) } def write(row: Row) { writer.write(row) } } class RowWriter(record: RecordConsumer, roundingMode: RoundingMode) { def write(row: Row): Unit = { record.startMessage() val writer = new StructRecordWriter(row.schema, roundingMode, false) writer.write(record, row.values) record.endMessage() } }
Example 56
Source File: RowParquetWriterFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter} import org.apache.parquet.schema.MessageType import scala.math.BigDecimal.RoundingMode.RoundingMode object RowParquetWriterFn { class RowParquetWriterBuilder(path: Path, schema: MessageType, roundingMode: RoundingMode, metadata: Map[String, String]) extends ParquetWriter.Builder[Row, RowParquetWriterBuilder](path) { override def getWriteSupport(conf: Configuration): WriteSupport[Row] = new RowWriteSupport(schema, roundingMode, metadata) override def self(): RowParquetWriterBuilder = this } def apply(path: Path, schema: StructType, metadata: Map[String, String], dictionary: Boolean, roundingMode: RoundingMode, fsConfig: Configuration): ParquetWriter[Row] = { val config = ParquetWriterConfig() val messageType = ParquetSchemaFns.toParquetMessageType(schema) new RowParquetWriterBuilder(path, messageType, roundingMode, metadata) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(dictionary) .withPageSize(config.pageSize) .withRowGroupSize(config.blockSize) .withValidation(config.validating) .withWriteMode(ParquetFileWriter.Mode.CREATE) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withConf(fsConfig) .build() } }
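A hedged sketch that writes two rows through the builder above; the output path is hypothetical and the schema/Row construction mirrors the tests further down this page.

import io.eels.Row
import io.eels.component.parquet.RowParquetWriterFn
import io.eels.schema.{Field, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import scala.math.BigDecimal.RoundingMode

object RowParquetWriterFnExample {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Field("name", StringType), Field("location", StringType))
    val writer = RowParquetWriterFn(
      new Path("file:///tmp/people.parquet"),
      schema,
      metadata = Map("written.by" -> "example"),
      dictionary = true,
      roundingMode = RoundingMode.HALF_UP,
      fsConfig = new Configuration()
    )
    writer.write(Row(schema, Vector("clint eastwood", "carmel")))
    writer.write(Row(schema, Vector("elton john", "pinner")))
    writer.close()
  }
}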
Example 57
Source File: ParquetSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.datastream.Publisher import io.eels.{Predicate, _} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{Footer, ParquetFileReader} import scala.collection.JavaConverters._ object ParquetSource { def apply(string: String)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(string)) def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(new Path(uri.toString))) def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(path)) } case class ParquetSource(pattern: FilePattern, predicate: Option[Predicate] = None, projection: Seq[String] = Nil, dictionaryFiltering: Boolean = true, caseSensitive: Boolean = true) (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using { logger.debug(s"Created parquet source with pattern=$pattern") lazy val paths: List[Path] = pattern.toPaths() def withDictionaryFiltering(dictionary: Boolean): ParquetSource = copy(dictionaryFiltering = dictionary) def withCaseSensitivity(caseSensitive: Boolean): ParquetSource = copy(caseSensitive = caseSensitive) def withPredicate(pred: => Predicate): ParquetSource = copy(predicate = pred.some) def withProjection(first: String, rest: String*): ParquetSource = withProjection(first +: rest) def withProjection(fields: Seq[String]): ParquetSource = { require(fields.nonEmpty) copy(projection = fields.toList) } // returns the metadata in the parquet file, or an empty map if none def metadata(): Map[String, String] = { paths.foldLeft(Map.empty[String, String]) { (metadata, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) metadata ++ footer.getFileMetaData.getKeyValueMetaData.asScala } } // todo should take the merged schema from all files lazy val schema: StructType = RowParquetReaderFn.schema(paths.headOption.getOrError("No paths found for source")) // returns the count of all records in this source, predicate is ignored def countNoPredicate(): Long = statistics().count // returns stats, predicate is ignored def statistics(): Statistics = { if (paths.isEmpty) Statistics.Empty else { paths.foldLeft(Statistics.Empty) { (stats, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.foldLeft(stats) { (stats, block) => stats.copy( count = stats.count + block.getRowCount, compressedSize = stats.compressedSize + block.getCompressedSize, uncompressedSize = stats.uncompressedSize + block.getTotalByteSize ) } } } } override def parts(): Seq[Publisher[Seq[Row]]] = { logger.debug(s"Parquet source has ${paths.size} files: ${paths.mkString(", ")}") paths.map { it => new ParquetPublisher(it, predicate, projection, caseSensitive, dictionaryFiltering) } } def footers(): List[Footer] = { logger.debug(s"Parquet source will read footers from $paths") paths.flatMap { it => val status = fs.getFileStatus(it) logger.debug(s"status=$status; path=$it") ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala } } }
Example 58
Source File: HdfsWatcher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hdfs import java.util.concurrent.Executors import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import io.eels.util.HdfsIterator import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.client.HdfsAdmin import org.apache.hadoop.hdfs.inotify.Event import scala.concurrent.duration._ import scala.util.control.NonFatal class HdfsWatcher(path: Path, callback: FileCallback) (implicit fs: FileSystem, conf: Configuration) extends Logging { private val files = HdfsIterator.remote(fs.listFiles(path, false)).map(_.getPath).toBuffer files.foreach(callback.onStart) private val executor = Executors.newSingleThreadExecutor() private val running = new AtomicBoolean(true) private val interval = 5.seconds private val admin = new HdfsAdmin(path.toUri, conf) private val eventStream = admin.getInotifyEventStream executor.submit(new Runnable { override def run(): Unit = { while (running.get) { try { Thread.sleep(interval.toMillis) val events = eventStream.take for (event <- events.getEvents) { event match { case create: Event.CreateEvent => callback.onCreate(create) case append: Event.AppendEvent => callback.onAppend(append) case rename: Event.RenameEvent => callback.onRename(rename) case close: Event.CloseEvent => callback.onClose(close) case _ => } } } catch { case NonFatal(e) => logger.error("Error while polling fs", e) } } } }) def stop(): Unit = { running.set(false) executor.shutdownNow() } } trait FileCallback { def onStart(path: Path): Unit def onClose(close: Event.CloseEvent): Unit def onRename(rename: Event.RenameEvent): Unit def onAppend(append: Event.AppendEvent): Unit def onCreate(path: Event.CreateEvent): Unit }
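A sketch of a FileCallback implementation plugged into the watcher above; it assumes a real HDFS namenode (inotify is not available on the local file system) and a hypothetical directory to watch.

import io.eels.component.hdfs.{FileCallback, HdfsWatcher}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hdfs.inotify.Event

object HdfsWatcherExample {

  class LoggingCallback extends FileCallback {
    def onStart(path: Path): Unit = println(s"existing file: $path")
    def onCreate(create: Event.CreateEvent): Unit = println(s"created: ${create.getPath}")
    def onAppend(append: Event.AppendEvent): Unit = println(s"appended: ${append.getPath}")
    def onRename(rename: Event.RenameEvent): Unit = println(s"renamed: ${rename.getSrcPath} -> ${rename.getDstPath}")
    def onClose(close: Event.CloseEvent): Unit = println(s"closed: ${close.getPath}")
  }

  def main(args: Array[String]): Unit = {
    implicit val conf = new Configuration()
    implicit val fs = FileSystem.get(conf)
    // Watch a hypothetical ingest directory for a minute, then stop polling.
    val watcher = new HdfsWatcher(new Path("hdfs:///data/incoming"), new LoggingCallback)
    Thread.sleep(60000)
    watcher.stop()
  }
}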
Example 59
Source File: CsvSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.csv import com.univocity.parsers.csv.CsvWriter import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} case class CsvSink(path: Path, overwrite: Boolean = false, headers: Header = Header.FirstRow, format: CsvFormat = CsvFormat(), ignoreLeadingWhitespaces: Boolean = false, ignoreTrailingWhitespaces: Boolean = false) (implicit conf: Configuration, fs: FileSystem) extends Sink { override def open(schema: StructType): SinkWriter = new CsvSinkWriter(schema, path, headers, format, ignoreLeadingWhitespaces, ignoreTrailingWhitespaces) def withOverwrite(overwrite: Boolean): CsvSink = copy(overwrite = overwrite) def withHeaders(headers: Header): CsvSink = copy(headers = headers) def withIgnoreLeadingWhitespaces(ignoreLeadingWhitespaces: Boolean): CsvSink = copy(ignoreLeadingWhitespaces = ignoreLeadingWhitespaces) def withIgnoreTrailingWhitespaces(ignoreTrailingWhitespaces: Boolean): CsvSink = copy(ignoreTrailingWhitespaces = ignoreTrailingWhitespaces) def withFormat(format: CsvFormat): CsvSink = copy(format = format) class CsvSinkWriter(schema: StructType, path: Path, headers: Header, format: CsvFormat, ignoreLeadingWhitespaces: Boolean = false, ignoreTrailingWhitespaces: Boolean = false) extends SinkWriter { private val lock = new AnyRef {} if (overwrite && fs.exists(path)) fs.delete(path, false) import scala.collection.JavaConverters._ private lazy val writer: CsvWriter = { val output = fs.create(path) val writer = CsvSupport.createWriter(output, format, ignoreLeadingWhitespaces, ignoreTrailingWhitespaces) headers match { case Header.FirstComment => writer.commentRow(schema.fieldNames().mkString(format.delimiter.toString())) case Header.FirstRow => writer.writeHeaders(schema.fieldNames().asJava) case _ => } writer } override def close(): Unit = writer.close() override def write(row: Row): Unit = { lock.synchronized { // nulls should be written as empty strings val array = row.values.map { case null => "" case other => other.toString } writer.writeRow(array: _*) } } } } object CsvSink { def apply(path: java.nio.file.Path) (implicit conf: Configuration, fs: FileSystem): CsvSink = CsvSink(new Path(path.toString)) }
Example 60
Source File: ReadParquetEEL.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.sql.Timestamp import io.eels.component.parquet.{ParquetSink, ParquetSource} import io.eels.datastream.DataStream import io.eels.schema.{ArrayType, DecimalType, Field, IntType, Precision, Scale, StringType, StructType, TimestampMillisType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} object ReadParquetEEL extends App { def readParquet(path: Path): Unit = { implicit val hadoopConfiguration = new Configuration() implicit val hadoopFileSystem = FileSystem.get(hadoopConfiguration) val rows = ParquetSource(parquetFilePath).toDataStream().collect rows.foreach(row => println(row)) } val parquetFilePath = new Path("file:///home/sam/development/person2.parquet") implicit val hadoopConfiguration = new Configuration() implicit val hadoopFileSystem = FileSystem.get(hadoopConfiguration) val friendStruct = Field.createStructField("FRIEND", Seq( Field("NAME", StringType), Field("AGE", IntType.Signed) ) ) val personDetailsStruct = Field.createStructField("PERSON_DETAILS", Seq( Field("NAME", StringType), Field("AGE", IntType.Signed), Field("SALARY", DecimalType(Precision(38), Scale(5))), Field("CREATION_TIME", TimestampMillisType) ) ) val friendType = StructType(friendStruct) val schema = StructType(personDetailsStruct, Field("FRIENDS", ArrayType(friendType), nullable = false)) val friends = Vector( Vector(Vector("John", 25)), Vector(Vector("Adam", 26)), Vector(Vector("Steven", 27)) ) val rows = Vector( Vector(Vector("Fred", 50, BigDecimal("50000.99000"), new Timestamp(System.currentTimeMillis())), friends) ) try { DataStream.fromValues(schema, rows).to(ParquetSink(parquetFilePath).withOverwrite(true)) } catch { case e: Exception => e.printStackTrace() } try { readParquet(parquetFilePath) } catch { case e: Exception => e.printStackTrace() } }
Example 61
Source File: FilePatternTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.nio.file.Files import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class FilePatternTest extends WordSpec with Matchers { implicit val fs = FileSystem.get(new Configuration()) "FilePattern" should { "detect single hdfs path without name server" ignore { FilePattern("hdfs:///mypath").toPaths() shouldBe List(new Path("hdfs:///mypath")) } "detect single hdfs path with name server" ignore { FilePattern("hdfs://nameserver/mypath").toPaths() shouldBe List(new Path("hdfs://nameserver/mypath")) } "detect absolute local file" in { FilePattern("file:///absolute/file").toPaths() shouldBe List(new Path("file:///absolute/file")) } "detect relative local file" in { FilePattern("file:///local/file").toPaths() shouldBe List(new Path("file:///local/file")) } "detect relative local file expansion" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } val hdfsPaths = files.map { it => new Path(it.toUri) } files.foreach(file => Files.createFile(file)) FilePattern(dir.toUri.toString() + "/*").toPaths().toSet shouldBe hdfsPaths.toSet files.foreach(Files.deleteIfExists) Files.deleteIfExists(dir) } //not working on windows "detect relative local file expansion with schema" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } val hdfsPaths = files.map { it => new Path(it.toUri) } files.foreach(file => Files.createFile(file)) FilePattern(dir.toUri.toString() + "/*").toPaths().toSet shouldBe hdfsPaths.toSet files.foreach(Files.deleteIfExists) Files.deleteIfExists(dir) } "use filter if supplied" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } files.foreach { it => Files.createFile(it) } val a = FilePattern(dir.toAbsolutePath().toString() + "/*") .withFilter(_.toString().endsWith("a")) .toPaths.toSet a shouldBe Set(new Path("file:///" + dir.resolve("a"))) files.foreach { it => Files.deleteIfExists(it) } Files.deleteIfExists(dir) } } }
Example 62
Source File: ListenerTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.util.concurrent.{CountDownLatch, TimeUnit} import io.eels.component.csv.{CsvSink, CsvSource} import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} import scala.util.Random class ListenerTest extends WordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.get(conf) val schema = StructType("a", "b", "c", "d", "e") val rows = List.fill(1000)(Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(10))) val ds = DataStream.fromRows(schema, rows) val path = new Path("listener_test.csv") "DataStream" should { "support user's listeners" in { val latch = new CountDownLatch(1000) fs.delete(path, false) ds.listener(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).to(CsvSink(path)) latch.await(20, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } "propagate errors in listeners" in { class TestSink extends Sink { override def open(schema: StructType): SinkWriter = new SinkWriter { override def close(): Unit = () override def write(row: Row): Unit = () } } try { ds.listener(new Listener { override def onNext(value: Row): Unit = sys.error("boom") override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).to(new TestSink) assert(false) } catch { case _: Throwable => } } } "Source.toDataStream" should { "call on next for each row" in { val latch = new CountDownLatch(1000) fs.delete(path, false) ds.to(CsvSink(path)) CsvSource(path).toDataStream(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).collect latch.await(5, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } "call on complete once finished" in { val latch = new CountDownLatch(1001) fs.delete(path, false) ds.to(CsvSink(path)) CsvSource(path).toDataStream(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = latch.countDown() }).collect latch.await(5, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } } }
Example 63
Source File: AvroSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.nio.file.Paths import com.typesafe.config.ConfigFactory import io.eels.schema.{Field, StructType} import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.scalatest.{Matchers, WordSpec} class AvroSourceTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) "AvroSource" should { "read schema" in { val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath) people.schema shouldBe StructType(Field("name", nullable = false), Field("job", nullable = false), Field("location", nullable = false)) } "read strings as java.lang.String when eel.avro.java.string is true" in { System.setProperty("eel.avro.java.string", "true") ConfigFactory.invalidateCaches() val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath).toDataStream().toSet people.map(_.values) shouldBe Set( List("clint eastwood", "actor", "carmel"), List("elton john", "musician", "pinner"), List("issac newton", "scientist", "heaven") ) System.setProperty("eel.avro.java.string", "false") ConfigFactory.invalidateCaches() } "read strings as utf8 when eel.avro.java.string is false" in { System.setProperty("eel.avro.java.string", "false") ConfigFactory.invalidateCaches() val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath).toDataStream().toSet people.map(_.values) shouldBe Set( List(new Utf8("clint eastwood"), new Utf8("actor"), new Utf8("carmel")), List(new Utf8("elton john"), new Utf8("musician"), new Utf8("pinner")), List(new Utf8("issac newton"), new Utf8("scientist"), new Utf8("heaven")) ) System.setProperty("eel.avro.java.string", "true") ConfigFactory.invalidateCaches() } } }
Example 64
Source File: AvroSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import io.eels.Row import io.eels.datastream.DataStream import io.eels.schema.{ArrayType, Field, MapType, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class AvroSinkTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val ds = DataStream.fromValues( StructType("name", "job", "location"), Seq( List("clint eastwood", "actor", "carmel"), List("elton john", "musician", "pinner"), List("issac newton", "scientist", "heaven") ) ) "AvroSink" should { "write to avro" in { val path = new Path("avro.test") fs.delete(path, false) ds.to(AvroSink(path)) fs.delete(path, false) } "support overwrite option" in { val path = new Path("overwrite_test", ".avro") fs.delete(path, false) ds.to(AvroSink(path)) ds.to(AvroSink(path).withOverwrite(true)) fs.delete(path, false) } "write lists and maps" in { val ds = DataStream.fromValues( StructType( Field("name"), Field("movies", ArrayType(StringType)), Field("characters", MapType(StringType, StringType)) ), Seq( List( "clint eastwood", List("fistful of dollars", "high plains drifters"), Map("preacher" -> "high plains", "no name" -> "good bad ugly") ) ) ) val path = new Path("array_map_avro", ".avro") fs.delete(path, false) ds.to(AvroSink(path)) AvroSource(path).toDataStream().collect shouldBe Seq( Row( ds.schema, Seq( "clint eastwood", List("fistful of dollars", "high plains drifters"), Map("preacher" -> "high plains", "no name" -> "good bad ugly") ) ) ) fs.delete(path, true) } } }
Example 65
Source File: JsonSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.json import io.eels.datastream.DataStream import io.eels.schema.{Field, StructType} import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class JsonSinkTest extends WordSpec with Matchers { val path = new Path("test.json") implicit val fs: FileSystem = FileSystem.get(new Configuration()) "JsonSink" should { "write multiple json docs to a file" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("location")) val ds = DataStream.fromValues( schema, Seq( Vector("sam", "aylesbury"), Vector("jam", "aylesbury"), Vector("ham", "buckingham") ) ) ds.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input should include("""{"name":"sam","location":"aylesbury"}""") input should include("""{"name":"jam","location":"aylesbury"}""") input should include("""{"name":"ham","location":"buckingham"}""") fs.delete(path, false) } "support arrays" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("skills")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Array("karate", "kung fu"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","skills":["karate","kung fu"]}""" fs.delete(path, false) } "support maps" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("locations")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Map("home" -> "boro", "work" -> "london"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","locations":{"home":"boro","work":"london"}}""" fs.delete(path, false) } "support structs" in { case class Foo(home: String, work: String) if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("locations")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Foo("boro", "london"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","locations":{"home":"boro","work":"london"}}""" fs.delete(path, false) } } }
Example 66
Source File: SequenceSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import io.eels.Row import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, WordSpec} class SequenceSourceTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private val schema = StructType(Field("name"), Field("location")) private val ds = DataStream.fromValues( schema, Seq( Vector("name", "location"), Vector("sam", "aylesbury"), Vector("jam", "aylesbury"), Vector("ham", "buckingham") ) ) "SequenceSource" should { "read sequence files" in { val schema = StructType( Field("a", StringType), Field("b", StringType), Field("c", StringType), Field("d", StringType) ) val path = new Path(getClass.getResource("/test.seq").getFile) val rows = SequenceSource(path).toDataStream().toSet rows shouldBe Set( Row(schema, "1", "2", "3", "4"), Row(schema, "5", "6", "7", "8") ) } "read header as schema" in { val path = new Path(getClass.getResource("/test.seq").getFile) SequenceSource(path).schema shouldBe StructType( Field("a", StringType), Field("b", StringType), Field("c", StringType), Field("d", StringType) ) } } }
Example 67
Source File: SequenceSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} import org.scalatest.{Matchers, WordSpec} class SequenceSinkTest extends WordSpec with Matchers { private val ds = DataStream.fromValues( StructType("a", "b", "c", "d"), Seq( List("1", "2", "3", "4"), List("5", "6", "7", "8") ) ) "SequenceSink" should { "write sequence files" in { implicit val conf = new Configuration implicit val fs = FileSystem.get(conf) val path = new Path("seqsink.seq") if (fs.exists(path)) fs.delete(path, true) ds.to(SequenceSink(path)) val reader = new SequenceFile.Reader(new Configuration, SequenceFile.Reader.file(path)) val k = new IntWritable val v = new BytesWritable val set = for (_ <- 1 to 3) yield { reader.next(k, v) new String(v.copyBytes) } set.toSet shouldBe Set( "a,b,c,d", "1,2,3,4", "5,6,7,8" ) reader.close() fs.delete(path, true) } } }
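Examples 66 and 67 show the read and write halves of the sequence file component separately. The sketch below combines them into one round trip, assuming the SequenceSink/SequenceSource API exactly as used in those tests; the object name and the roundtrip.seq path are hypothetical.

package io.eels.component.sequence

import io.eels.datastream.DataStream
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object SequenceRoundTripSketch extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)

  val path = new Path("roundtrip.seq") // hypothetical path
  if (fs.exists(path)) fs.delete(path, true)

  val schema = StructType("a", "b")
  val ds = DataStream.fromValues(schema, Seq(List("1", "2"), List("3", "4")))

  // the sink writes the header row first, then one record per row (see example 67)
  ds.to(SequenceSink(path))

  // the source reads the header back as the schema and returns the data rows (see example 66)
  SequenceSource(path).toDataStream().collect.foreach(println)

  fs.delete(path, true)
}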
Example 68
Source File: ParquetProjectionTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.{File, FilenameFilter} import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{FlatSpec, Matchers} class ParquetProjectionTest extends FlatSpec with Matchers { cleanUpResidualParquetTestFiles private val schema = StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) private val ds = DataStream.fromValues( schema, Seq( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) ) private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val file = new File(s"test_${System.currentTimeMillis()}.pq") file.deleteOnExit() private val path = new Path(file.toURI) if (fs.exists(path)) fs.delete(path, false) ds.to(ParquetSink(path).withOverwrite(true)) "ParquetSource" should "support projections" in { val rows = ParquetSource(path).withProjection("name").toDataStream().collect rows.map(_.values) shouldBe Vector(Vector("clint eastwood"), Vector("elton john")) } it should "return all data when no projection is set" in { val rows = ParquetSource(path).toDataStream().collect rows.map(_.values) shouldBe Vector(Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner")) } private def cleanUpResidualParquetTestFiles = { new File(".").listFiles(new FilenameFilter { override def accept(dir: File, name: String): Boolean = { (name.startsWith("test_") && name.endsWith(".pq")) || (name.startsWith(".test_") && name.endsWith(".pq.crc")) } }).foreach(_.delete()) } }
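The test above demonstrates column projections. The following standalone sketch distils the same idea, writing a small dataset and reading back a single column; it reuses only calls shown in the example (ParquetSink.withOverwrite, ParquetSource.withProjection), while the object name and file name are hypothetical.

package io.eels.component.parquet

import io.eels.datastream.DataStream
import io.eels.schema.{Field, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object ParquetProjectionSketch extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(new Configuration())

  val schema = StructType(Field("name", StringType), Field("job", StringType))
  val ds = DataStream.fromValues(schema, Seq(Vector("clint eastwood", "actor")))

  val path = new Path("projection_sketch.pq") // hypothetical file name
  ds.to(ParquetSink(path).withOverwrite(true))

  // only the requested column is returned by the reader
  val names = ParquetSource(path).withProjection("name").toDataStream().collect.map(_.values)
  println(names) // Vector(Vector(clint eastwood))

  fs.delete(path, false)
}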
Example 69
Source File: AvroAndParquetCrossCompatibilityTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{FlatSpec, Matchers} // tests that avro source/sink and avro parquet source/sink can write/read each others files class AvroAndParquetCrossCompatibilityTest extends FlatSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) "AvroParquetSource and ParquetSource" should "be compatible" in { val path = new Path("cross.pq") if (fs.exists(path)) fs.delete(path, false) val structType = StructType( Field("name", StringType, nullable = false), Field("location", StringType, nullable = false) ) val ds = DataStream.fromValues( structType, Seq( Vector("clint eastwood", "carmel"), Vector("elton john", "pinner") ) ) ds.to(ParquetSink(path)) AvroParquetSource(path).toDataStream().collect shouldBe ds.collect fs.delete(path, false) ds.to(AvroParquetSink(path)) ParquetSource(path).toDataStream().collect shouldBe ds.collect fs.delete(path, false) } }
Example 70
Source File: ParquetSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object ParquetSpeedTest extends App with Timed { ParquetLogMute() val size = 2000000 val schema = StructType("a", "b", "c", "d", "e") val createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) val ds = DataStream.fromIterator(schema, Iterator.continually(createRow).take(size)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val path = new Path("parquet_speed.pq") fs.delete(path, false) new File(path.toString).deleteOnExit() timed("Insertion") { ds.to(AvroParquetSink(path).withOverwrite(true)) } while (true) { timed("Reading with ParquetSource") { val actual = ParquetSource(path).toDataStream().size assert(actual == size) } println("") println("---------") println("") Thread.sleep(2000) timed("Reading with AvroParquetSource") { val actual = AvroParquetSource(path).toDataStream().size assert(actual == size) } } }
Example 71
Source File: ParquetMultipleFileSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.StructType import io.eels.{FilePattern, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object ParquetMultipleFileSpeedTest extends App with Timed { ParquetLogMute() val size = 5000000 val count = 20 val schema = StructType("a", "b", "c", "d", "e") def createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val dir = new Path("parquet-speed-test") new File(dir.toString).mkdirs() new File(dir.toString).listFiles().foreach(_.delete) timed("Insertion") { val ds = DataStream.fromRowIterator(schema, Iterator.continually(createRow).take(size)) ds.to(ParquetSink(new Path("parquet-speed-test/parquet_speed.pq")), count) } for (_ <- 1 to 25) { assert(count == FilePattern("parquet-speed-test/*").toPaths().size) timed("Reading with ParquetSource") { val actual = ParquetSource("parquet-speed-test/*").toDataStream().map { row => row }.filter(_ => true).size assert(actual == size, s"Expected $size but was $actual") } println("") println("---------") println("") } }
Example 72
Source File: AvroParquetSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class AvroParquetSinkTest extends WordSpec with Matchers { ParquetLogMute() private val schema = StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) private val ds = DataStream.fromValues( schema, Seq( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) ) private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path("test.pq") "ParquetSink" should { "write schema" in { if (fs.exists(path)) fs.delete(path, false) ds.to(AvroParquetSink(path)) val people = ParquetSource(path) people.schema shouldBe StructType( Field("name", StringType, false), Field("job", StringType, false), Field("location", StringType, false) ) fs.delete(path, false) } "write data" in { if (fs.exists(path)) fs.delete(path, false) ds.to(AvroParquetSink(path)) AvroParquetSource(path).toDataStream().toSet.map(_.values) shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) fs.delete(path, false) } "support overwrite" in { val path = new Path("overwrite_test.pq") fs.delete(path, false) val schema = StructType(Field("a", StringType)) val ds = DataStream.fromRows(schema, Row(schema, Vector("x")), Row(schema, Vector("y")) ) ds.to(AvroParquetSink(path)) ds.to(AvroParquetSink(path).withOverwrite(true)) fs.delete(path, false) } } }
Example 73
Source File: AvroParquetReaderFnTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.util.UUID import io.eels.component.avro.AvroSchemaFns import io.eels.component.parquet.avro.AvroParquetReaderFn import io.eels.schema.{DoubleType, Field, LongType, StructType} import org.apache.avro.SchemaBuilder import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.avro.AvroParquetWriter import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpec} class AvroParquetReaderFnTest extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path(UUID.randomUUID().toString()) override def afterAll(): Unit = { val fs = FileSystem.get(new Configuration()) fs.delete(path, false) } private val avroSchema = SchemaBuilder.record("com.chuckle").fields() .requiredString("str").requiredLong("looong").requiredDouble("dooble").endRecord() private val writer = AvroParquetWriter.builder[GenericRecord](path) .withSchema(avroSchema) .build() private val record = new GenericData.Record(avroSchema) record.put("str", "wibble") record.put("looong", 999L) record.put("dooble", 12.34) writer.write(record) writer.close() val schema = StructType(Field("str"), Field("looong", LongType(true), true), Field("dooble", DoubleType, true)) "AvroParquetReaderFn" should { "support projections on doubles" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("looong")))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("dooble") shouldBe 12.34 } "support projections on longs" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("str")))) val record = reader.read() reader.close() record.get("looong") shouldBe 999L } "support full projections" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("looong") shouldBe 999L record.get("dooble") shouldBe 12.34 } "support non projections" in { val reader = AvroParquetReaderFn(path, None, None) val group = reader.read() reader.close() group.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" group.get("looong") shouldBe 999L group.get("dooble") shouldBe 12.34 } } }
Example 74
Source File: DecimalWriterTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.schema.{DecimalType, Field, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.FunSuite import scala.math.BigDecimal.RoundingMode class DecimalWriterTest extends FunSuite { test("negativeDecimalTest") { implicit val configuration = new Configuration val expectedBigDecimals = Seq(BigDecimal(-5025176.39), BigDecimal(-5), BigDecimal(-999.56434), BigDecimal(-10000.9890)) assertBigDecimals("bigd_negative.parquet", expectedBigDecimals) } test("positiveDecimalTest") { implicit val configuration = new Configuration val expectedBigDecimals = Seq(BigDecimal(5025176.39), BigDecimal(5), BigDecimal(999.56434), BigDecimal(-10000.9890)) assertBigDecimals("bigd_positive.parquet", expectedBigDecimals) } private def assertBigDecimals(filename: String, expectedBigDecimals: Seq[BigDecimal])(implicit configuration: Configuration): Unit = { val schema = StructType(Field(name = "bd", dataType = DecimalType(38, 10))) val path = new Path(filename) val fileSystem = path.getFileSystem(configuration) if (fileSystem.exists(path)) fileSystem.delete(path, false) // Write out the decimal values val parquetWriter = RowParquetWriterFn(path = path, schema = schema, metadata = Map.empty, dictionary = false, roundingMode = RoundingMode.UP, fileSystem.getConf) expectedBigDecimals.foreach { expectedBigDecimal => println(s"Writing row with value $expectedBigDecimal") parquetWriter.write(Row.fromMap(schema, Map("bd" -> expectedBigDecimal))) } parquetWriter.close() // Read back all the writes and assert their values val parquetProjectionSchema = ParquetSchemaFns.toParquetMessageType(schema) val parquetReader = RowParquetReaderFn(path, None, Option(parquetProjectionSchema), dictionaryFiltering = true) for (i <- 0 until expectedBigDecimals.length) { val readRow = parquetReader.read println(s"read row: $readRow") assert(readRow.values.head == expectedBigDecimals(i)) } parquetReader.close() } }
Example 75
Source File: CsvSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.csv

import java.nio.file.Paths

import io.eels.schema.{Field, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.scalatest.{Matchers, WordSpec}

class CsvSourceTest extends WordSpec with Matchers {

  implicit val conf = new Configuration()
  implicit val fs = FileSystem.getLocal(conf)

  "CsvSource" should {
    "read schema" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).schema shouldBe StructType(
        Field("a", StringType, true),
        Field("b", StringType, true),
        Field("c", StringType, true)
      )
    }
    "support null cell value option as null" in {
      val file = getClass.getResource("/io/eels/component/csv/csvwithempty.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withNullValue(null).toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", null, "3"))
    }
    "support null cell value replacement value" in {
      val file = getClass.getResource("/io/eels/component/csv/csvwithempty.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withNullValue("foo").toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", "foo", "3"))
    }
    "read from path" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.FirstRow).toDataStream().size shouldBe 3
      CsvSource(path).withHeader(Header.None).toDataStream().size shouldBe 4
    }
    "allow specifying manual schema" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      val schema = StructType(
        Field("test1", StringType, true),
        Field("test2", StringType, true),
        Field("test3", StringType, true)
      )
      CsvSource(path).withSchema(schema).toDataStream().schema shouldBe schema
    }
    "support reading header" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.FirstRow).toDataStream().collect.map(_.values).toSet shouldBe Set(Vector("e", "f", "g"), Vector("1", "2", "3"), Vector("4", "5", "6"))
    }
    "support skipping header" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.None).toDataStream().toSet.map(_.values) shouldBe Set(Vector("a", "b", "c"), Vector("e", "f", "g"), Vector("1", "2", "3"), Vector("4", "5", "6"))
    }
    "support delimiters" in {
      val file = getClass.getResource("/io/eels/component/csv/psv.psv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withDelimiter('|').toDataStream().collect.map(_.values).toSet shouldBe Set(Vector("e", "f", "g"))
      CsvSource(path).withDelimiter('|').withHeader(Header.None).toDataStream().toSet.map(_.values) shouldBe Set(Vector("a", "b", "c"), Vector("e", "f", "g"))
    }
    "support comments for headers" in {
      val file = getClass.getResource("/io/eels/component/csv/comments.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.FirstComment).schema shouldBe StructType(
        Field("a", StringType, true),
        Field("b", StringType, true),
        Field("c", StringType, true)
      )
      CsvSource(path).withHeader(Header.FirstComment).toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", "2", "3"), Vector("e", "f", "g"), Vector("4", "5", "6"))
    }
    "terminate if asking for first comment but no comments" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.FirstComment).schema shouldBe StructType(
        Field("", StringType, true)
      )
    }
    "support skipping corrupt rows" ignore {
      val file = getClass.getResource("/io/eels/component/csv/corrupt.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.FirstRow).toDataStream().toVector.map(_.values) shouldBe Vector(Vector("1", "2", "3"))
    }
  }
}
Example 76
Source File: Main.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object Main extends App { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf // the first parameter determines the command to run, just like in git, eg git pull, or in hadoop, eg hadoop fs val command = args.head val params = args.tail command match { case "schema" => ShowSchemaMain(params) case "stream" => StreamMain(params) case "apply-spec" => ApplySpecMain(params) case "fetch-spec" => FetchSpecMain(params) case "analyze" => AnalyzeMain(params) case other => System.err.println(s"Unknown command $other") } } case class Options(from: String = "", to: String = "", workerThreads: Int = 1, sourceIOThreads: Int = 1)
Example 77
Source File: FetchSpecMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import io.eels.component.hive.{HiveSource, HiveSpec} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object FetchSpecMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel fetch-spec", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() source match { case hive: HiveSource => val spec = hive.spec val json = HiveSpec.writeAsJson(spec.copy(tables = spec.tables.filter(_.tableName == hive.tableName))) println(json) case _ => sys.error(s"Unsupported source $source") } case _ => } } case class Options(source: String = null) }
Example 78
Source File: ApplySpecMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import java.nio.file.{Path, Paths} import io.eels.{Constants, SourceParser} import io.eels.component.hive.{HiveOps, HiveSource, HiveSpec} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient object ApplySpecMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf implicit val client = new HiveMetaStoreClient(hiveConf) def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel apply-spec", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" opt[String]("spec") required() action { (schema, o) => o.copy(specPath = Paths.get(schema)) } text "specify path to eel spec" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() source match { case hive: HiveSource => HiveOps.applySpec(HiveSpec(options.specPath), false) case _ => sys.error(s"Unsupported source $source") } case _ => } } case class Options(source: String = null, specPath: Path = null) }
Example 79
Source File: ShowSchemaMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import io.eels.component.avro.AvroSchemaFn import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object ShowSchemaMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel schema", Constants.EelVersion) opt[String]("source") required() action { (source, o) => o.copy(source = source) } text "specify source, eg hive:database:table or parquet:/path/to/file" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() val schema = source.schema val avroSchema = AvroSchemaFn.toAvro(schema) out.println(avroSchema) case _ => } } case class Options(source: String = "") }
Example 80
Source File: AnalyzeMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object AnalyzeMain { import scala.concurrent.ExecutionContext.Implicits.global implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel analyze", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" opt[Boolean]("reverse") optional() action { (reverse, o) => o.copy(reverse = reverse) } text "specify reverse ordering of columns, eg most distinct first" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val result = builder().counts.toSeq.sortBy(_._2.size) val orderedResults = if (options.reverse) result.reverse else result for ((columnName, columnCounts) <- orderedResults) { println(columnName) for ((value, counts) <- columnCounts) { println(s"\t$value ($counts)") } } case _ => } } case class Options(source: String = null, reverse: Boolean = false) }
Example 81
Source File: StreamMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import io.eels.{Constants, Sink, SinkParser, SourceParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object StreamMain { import scala.concurrent.ExecutionContext.Implicits.global implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String]): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel", Constants.EelVersion) opt[String]("source") required() action { (source, o) => o.copy(from = source) } text "specify source, eg hive:database:table" opt[String]("sink") required() action { (sink, o) => o.copy(to = sink) } text "specify sink, eg hive:database:table" opt[Int]("sourceThreads") optional() action { (threads, options) => options.copy(sourceIOThreads = threads) } text "number of source io threads, defaults to 1" opt[Int]("workerThreads") optional() action { (threads, options) => options.copy(workerThreads = threads) } text "number of worker threads, defaults to 1" } parser.parse(args, Options()) match { case Some(options) => val sourceBuilder = SourceParser(options.from).orNull val source = sourceBuilder() val sinkBuilder = SinkParser(options.to).orNull val sink = sinkBuilder() val result = source.toFrame(options.sourceIOThreads).to(sink) println(s"Completed with $result rows") case _ => } } }
Example 82
Source File: HbaseTests.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hbase import java.nio.file.Paths import java.util.UUID import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase._ import org.apache.hadoop.hdfs.MiniDFSCluster trait HbaseTests { val MINI_CLUSTER_ROOT = "miniclusters" def startHBaseCluster(clusterName: String): MiniHBaseCluster = { // Setup the underlying HDFS mini cluster for HBASE mini cluster System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA) val clusterFolder = s"${clusterName}_${UUID.randomUUID().toString}" val clusterPath = Paths.get(MINI_CLUSTER_ROOT, clusterFolder) val conf = new Configuration() conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, clusterPath.toAbsolutePath.toString) val miniDFSCluster = new MiniDFSCluster.Builder(conf).build() // Now setup and start the HBASE mini cluster val hBaseTestingUtility = new HBaseTestingUtility hBaseTestingUtility.setDFSCluster(miniDFSCluster) hBaseTestingUtility.startMiniCluster(1, 1) val cluster = hBaseTestingUtility.getHBaseCluster cluster.waitForActiveAndReadyMaster() cluster } }
Example 83
Source File: HiveFilePublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.io.Using import io.eels.datastream.{Subscription, Publisher, Subscriber} import io.eels.schema.{Partition, StructType} import io.eels.{Predicate, _} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus} class HiveFilePublisher(dialect: HiveDialect, file: LocatedFileStatus, metastoreSchema: StructType, projectionSchema: StructType, predicate: Option[Predicate], partition: Partition) (implicit fs: FileSystem, conf: Configuration) extends Publisher[Seq[Row]] with Using { require(projectionSchema.fieldNames.forall { it => it == it.toLowerCase() }, s"Use only lower case field names with hive") override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { val partitionMap: Map[String, Any] = partition.entries.map { it => (it.key, it.value) }.toMap // the schema we send to the dialect must have any partition fields removed, because those // fields won't exist in the data files. This is because partitions are not always written // and instead inferred from the partition itself. val projectionFields = projectionSchema.fields.filterNot(field => partition.containsKey(field.name)) val projectionWithoutPartitions = StructType(projectionFields) // since we removed the partition fields from the target schema, we must repopulate them after the read // we also need to throw away the dummy field if we had an empty schema val publisher = dialect.input(file.getPath, metastoreSchema, projectionWithoutPartitions, predicate) publisher.subscribe(new Subscriber[Seq[Row]] { override def subscribed(s: Subscription): Unit = subscriber.subscribed(s) override def next(chunk: Seq[Row]): Unit = { val aligned = chunk.map { row => if (projectionFields.isEmpty) { val values = projectionSchema.fieldNames().map(partitionMap.apply) Row(projectionSchema, values.toVector) } else { RowUtils.rowAlign(row, projectionSchema, partitionMap) } } subscriber.next(aligned) } override def completed(): Unit = subscriber.completed() override def error(t: Throwable): Unit = subscriber.error(t) }) } }
Example 84
Source File: HiveStats.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import io.eels.schema.PartitionConstraint import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.ParquetFileReader import scala.collection.JavaConverters._ trait HiveStats { // total number of records def count: Long = count(Nil) // total number of records in the partitions that match the constraints def count(constraints: Seq[PartitionConstraint]): Long // returns the minimum value of this field def min(field: String): Any = min(field, Nil) // returns the maximum value of this field def max(field: String): Any = max(field, Nil) // returns the minimum value of this field for the partitions that match the constraints def min(field: String, constraints: Seq[PartitionConstraint]): Any // returns the maximum value of this field for the partitions that match the constraints def max(field: String, constraints: Seq[PartitionConstraint]): Any } class ParquetHiveStats(dbName: String, tableName: String, table: HiveTable) (implicit fs: FileSystem, conf: Configuration, client: IMetaStoreClient) extends HiveStats with Logging { private val ops = new HiveOps(client) private def count(path: Path) = { val blocks = ParquetFileReader.readFooter(fs.getConf, path, ParquetMetadataConverter.NO_FILTER).getBlocks.asScala blocks.map(_.getRowCount).sum } override def count(constraints: Seq[PartitionConstraint]): Long = { val counts = HiveTableFilesFn(dbName, tableName, table.location, constraints) .flatMap(_._2) .map(_.getPath).map(count) if (counts.isEmpty) 0 else counts.sum } private def minmax(field: String, constraints: Seq[PartitionConstraint]): (Any, Any) = { def stats[T]: (Any, Any) = { def min(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) <= 0) a else b } def max(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) >= 0) a else b } val location = new Path(ops.location(dbName, tableName)) val (mins, maxes) = HiveTableFilesFn(dbName, tableName, location, constraints).toSeq.flatMap { case (_, files) => logger.debug(s"Calculating min,max in file $files") files.flatMap { file => val footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.map { block => val column = block.getColumns.asScala.find(_.getPath.toDotString == field).getOrError(s"Unknown column $field") val min = column.getStatistics.genericGetMin.asInstanceOf[Comparable[T]] val max = column.getStatistics.genericGetMax.asInstanceOf[Comparable[T]] (min, max) } } }.unzip (min(mins), max(maxes)) } stats[Any] } override def min(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._1 override def max(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._2 }
Example 85
Source File: ParquetHiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.dialect import java.util.concurrent.atomic.AtomicInteger import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.hive.{HiveDialect, HiveOps, HiveOutputStream} import io.eels.component.parquet._ import io.eels.component.parquet.util.{ParquetIterator, ParquetLogMute} import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe import org.apache.hadoop.hive.ql.io.parquet.{MapredParquetInputFormat, MapredParquetOutputFormat} import scala.math.BigDecimal.RoundingMode.RoundingMode case class ParquetHiveDialect(options: ParquetWriteOptions = ParquetWriteOptions()) extends HiveDialect with Logging with Using { override val serde: String = classOf[ParquetHiveSerDe].getCanonicalName override val inputFormat: String = classOf[MapredParquetInputFormat].getCanonicalName override val outputFormat: String = classOf[MapredParquetOutputFormat].getCanonicalName override def input(path: Path, ignore: StructType, projectionSchema: StructType, predicate: Option[Predicate]) (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] { val client = new HiveMetaStoreClient(new HiveConf) val ops = new HiveOps(client) override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { // convert the eel projection schema into a parquet schema which will be used by the native parquet reader try { val parquetProjectionSchema = ParquetSchemaFns.toParquetMessageType(projectionSchema) using(RowParquetReaderFn(path, predicate, parquetProjectionSchema.some, true)) { reader => val subscription = new Subscription { override def cancel(): Unit = reader.close() } subscriber.subscribed(subscription) ParquetIterator(reader).grouped(DataStream.DefaultBatchSize).foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } } override def output(schema: StructType, path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String]) (implicit fs: FileSystem, conf: Configuration): HiveOutputStream = { val path_x = path new HiveOutputStream { ParquetLogMute() private val _records = new AtomicInteger(0) logger.debug(s"Creating parquet writer at $path") private val writer = RowParquetWriterFn(path, schema, metadata, true, roundingMode, fs.getConf) override def write(row: Row) { require(row.values.nonEmpty, "Attempting to write an empty row") writer.write(row) _records.incrementAndGet() } override def close(): Unit = { logger.debug(s"Closing hive parquet writer $path") writer.close() // after the files are closed, we should set permissions if we've been asked to, this allows // all the files we create to stay consistent permission.foreach(fs.setPermission(path, _)) } override def records: Int = _records.get() override def path: Path = path_x } } }
Example 86
Source File: OrcHiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.dialect import com.sksamuel.exts.Logging import io.eels.component.hive.{HiveDialect, HiveOutputStream} import io.eels.component.orc.{OrcPublisher, OrcWriteOptions, OrcWriter} import io.eels.datastream.{Publisher, Subscriber} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.io.orc.{OrcInputFormat, OrcOutputFormat, OrcSerde} import scala.math.BigDecimal.RoundingMode.RoundingMode case class OrcHiveDialect(options: OrcWriteOptions = OrcWriteOptions()) extends HiveDialect with Logging { override val serde: String = classOf[OrcSerde].getCanonicalName override val inputFormat: String = classOf[OrcInputFormat].getCanonicalName override val outputFormat: String = classOf[OrcOutputFormat].getCanonicalName override def input(path: Path, metastoreSchema: StructType, projectionSchema: StructType, predicate: Option[Predicate]) (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { new OrcPublisher(path, projectionSchema.fieldNames(), predicate).subscribe(subscriber) } } override def output(schema: StructType, path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String])(implicit fs: FileSystem, conf: Configuration): HiveOutputStream = { val path_x = path val writer = new OrcWriter(path, schema, options) new HiveOutputStream { override def write(row: Row): Unit = { require(row.values.nonEmpty, "Attempting to write an empty row") writer.write(row) } override def close(): Unit = { writer.close() permission.foreach(fs.setPermission(path, _)) } override def records: Int = writer.records override def path: Path = path_x } } }
Example 87
Source File: HiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import io.eels.component.hive.dialect.{OrcHiveDialect, ParquetHiveDialect} import io.eels.datastream.Publisher import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.api.Table import scala.math.BigDecimal.RoundingMode.RoundingMode trait HiveDialect extends Logging { def serde: String def inputFormat: String def outputFormat: String def output(schema: StructType, // schema without partition information path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String]) (implicit fs: FileSystem, conf: Configuration): HiveOutputStream def stats(getPath: Path)(implicit fs: FileSystem): Long = throw new UnsupportedOperationException } object HiveDialect extends Logging { def apply(format: String): HiveDialect = format match { case input if input.contains("ParquetInputFormat") => ParquetHiveDialect() case input if input.contains("OrcInputFormat") => OrcHiveDialect() //case input if input.contains("AvroHiveDialect") || input.contains("AvroContainerInputFormat") => AvroHiveDialect // "org.apache.hadoop.mapred.TextInputFormat" -> TextHiveDialect case _ => throw new UnsupportedOperationException(s"Unknown hive input format $format") } def apply(table: Table): HiveDialect = { val format = table.getSd.getInputFormat logger.debug(s"Table format is $format") val dialect = HiveDialect(format) logger.debug(s"HiveDialect is $dialect") dialect } }
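HiveDialect.apply selects a dialect by matching on the table's input format class name, as the pattern match above shows. The sketch below illustrates that selection directly; the fully qualified class names are taken from the imports in the two dialect files shown earlier, and the object name is hypothetical.

package io.eels.component.hive

import io.eels.component.hive.dialect.{OrcHiveDialect, ParquetHiveDialect}

object HiveDialectSketch extends App {
  // the factory matches on substrings of the input format class name
  val parquet = HiveDialect("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
  val orc = HiveDialect("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")

  assert(parquet.isInstanceOf[ParquetHiveDialect])
  assert(orc.isInstanceOf[OrcHiveDialect])

  // any other format string falls through to the UnsupportedOperationException branch above
}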
Example 88
Source File: ParquetVsOrcSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.io.File import java.math.MathContext import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.orc.{OrcSink, OrcSource} import io.eels.component.parquet.{ParquetSink, ParquetSource} import io.eels.datastream.DataStream import io.eels.schema._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.math.BigDecimal.RoundingMode import scala.util.Random object ParquetVsOrcSpeedTest extends App with Timed { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val size = 5000000 val structType = StructType( Field("name", StringType), Field("age", IntType.Signed), Field("height", DoubleType), Field("amazing", BooleanType), Field("fans", LongType.Signed), Field("rating", DecimalType(4, 2)) ) def iter: Iterator[Vector[Any]] = Iterator.continually(Vector( Random.nextString(10), Random.nextInt(), Random.nextDouble(), Random.nextBoolean(), Random.nextLong(), BigDecimal(Random.nextDouble(), new MathContext(4)).setScale(2, RoundingMode.UP) )) def ds: DataStream = DataStream.fromIterator(structType, iter.take(size).map(Row(structType, _))) val ppath = new Path("parquet_speed.pq") fs.delete(ppath, false) val opath = new Path("orc_speed.orc") fs.delete(opath, false) new File(ppath.toString).deleteOnExit() new File(opath.toString).deleteOnExit() timed("Orc Insertion") { ds.to(OrcSink(opath)) } timed("Parquet Insertion") { ds.to(ParquetSink(ppath)) } while (true) { timed("Reading with OrcSource") { val actual = OrcSource(opath).toDataStream().size assert(actual == size, s"$actual != $size") } timed("Reading with ParquetSource") { val actual = ParquetSource(ppath).toDataStream().size assert(actual == size, s"$actual != $size") } } }
Example 89
Source File: HiveTableFilesFnTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.nio.file.Paths import com.sksamuel.exts.Logging import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hdfs.MiniDFSCluster import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.Table import org.scalatest.mockito.MockitoSugar import org.scalatest.{FlatSpec, Matchers} class HiveTableFilesFnTest extends FlatSpec with Matchers with Logging with MockitoSugar { System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA) val clusterPath = Paths.get("miniclusters", "cluster") val conf = new Configuration() conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, clusterPath.toAbsolutePath.toString) val cluster = new MiniDFSCluster.Builder(conf).build() implicit val fs = cluster.getFileSystem "HiveTableFilesFn" should "detect all files in root when no partitions" in { implicit val client = mock[IMetaStoreClient] org.mockito.Mockito.when(client.getTable("default", "mytable")).thenReturn(new Table) val root = new Path("tab1") fs.mkdirs(root) // table scanner will skip 0 length files val a = fs.create(new Path(root, "a")) a.write(1) a.close() val b = fs.create(new Path(root, "b")) b.write(1) b.close() HiveTableFilesFn("default", "mytable", fs.resolvePath(root), Nil).values.flatten.map(_.getPath.getName).toSet shouldBe Set("a", "b") } it should "ignore hidden files in root when no partitions" in { implicit val client = mock[IMetaStoreClient] org.mockito.Mockito.when(client.getTable("default", "mytable")).thenReturn(new Table) val root = new Path("tab2") fs.mkdirs(root) // table scanner will skip 0 length files val a = fs.create(new Path(root, "a")) a.write(1) a.close() val b = fs.create(new Path(root, "_b")) b.write(1) b.close() HiveTableFilesFn("default", "mytable", fs.resolvePath(root), Nil).values.flatten.map(_.getPath.getName).toSet shouldBe Set("a") } }
Example 90
Source File: HiveBenchmarkApp.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.util.UUID import com.sksamuel.exts.metrics.Timed import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import scala.util.Random object HiveBenchmarkApp extends App with Timed { val states = List( "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming").map(_.replace(' ', '_').toLowerCase) import HiveConfig._ val schema = StructType("id", "state") val rows = List.fill(1000000)(List(UUID.randomUUID.toString, states(Random.nextInt(50)))) logger.info(s"Generated ${rows.size} rows") new HiveOps(client).createTable( "sam", "people", schema, List("state"), overwrite = true ) logger.info("Table created") val sink = HiveSink("sam", "people") DataStream.fromValues(schema, rows).to(sink) logger.info("Write complete") while (true) { timed("datastream took") { val result = HiveSource("sam", "people").toDataStream().collect println(result.size) } } }
Example 91
Source File: OrcWriter.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.util.concurrent.atomic.AtomicInteger import java.util.function.IntUnaryOperator import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.Row import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.exec.vector.ColumnVector import org.apache.orc.{OrcConf, OrcFile, TypeDescription} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer // performs the actual write out of orc data, to be used by an orc sink class OrcWriter(path: Path, structType: StructType, options: OrcWriteOptions)(implicit conf: Configuration) extends Logging { private val schema: TypeDescription = OrcSchemaFns.toOrcSchema(structType) logger.trace(s"Creating orc writer for schema $schema") private val batchSize = { val size = ConfigFactory.load().getInt("eel.orc.sink.batchSize") Math.max(Math.min(1024, size), 1) } logger.debug(s"Orc writer will use batchsize=$batchSize") private val buffer = new ArrayBuffer[Row](batchSize) private val serializers = schema.getChildren.asScala.map(OrcSerializer.forType).toArray private val batch = schema.createRowBatch(batchSize) OrcConf.COMPRESSION_STRATEGY.setString(conf, options.compressionStrategy.name) OrcConf.COMPRESS.setString(conf, options.compressionKind.name) options.encodingStrategy.map(_.name).foreach(OrcConf.ENCODING_STRATEGY.setString(conf, _)) options.compressionBufferSize.foreach(OrcConf.BUFFER_SIZE.setLong(conf, _)) private val woptions = OrcFile.writerOptions(conf).setSchema(schema) options.rowIndexStride.foreach { size => woptions.rowIndexStride(size) logger.debug(s"Using stride size = $size") } if (options.bloomFilterColumns.nonEmpty) { woptions.bloomFilterColumns(options.bloomFilterColumns.mkString(",")) logger.debug(s"Using bloomFilterColumns = $options.bloomFilterColumns") } private lazy val writer = OrcFile.createWriter(path, woptions) private val counter = new AtomicInteger(0) def write(row: Row): Unit = { buffer.append(row) if (buffer.size == batchSize) flush() } def records: Int = counter.get() def flush(): Unit = { def writecol[T <: ColumnVector](rowIndex: Int, colIndex: Int, row: Row): Unit = { val value = row.values(colIndex) val vector = batch.cols(colIndex).asInstanceOf[T] val serializer = serializers(colIndex).asInstanceOf[OrcSerializer[T]] serializer.writeToVector(rowIndex, vector, value) } // don't use foreach here, using old school for loops for perf for (rowIndex <- buffer.indices) { val row = buffer(rowIndex) for (colIndex <- batch.cols.indices) { writecol(rowIndex, colIndex, row) } } batch.size = buffer.size writer.addRowBatch(batch) counter.updateAndGet(new IntUnaryOperator { override def applyAsInt(operand: Int): Int = operand + batch.size }) buffer.clear() batch.reset() } def close(): Long = { if (buffer.nonEmpty) flush() writer.close() val count = writer.getNumberOfRows logger.info(s"Orc writer wrote $count rows") count } }
Example 92
Source File: OrcBatchIterator.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import com.sksamuel.exts.Logging import io.eels.{Predicate, Row} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector import org.apache.orc.Reader object OrcBatchIterator extends Logging { def apply(reader: Reader, fileSchema: StructType, projection: Seq[String], predicate: Option[Predicate]) (implicit conf: Configuration): Iterator[Seq[Row]] = new Iterator[Seq[Row]] { val options = new Reader.Options() // if we have a projection then we need to return a schema that matches // the projection and not the full file schema val schema = if (projection.isEmpty) fileSchema else { val fields = projection.flatMap(name => fileSchema.field(name)) StructType(fields) } logger.trace(s"Orc read will use projection=$schema") // a projection is column index based, so the given projection columns must be // resolved against the file schema to work out which column indices are required if (projection.nonEmpty) { // we have to include a true for the containing struct itself val includes = true +: fileSchema.fieldNames.map(projection.contains) logger.debug(s"Setting included columns=${includes.mkString(",")}") options.include(includes.toArray) } val searchArg = predicate.foreach { predicate => val searchArg = OrcPredicateBuilder.build(predicate) options.searchArgument(searchArg, predicate.fields.toArray) logger.info(s"Setting predicate=$searchArg") } // if true then the predicate is applied to rows as well as being pushed down into the stripes, // this is because orc will either skip a stripe or return the whole stripe. // it is useful to disable for unit testing val rowLevelFilter = conf.get("eel.orc.predicate.row.filter", "true") != "false" logger.debug(s"Row level filtering = $rowLevelFilter") val batch = reader.getSchema().createRowBatch() val rows = reader.rows(options) val vector = new StructColumnVector(batch.numCols, batch.cols: _*) val projectionIndices = schema.fields.map(fileSchema.indexOf) val deserializer = new StructDeserializer(schema.fields, projectionIndices) override def hasNext(): Boolean = rows.nextBatch(batch) && !batch.endOfFile && batch.size > 0 override def next(): Seq[Row] = { val rows = Vector.newBuilder[Row] for (rowIndex <- 0 until batch.size) { val values = deserializer.readFromVector(rowIndex, vector) val row = Row(schema, values) if (rowLevelFilter && predicate.isDefined) { if (predicate.get.eval(row)) { rows += row } } else { rows += row } } batch.reset() rows.result() } } }
Example 93
Source File: OrcSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.orc.OrcFile.ReaderOptions import org.apache.orc._ import scala.collection.JavaConverters._ object OrcSource { def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): OrcSource = apply(FilePattern(path)) def apply(str: String)(implicit fs: FileSystem, conf: Configuration): OrcSource = apply(FilePattern(str)) } case class OrcSource(pattern: FilePattern, projection: Seq[String] = Nil, predicate: Option[Predicate] = None) (implicit fs: FileSystem, conf: Configuration) extends Source with Using { override def parts(): Seq[Publisher[Seq[Row]]] = pattern.toPaths().map(new OrcPublisher(_, projection, predicate)) def withPredicate(predicate: Predicate): OrcSource = copy(predicate = predicate.some) def withProjection(first: String, rest: String*): OrcSource = withProjection(first +: rest) def withProjection(fields: Seq[String]): OrcSource = { require(fields.nonEmpty) copy(projection = fields.toList) } override def schema: StructType = { val reader = OrcFile.createReader(pattern.toPaths().head, new ReaderOptions(conf)) val schema = reader.getSchema OrcSchemaFns.fromOrcType(schema).asInstanceOf[StructType] } private def reader() = { val options = new ReaderOptions(conf) OrcFile.createReader(pattern.toPaths().head, options) } def count(): Long = reader().getNumberOfRows def statistics(): Seq[ColumnStatistics] = reader().getStatistics.toVector def stripes(): Seq[StripeInformation] = reader().getStripes.asScala def stripeStatistics(): Seq[StripeStatistics] = reader().getStripeStatistics.asScala } class OrcPublisher(path: Path, projection: Seq[String], predicate: Option[Predicate])(implicit conf: Configuration) extends Publisher[Seq[Row]] { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { val reader = OrcFile.createReader(path, new ReaderOptions(conf)) val fileSchema = OrcSchemaFns.fromOrcType(reader.getSchema).asInstanceOf[StructType] val iterator: Iterator[Row] = OrcBatchIterator(reader, fileSchema, projection, predicate).flatten val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) iterator.grouped(DataStream.DefaultBatchSize).takeWhile(_ => running.get).foreach(subscriber.next) subscriber.completed() } catch { case t: Throwable => subscriber.error(t) } } }
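A minimal standalone sketch of reading ORC with a projection and a pushed-down predicate, built only from the OrcSource API above and the Predicate helpers used in the OrcPredicateTest example below; the people.orc path and the object name are hypothetical.

package io.eels.component.orc

import io.eels.Predicate
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object OrcSourceSketch extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(new Configuration())

  val path = new Path("people.orc") // hypothetical ORC file written earlier

  // read two columns; the predicate is converted to an ORC SearchArgument by OrcBatchIterator above
  val rows = OrcSource(path)
    .withProjection("name", "age")
    .withPredicate(Predicate.gt("age", 30L))
    .toDataStream()
    .collect

  rows.foreach(println)

  // file-level metadata is also available directly from the source
  println(OrcSource(path).count())
}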
Example 94
Source File: OrcSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.config.ConfigSupport import com.typesafe.config.ConfigFactory import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.orc.OrcFile.{CompressionStrategy, EncodingStrategy} import org.apache.orc.OrcProto.CompressionKind case class OrcWriteOptions(overwrite: Boolean = false, compressionKind: CompressionKind, compressionStrategy: CompressionStrategy, compressionBufferSize: Option[Int], encodingStrategy: Option[EncodingStrategy], bloomFilterColumns: Seq[String] = Nil, permission: Option[FsPermission] = None, inheritPermissions: Option[Boolean] = None, rowIndexStride: Option[Int] = None) { def withCompressionKind(kind: CompressionKind): OrcWriteOptions = copy(compressionKind = kind) def withCompressionStrategy(strategy: CompressionStrategy): OrcWriteOptions = copy(compressionStrategy = strategy) def withCompressionBufferSize(size: Int): OrcWriteOptions = copy(compressionBufferSize = size.some) def withEncodingStrategy(strategy: EncodingStrategy): OrcWriteOptions = copy(encodingStrategy = strategy.some) def withBloomFilterColumns(bloomFilterColumns: Seq[String]): OrcWriteOptions = copy(bloomFilterColumns = bloomFilterColumns) def withRowIndexStride(stride: Int): OrcWriteOptions = copy(rowIndexStride = stride.some) def withOverwrite(overwrite: Boolean): OrcWriteOptions = copy(overwrite = overwrite) def withPermission(permission: FsPermission): OrcWriteOptions = copy(permission = permission.some) def withInheritPermission(inheritPermissions: Boolean): OrcWriteOptions = copy(inheritPermissions = inheritPermissions.some) } object OrcWriteOptions extends ConfigSupport { // creates a config from the typesafe reference.confs def apply(): OrcWriteOptions = { val config = ConfigFactory.load() OrcWriteOptions( false, CompressionKind valueOf config.getString("eel.orc.writer.compression-kind"), CompressionStrategy valueOf config.getString("eel.orc.writer.compression-strategy"), config.getIntOpt("eel.orc.writer.compression-buffer-size"), config.getStringOpt("eel.orc.writer.encoding-strategy").map(EncodingStrategy.valueOf) ) } } case class OrcSink(path: Path, options: OrcWriteOptions = OrcWriteOptions()) (implicit fs: FileSystem, conf: Configuration) extends Sink with Logging { // -- convenience options -- def withCompressionKind(kind: CompressionKind): OrcSink = copy(options = options.copy(compressionKind = kind)) def withCompressionStrategy(strategy: CompressionStrategy): OrcSink = copy(options = options.copy(compressionStrategy = strategy)) def withCompressionBufferSize(size: Int): OrcSink = copy(options = options.copy(compressionBufferSize = size.some)) def withEncodingStrategy(strategy: EncodingStrategy): OrcSink = copy(options = options.copy(encodingStrategy = strategy.some)) def withBloomFilterColumns(bloomFilterColumns: Seq[String]): OrcSink = copy(options = options.copy(bloomFilterColumns = bloomFilterColumns)) def withRowIndexStride(stride: Int): OrcSink = copy(options = options.copy(rowIndexStride = stride.some)) def withOverwrite(overwrite: Boolean): OrcSink = copy(options = options.copy(overwrite = overwrite)) def withPermission(permission: FsPermission): OrcSink = copy(options = options.copy(permission = permission.some)) def withInheritPermission(inheritPermissions: Boolean): OrcSink = 
copy(options = options.copy(inheritPermissions = inheritPermissions.some)) override def open(schema: StructType, n: Int): Seq[SinkWriter] = { if (n == 1) Seq(create(schema, path)) else List.tabulate(n) { k => create(schema, new Path(path.getParent, path.getName + "_" + k)) } } override def open(schema: StructType): SinkWriter = create(schema, path) private def create(schema: StructType, path: Path): SinkWriter = new SinkWriter { if (options.overwrite && fs.exists(path)) fs.delete(path, false) val writer = new OrcWriter(path, schema, options) override def write(row: Row): Unit = writer.write(row) override def close(): Unit = { writer.close() options.permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (options.inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } }
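A minimal usage sketch for the sink above (the output path and compression choice are illustrative, and ds stands for an io.eels DataStream built elsewhere):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.orc.OrcProto.CompressionKind

implicit val conf = new Configuration()
implicit val fs = FileSystem.getLocal(conf)

// Each withXxx call returns a copy of the sink with the option applied
val sink = OrcSink(new Path("people.orc"))
  .withCompressionKind(CompressionKind.ZLIB)
  .withOverwrite(true)

// ds is assumed to exist; to(sink) writes the stream through OrcWriter as shown above
ds.to(sink)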
Example 95
Source File: OrcMultipleFileSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.datastream.DataStream import io.eels.schema.StructType import io.eels.{FilePattern, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object OrcMultipleFileSpeedTest extends App with Timed { val size = 5000000 val count = 20 val schema = StructType("a", "b", "c", "d", "e") def createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val dir = new Path("orc-speed-test") new File(dir.toString).mkdirs() timed("Insertion") { val ds = DataStream.fromIterator(schema, Iterator.continually(createRow).take(size)) new File(dir.toString).listFiles().foreach(_.delete) ds.to(OrcSink(new Path("orc-speed-test/orc_speed.pq")).withOverwrite(true), count) } for (_ <- 1 to 25) { assert(count == FilePattern("orc-speed-test/*").toPaths().size) timed("Reading with OrcSource") { val actual = OrcSource("orc-speed-test/*").toDataStream().map { row => row }.filter(_ => true).size assert(actual == size, s"Expected $size but was $actual") } println("") println("---------") println("") } }
Example 96
Source File: OrcPredicateTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.io.{File, FilenameFilter} import io.eels.Predicate import io.eels.datastream.DataStream import io.eels.schema.{Field, LongType, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} class OrcPredicateTest extends FlatSpec with Matchers with BeforeAndAfterAll { cleanUpResidualOrcTestFiles val schema = StructType( Field("name", StringType, nullable = true), Field("city", StringType, nullable = true), Field("age", LongType.Signed, nullable = true) ) val values = Vector.fill(1000) { Vector("sam", "middlesbrough", 37) } ++ Vector.fill(1000) { Vector("laura", "iowa city", 24) } val ds = DataStream.fromValues(schema, values) implicit val conf = new Configuration() implicit val fs = FileSystem.get(new Configuration()) val path = new Path("test.orc") if (fs.exists(path)) fs.delete(path, false) new File(path.toString).deleteOnExit() ds.to(OrcSink(path).withRowIndexStride(1000)) override protected def afterAll(): Unit = fs.delete(path, false) "OrcSource" should "support string equals predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.equals("name", "sam")).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("sam", "middlesbrough", 37L)) } it should "support gt predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.gt("age", 30L)).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("sam", "middlesbrough", 37L)) } it should "support lt predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.lt("age", 30)).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("laura", "iowa city", 24L)) } it should "enable row level filtering with predicates by default" in { conf.set("eel.orc.predicate.row.filter", "true") val rows = OrcSource(path).withPredicate(Predicate.equals("name", "sam")).toDataStream().collect rows.head.schema shouldBe schema rows.head.values shouldBe Vector("sam", "middlesbrough", 37L) } private def cleanUpResidualOrcTestFiles = { new File(".").listFiles(new FilenameFilter { override def accept(dir: File, name: String): Boolean = { (name.startsWith("test_") && name.endsWith(".orc")) || (name.startsWith(".test_") && name.endsWith(".orc.crc")) } }).foreach(_.delete()) } }
Example 97
Source File: InputFormatConf.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.hadoop.io.{ LongWritable, Text, Writable } import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader } import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat } import scala.collection.immutable trait InputFormatConf[K, V] extends Serializable { type IF <: InputFormat[K, V] type Split <: InputSplit with Writable type KExtract <: Extract[K] type VExtract <: Extract[V] def kExtract: KExtract def vExtract: VExtract def makeInputFormat(): IF // I'm unsure if we should WriSer them for them def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]] // TODO do we want to require typing of the RecordReader as well? final def createRecordReader(hadoopConf: Configuration, split: Split, inputFormat: IF = makeInputFormat()): RecordReader[K, V] = { val tac = ConfOnlyTAC(hadoopConf) val recordReader = inputFormat.createRecordReader(split, tac) recordReader.initialize(split, tac) recordReader } } case class TextInputFormatConf(file: String, partitions: Int) extends InputFormatConf[LongWritable, Text] { type IF = TextInputFormat type Split = FileSplit // TODO now that we figured out what's up, see if we can't eliminate the need for this... val internalK = Extract.unit[LongWritable] val internalV = Extract.text type KExtract = internalK.type type VExtract = internalV.type override val kExtract: KExtract = internalK override val vExtract: VExtract = internalV def makeInputFormat() = new TextInputFormat() def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = { val job = Job.getInstance(hadoopConf) FileInputFormat.setInputPaths(job, file) val path = new Path(file) val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen val size_per = math.round(len / partitions.toDouble) ((0 until partitions - 1).map { p => new FileSplit(path, size_per * p, size_per, null) } :+ { val fin = size_per * (partitions - 1) new FileSplit(path, fin, len - fin, null) }).map(WriSer(_)) } } // TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf object CSVInputFormatConf { def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] { type IF = ifc.IF type Split = ifc.Split type KExtract = ifc.KExtract type VExtract = ifc.VExtract } = new InputFormatConf[LongWritable, V] { type IF = ifc.IF type Split = ifc.Split type KExtract = ifc.KExtract type VExtract = ifc.VExtract override val kExtract: KExtract = ifc.kExtract override val vExtract: VExtract = ifc.vExtract override def makeInputFormat() = ifc.makeInputFormat() override def makeSplits(hadoopConf: Configuration) = { val splits = ifc.makeSplits(hadoopConf) splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) { case WriSer(head) => val rr = createRecordReader(hadoopConf, head) require(rr.nextKeyValue, "csv has no header, first line was empty") val afterHeader = rr.getCurrentKey.get require(rr.nextKeyValue, "first split is empty") WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +: splits.tail } } } }
Example 98
Source File: Hadoop.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkContext import com.twosigma.flint.rdd.Range import grizzled.slf4j.Logger object Hadoop { val logger = Logger(Hadoop.getClass) def fileSplits[K1, V1, K: Ordering]( sc: SparkContext, file: String, ifConf: InputFormatConf[K1, V1] // TODO consider just straight up making this (K, K) as we CAN get it, it's just a pain. )(parseKey: (ifConf.KExtract#Extracted, ifConf.VExtract#Extracted) => K): Map[Int, (Range[K], WriSer[ifConf.Split])] = { val splits = ifConf.makeSplits(new Configuration()) logger.info(s"Total number of splits: ${splits.size}") splits.foreach { s => logger.debug(s.get.toString) } // TODO implement the version which does the more rigorous thing, at least for splits that // support it val m = getSplitTimes(sc, ifConf)(parseKey, splits) .sortBy(_._1) .zip(splits) .map { case ((index, time), split) => (index, (time, split)) } .toMap m.map { case (k, (b, w)) => (k, (Range(b, m.get(k + 1).map(_._1)), w)) } } def getSplitTimes[K1, V1, K]( sc: SparkContext, ifConf: InputFormatConf[K1, V1] )( parseKey: (ifConf.KExtract#Extracted, ifConf.VExtract#Extracted) => K, splits: Seq[WriSer[ifConf.Split]] ): Vector[(Int, K)] = sc.parallelize(splits.zipWithIndex).map { case (serSplit, num) => val (a, b) = readRecords(ifConf)(serSplit).next val time = parseKey(a, b) Vector((num, time)) }.reduce(_ ++ _) def readRecords[K, V](ifConf: InputFormatConf[K, V])( serSplit: WriSer[ifConf.Split] ): Iterator[(ifConf.KExtract#Extracted, ifConf.VExtract#Extracted)] = { val inputFormat = ifConf.makeInputFormat() val split = serSplit.get val tac = ConfOnlyTAC(new Configuration()) val recordReader = inputFormat.createRecordReader(split, tac) recordReader.initialize(split, tac) logger.info(s"Beginning to read lines from split: $split") new Iterator[(ifConf.KExtract#Extracted, ifConf.VExtract#Extracted)] { var stillMore = false lazy val init = stillMore = recordReader.nextKeyValue() override def hasNext = { init stillMore } override def next = { init if (!stillMore) sys.error("hit end of iterator") val toReturn = ( ifConf.kExtract(recordReader.getCurrentKey), ifConf.vExtract(recordReader.getCurrentValue) ) stillMore = recordReader.nextKeyValue toReturn } } } }
Example 99
Source File: ConfOnlyTAC.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.{ Counter, TaskAttemptID, Job, TaskAttemptContext } // This exists just because of a quirk of the record reader api. case class ConfOnlyTAC(_conf: Configuration) extends Job with TaskAttemptContext { // JobContextImpl and JobContext override def getConfiguration: Configuration = _conf // TaskAttemptContext override def getTaskAttemptID: TaskAttemptID = sys.error("not implemented") override def setStatus(msg: String): Unit = sys.error("not implemented") override def getStatus = sys.error("not implemented") override def getProgress: Float = sys.error("not implemented") override def getCounter(counterName: Enum[_]): Counter = sys.error("not implemented") override def getCounter(groupName: String, counterName: String): Counter = sys.error("not implemented") // Progressable override def progress(): Unit = sys.error("not implemented") }
Example 100
Source File: RMCallbackHandler.scala From DataXServer with Apache License 2.0 | 5 votes |
package org.tianlangstudio.data.hamal.yarn import java.io.File import java.util.{Collections, List} import org.tianlangstudio.data.hamal.core.{Constants, HamalConf} import org.tianlangstudio.data.hamal.core.HamalConf //import java.util.Collections import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path, FileContext} import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.client.api.{AMRMClient, NMClient} import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import scala.jdk.CollectionConverters._ //import scala.collection.JavaConverters._ /** * Created by zhuhq on 2016/4/29. */ class RMCallbackHandler(nmClient:NMClient,containerCmd:Container => String,hamalConf: HamalConf,yarnConfiguration: Configuration) extends AMRMClientAsync.CallbackHandler { private val logging = org.slf4j.LoggerFactory.getLogger(classOf[RMCallbackHandler]) override def onContainersCompleted(statuses: List[ContainerStatus]): Unit = { for(containerStatus <- statuses.asScala) { logging.info(s"containerId:${containerStatus} exitStatus:${containerStatus}") } } override def onError(e: Throwable): Unit = { logging.error("on error",e) } override def getProgress: Float = { 0 } override def onShutdownRequest(): Unit = { logging.info("on shutdown request") } override def onNodesUpdated(updatedNodes: List[NodeReport]): Unit = { logging.info("on nodes updated") for(nodeReport <- updatedNodes.asScala) { logging.info(s"node id:${nodeReport} node labels:${nodeReport}"); } } override def onContainersAllocated(containers: List[Container]): Unit = { logging.info("on containers allocated"); for (container:Container <- containers.asScala) { try { // Launch container by create ContainerLaunchContext val ctx = Records.newRecord(classOf[ContainerLaunchContext]); //ctx.setCommands(Collections.singletonList(""" echo "begin";sleep 900;echo "end"; """)) ctx.setCommands(Collections.singletonList(containerCmd(container))) val packagePath = hamalConf.getString(Constants.DATAX_EXECUTOR_FILE,"executor.zip"); val archiveStat = FileSystem.get(yarnConfiguration).getFileStatus(new Path(packagePath)) val packageUrl = ConverterUtils.getYarnUrlFromPath( FileContext.getFileContext.makeQualified(new Path(packagePath))); val packageResource = Records.newRecord[LocalResource](classOf[LocalResource]) packageResource.setResource(packageUrl); packageResource.setSize(archiveStat.getLen); packageResource.setTimestamp(archiveStat.getModificationTime); packageResource.setType(LocalResourceType.ARCHIVE); packageResource.setVisibility(LocalResourceVisibility.APPLICATION) ctx.setLocalResources(Collections.singletonMap(Constants.DATAX_EXECUTOR_ARCHIVE_FILE_NAME,packageResource)) logging.info("[AM] Launching container " + container.getId()); nmClient.startContainer(container, ctx); } catch { case ex:Exception => logging.info("[AM] Error launching container " + container.getId() + " " + ex); } } } }
Example 101
Source File: HbRddConfig.scala From hbrdd with Apache License 2.0 | 5 votes |
package top.spoofer.hbrdd.config

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration

class HbRddConfig(config: Configuration) extends Serializable {
  def getHbaseConfig = HBaseConfiguration.create(config)
}

object HbRddConfig {
  type configOption = (String, String)

  private[HbRddConfig] case class HbaseOption(name: String, value: String)

  def apply(config: Configuration): HbRddConfig = new HbRddConfig(config)

  def apply(configs: configOption*): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()

    for {
      option <- configs
      hbOption = HbaseOption(option._1, option._2) // the extra case class is only there to make the intent clearer
    } hbConfig.set(hbOption.name, hbOption.value)

    this.apply(hbConfig)
  }

  def apply(configs: { def rootDir: String; def quorum: String }): HbRddConfig = {
    apply(
      "hbase.rootdir" -> configs.rootDir,
      "hbase.zookeeper.quorum" -> configs.quorum
    )
  }

  def apply(configs: Map[String, String]): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()
    configs.keys foreach { name => hbConfig.set(name, configs(name)) }
    this.apply(hbConfig)
  }

  def apply(configs: TraversableOnce[configOption]): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()
    configs foreach { option =>
      val hbOption = HbaseOption(option._1, option._2)
      hbConfig.set(hbOption.name, hbOption.value)
    }
    this.apply(hbConfig)
  }
}
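A short sketch of how the overloaded apply methods above might be called (the quorum and root-dir values are placeholders):

import org.apache.hadoop.hbase.HBaseConfiguration
import top.spoofer.hbrdd.config.HbRddConfig

// From an existing HBase/Hadoop Configuration
val fromConf = HbRddConfig(HBaseConfiguration.create())

// From individual key/value pairs
val fromPairs = HbRddConfig(
  "hbase.rootdir" -> "hdfs://namenode:8020/hbase",   // placeholder
  "hbase.zookeeper.quorum" -> "zk1,zk2,zk3"          // placeholder
)

// From a Map of settings
val fromMap = HbRddConfig(Map("hbase.zookeeper.quorum" -> "zk1,zk2,zk3"))

val hbaseConf = fromPairs.getHbaseConfig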
Example 102
Source File: KerberosLoginProvider.scala From rokku with Apache License 2.0 | 5 votes |
package com.ing.wbaa.rokku.proxy.provider import java.io.File import com.ing.wbaa.rokku.proxy.config.KerberosSettings import com.typesafe.scalalogging.LazyLogging import org.apache.commons.lang.StringUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.UserGroupInformation import scala.util.{ Failure, Success, Try } trait KerberosLoginProvider extends LazyLogging { protected[this] def kerberosSettings: KerberosSettings loginUserFromKeytab(kerberosSettings.keytab, kerberosSettings.principal) private def loginUserFromKeytab(keytab: String, principal: String): Unit = { if (StringUtils.isNotBlank(keytab) && StringUtils.isNotBlank(principal)) { if (!new File(keytab).exists()) { logger.info("keytab file does not exist {}", keytab) } else { Try { UserGroupInformation.setConfiguration(new Configuration()) UserGroupInformation.loginUserFromKeytab(principal, keytab) } match { case Success(_) => logger.info("kerberos credentials provided {}", UserGroupInformation.getLoginUser) case Failure(exception) => logger.error("kerberos login error {}", exception) } } } else { logger.info("kerberos credentials are not provided") } } }
Example 103
Source File: ImageLoaderUtils.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders import java.awt.image.BufferedImage import java.io.{InputStream, ByteArrayInputStream} import java.net.URI import java.util.zip.GZIPInputStream import javax.imageio.ImageIO import keystoneml.loaders.VOCLoader._ import org.apache.commons.compress.archivers.ArchiveStreamFactory import org.apache.commons.compress.archivers.tar.TarArchiveInputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import keystoneml.pipelines.Logging import keystoneml.utils._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag object ImageLoaderUtils extends Logging { def loadFiles[L, I <: AbstractLabeledImage[L] : ClassTag]( filePathsRDD: RDD[URI], labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, // TODO(etrain): We can probably do this with implicits. namePrefix: Option[String] = None): RDD[I] = { filePathsRDD.flatMap(fileUri => loadFile(fileUri, labelsMap, imageBuilder, namePrefix)) } private def loadFile[L, I <: AbstractLabeledImage[L]]( fileUri: URI, labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, namePrefix: Option[String]): Iterator[I] = { val filePath = new Path(fileUri) val conf = new Configuration(true) val fs = FileSystem.get(filePath.toUri(), conf) val fStream = fs.open(filePath) val tarStream = new ArchiveStreamFactory().createArchiveInputStream( "tar", fStream).asInstanceOf[TarArchiveInputStream] var entry = tarStream.getNextTarEntry() val imgs = new ArrayBuffer[I] while (entry != null) { if (!entry.isDirectory && (namePrefix.isEmpty || entry.getName.startsWith(namePrefix.get))) { var offset = 0 var ret = 0 val content = new Array[Byte](entry.getSize().toInt) while (ret >= 0 && offset != entry.getSize()) { ret = tarStream.read(content, offset, content.length - offset) if (ret >= 0) { offset += ret } } val bais = new ByteArrayInputStream(content) val image = ImageUtils.loadImage(bais).map { img => imageBuilder(img, labelsMap(entry.getName), Some(entry.getName)) } imgs ++= image } entry = tarStream.getNextTarEntry() } imgs.iterator } }
Example 104
Source File: OrcFileOperator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import java.io.IOException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[hive] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None, ignoreCorruptFiles: Boolean = false) : Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => val reader = try { Some(OrcFile.createReader(fs, path)) } catch { case e: IOException => if (ignoreCorruptFiles) { logWarning(s"Skipped the footer in the corrupted file: $path", e) None } else { throw new SparkException(s"Could not read footer for file: $path", e) } } path -> reader }.collectFirst { case (path, Some(reader)) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration], ignoreCorruptFiles: Boolean) : Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.toIterator.map(getFileReader(_, conf, ignoreCorruptFiles)).collectFirst { case Some(reader) => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
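As a rough illustration, the schema helper above could be invoked like this from code inside the org.apache.spark.sql.hive package (the object is private[hive]); the ORC path is hypothetical:

import org.apache.hadoop.conf.Configuration

val maybeSchema = OrcFileOperator.readSchema(
  Seq("/warehouse/db/tbl/part-00000.orc"),   // hypothetical path
  Some(new Configuration()),
  ignoreCorruptFiles = false)

maybeSchema.foreach(schema => println(schema.treeString))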
Example 105
Source File: HiveExternalCatalogSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.types.StructType class HiveExternalCatalogSuite extends ExternalCatalogSuite { private val externalCatalog: HiveExternalCatalog = { val catalog = new HiveExternalCatalog(new SparkConf, new Configuration) catalog.client.reset() catalog } protected override val utils: CatalogTestUtils = new CatalogTestUtils { override val tableInputFormat: String = "org.apache.hadoop.mapred.SequenceFileInputFormat" override val tableOutputFormat: String = "org.apache.hadoop.mapred.SequenceFileOutputFormat" override def newEmptyCatalog(): ExternalCatalog = externalCatalog override val defaultProvider: String = "hive" } protected override def resetState(): Unit = { externalCatalog.client.reset() } import utils._ test("SPARK-18647: do not put provider in table properties for Hive serde table") { val catalog = newBasicCatalog() val hiveTable = CatalogTable( identifier = TableIdentifier("hive_tbl", Some("db1")), tableType = CatalogTableType.MANAGED, storage = storageFormat, schema = new StructType().add("col1", "int").add("col2", "string"), provider = Some("hive")) catalog.createTable(hiveTable, ignoreIfExists = false) val rawTable = externalCatalog.client.getTable("db1", "hive_tbl") assert(!rawTable.properties.contains(HiveExternalCatalog.DATASOURCE_PROVIDER)) assert(DDLUtils.isHiveTable(externalCatalog.getTable("db1", "hive_tbl"))) } Seq("parquet", "hive").foreach { format => test(s"Partition columns should be put at the end of table schema for the format $format") { val catalog = newBasicCatalog() val newSchema = new StructType() .add("col1", "int") .add("col2", "string") .add("partCol1", "int") .add("partCol2", "string") val table = CatalogTable( identifier = TableIdentifier("tbl", Some("db1")), tableType = CatalogTableType.MANAGED, storage = CatalogStorageFormat.empty, schema = new StructType() .add("col1", "int") .add("partCol1", "int") .add("partCol2", "string") .add("col2", "string"), provider = Some(format), partitionColumnNames = Seq("partCol1", "partCol2")) catalog.createTable(table, ignoreIfExists = false) val restoredTable = externalCatalog.getTable("db1", "tbl") assert(restoredTable.schema == newSchema) } } test("SPARK-22306: alter table schema should not erase the bucketing metadata at hive side") { val catalog = newBasicCatalog() externalCatalog.client.runSqlHive( """ |CREATE TABLE db1.t(a string, b string) |CLUSTERED BY (a, b) SORTED BY (a, b) INTO 10 BUCKETS |STORED AS PARQUET """.stripMargin) val newSchema = new StructType().add("a", "string").add("b", "string").add("c", "string") catalog.alterTableDataSchema("db1", "t", newSchema) assert(catalog.getTable("db1", "t").schema == newSchema) val bucketString = externalCatalog.client.runSqlHive("DESC FORMATTED db1.t") .filter(_.contains("Num Buckets")).head assert(bucketString.contains("10")) } test("SPARK-23001: NullPointerException when running desc database") { val catalog = newBasicCatalog() catalog.createDatabase(newDb("dbWithNullDesc").copy(description = null), ignoreIfExists = false) assert(catalog.getDatabase("dbWithNullDesc").description == "") } }
Example 106
Source File: HiveClientBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.util.VersionInfo import org.apache.spark.SparkConf import org.apache.spark.util.Utils private[client] object HiveClientBuilder { // In order to speed up test execution during development or in Jenkins, you can specify the path // of an existing Ivy cache: private val ivyPath: Option[String] = { sys.env.get("SPARK_VERSIONS_SUITE_IVY_PATH").orElse( Some(new File(sys.props("java.io.tmpdir"), "hive-ivy-cache").getAbsolutePath)) } private def buildConf(extraConf: Map[String, String]) = { lazy val warehousePath = Utils.createTempDir() lazy val metastorePath = Utils.createTempDir() metastorePath.delete() extraConf ++ Map( "javax.jdo.option.ConnectionURL" -> s"jdbc:derby:;databaseName=$metastorePath;create=true", "hive.metastore.warehouse.dir" -> warehousePath.toString) } // for testing only def buildClient( version: String, hadoopConf: Configuration, extraConf: Map[String, String] = Map.empty, sharesHadoopClasses: Boolean = true): HiveClient = { IsolatedClientLoader.forVersion( hiveMetastoreVersion = version, hadoopVersion = VersionInfo.getVersion, sparkConf = new SparkConf(), hadoopConf = hadoopConf, config = buildConf(extraConf), ivyPath = ivyPath, sharesHadoopClasses = sharesHadoopClasses).createClient() } }
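A sketch of how this test helper is typically driven (the metastore version string and the extra property are assumptions; the object is private[client], so callers live in the same package):

import org.apache.hadoop.conf.Configuration

val client = HiveClientBuilder.buildClient(
  version = "2.3",                                                       // assumed metastore version
  hadoopConf = new Configuration(),
  extraConf = Map("hive.metastore.schema.verification" -> "false"))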
Example 107
Source File: HiveVersionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import org.apache.hadoop.conf.Configuration import org.scalactic.source.Position import org.scalatest.Tag import org.apache.spark.SparkFunSuite import org.apache.spark.sql.hive.HiveUtils private[client] abstract class HiveVersionSuite(version: String) extends SparkFunSuite { override protected val enableAutoThreadAudit = false protected var client: HiveClient = null protected def buildClient( hadoopConf: Configuration, sharesHadoopClasses: Boolean = true): HiveClient = { // Hive changed the default of datanucleus.schema.autoCreateAll from true to false and // hive.metastore.schema.verification from false to true since 2.0 // For details, see the JIRA HIVE-6113 and HIVE-12463 if (version == "2.0" || version == "2.1" || version == "2.2" || version == "2.3") { hadoopConf.set("datanucleus.schema.autoCreateAll", "true") hadoopConf.set("hive.metastore.schema.verification", "false") } HiveClientBuilder.buildClient( version, hadoopConf, HiveUtils.formatTimeVarsForHiveClient(hadoopConf), sharesHadoopClasses = sharesHadoopClasses) } override def suiteName: String = s"${super.suiteName}($version)" override protected def test(testName: String, testTags: Tag*)(testFun: => Any) (implicit pos: Position): Unit = { super.test(s"$version: $testName", testTags: _*)(testFun) } }
Example 108
Source File: DataSourceManagerFactory.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.util.Utils object DataSourceManagerFactory { def create( datasourceType: String, conf: SparkConf, hadoopConf: Configuration): DataSourceManager = { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[DataSourceManager], loader) var cls: Class[_] = null // As we use ServiceLoader to support creating any user provided DataSourceManager here, // META-INF/services/org.apache.spark.sql.sources.DataSourceRegister must be packaged properly // in user's jar, and the implementation of DataSourceManager must have a public parameterless // constructor. For scala language, def this() = this(null...) just work. try { cls = serviceLoader.asScala .filter(_.shortName().equals(datasourceType)) .toList match { case head :: Nil => head.getClass case _ => throw new SparkException(s"error when instantiate datasource ${datasourceType}") } } catch { case _: Exception => throw new SparkException( s"""Can't find corresponding DataSourceManager for ${datasourceType} type, |please check |1. META-INF/services/org.apache.spark.sql.sources.DataSourceRegister is packaged |2. your implementation of DataSourceManager's shortname is ${datasourceType} |3. your implementation of DataSourceManager must have a public parameterless | constructor. For scala language, def this() = this(null, null, ...) just work. """.stripMargin) } try { val constructor = cls.getConstructor(classOf[SparkConf], classOf[Configuration]) val newHadoopConf = new Configuration(hadoopConf) constructor.newInstance(conf, newHadoopConf).asInstanceOf[DataSourceManager] } catch { case _: NoSuchMethodException => try { cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[DataSourceManager] } catch { case _: NoSuchMethodException => cls.getConstructor().newInstance().asInstanceOf[DataSourceManager] } } } }
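A hedged sketch of calling the factory above; "myds" is a hypothetical short name that would have to match a DataSourceManager registered under META-INF/services:

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkConf

val manager = DataSourceManagerFactory.create(
  datasourceType = "myds",          // hypothetical shortName of a registered implementation
  conf = new SparkConf(),
  hadoopConf = new Configuration())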
Example 109
Source File: CompressionCodecs.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util import java.util.Locale import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress._ import org.apache.spark.util.Utils object CompressionCodecs { private val shortCompressionCodecNames = Map( "none" -> null, "uncompressed" -> null, "bzip2" -> classOf[BZip2Codec].getName, "deflate" -> classOf[DeflateCodec].getName, "gzip" -> classOf[GzipCodec].getName, "lz4" -> classOf[Lz4Codec].getName, "snappy" -> classOf[SnappyCodec].getName) def setCodecConfiguration(conf: Configuration, codec: String): Unit = { if (codec != null) { conf.set("mapreduce.output.fileoutputformat.compress", "true") conf.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.toString) conf.set("mapreduce.output.fileoutputformat.compress.codec", codec) conf.set("mapreduce.map.output.compress", "true") conf.set("mapreduce.map.output.compress.codec", codec) } else { // This infers the option `compression` is set to `uncompressed` or `none`. conf.set("mapreduce.output.fileoutputformat.compress", "false") conf.set("mapreduce.map.output.compress", "false") } } }
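For instance, the helper above can switch gzip output compression on (or compression off) for a Hadoop Configuration, reusing a codec class that ships with Hadoop; a minimal sketch:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.GzipCodec

val conf = new Configuration()
// Enables both final-output and map-output compression with the gzip codec
CompressionCodecs.setCodecConfiguration(conf, classOf[GzipCodec].getName)

// Passing null (the mapping for "none"/"uncompressed") disables compression instead
CompressionCodecs.setCodecConfiguration(conf, null)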
Example 110
Source File: CodecStreams.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.{InputStream, OutputStream, OutputStreamWriter} import java.nio.charset.{Charset, StandardCharsets} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.compress._ import org.apache.hadoop.mapreduce.JobContext import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.util.ReflectionUtils import org.apache.spark.TaskContext object CodecStreams { private def getDecompressionCodec(config: Configuration, file: Path): Option[CompressionCodec] = { val compressionCodecs = new CompressionCodecFactory(config) Option(compressionCodecs.getCodec(file)) } def createInputStream(config: Configuration, file: Path): InputStream = { val fs = file.getFileSystem(config) val inputStream: InputStream = fs.open(file) getDecompressionCodec(config, file) .map(codec => codec.createInputStream(inputStream)) .getOrElse(inputStream) } def getCompressionExtension(context: JobContext): String = { getCompressionCodec(context) .map(_.getDefaultExtension) .getOrElse("") } }
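A small sketch of reading a possibly compressed file through the helper above; the path is a placeholder, and the decompression codec is inferred from the file extension by CompressionCodecFactory:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val conf = new Configuration()
val in = CodecStreams.createInputStream(conf, new Path("/data/events.json.gz"))   // placeholder path
try {
  println(in.read())   // read the first byte just to show the stream is usable
} finally {
  in.close()
}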
Example 111
Source File: HadoopFileLinesReader.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.Closeable import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader} import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl class HadoopFileLinesReader( file: PartitionedFile, lineSeparator: Option[Array[Byte]], conf: Configuration) extends Iterator[Text] with Closeable { def this(file: PartitionedFile, conf: Configuration) = this(file, None, conf) private val iterator = { val fileSplit = new FileSplit( new Path(new URI(file.filePath)), file.start, file.length, // TODO: Implement Locality Array.empty) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) val reader = lineSeparator match { case Some(sep) => new LineRecordReader(sep) // If the line separator is `None`, it covers `\r`, `\r\n` and `\n`. case _ => new LineRecordReader() } reader.initialize(fileSplit, hadoopAttemptContext) new RecordReaderIterator(reader) } override def hasNext: Boolean = iterator.hasNext override def next(): Text = iterator.next() override def close(): Unit = iterator.close() }
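Roughly how the reader above is consumed (a sketch; file stands for a PartitionedFile that Spark's file-scan planning is assumed to supply):

import org.apache.hadoop.conf.Configuration

val reader = new HadoopFileLinesReader(file, new Configuration())   // file: PartitionedFile assumed
try {
  reader.foreach(line => println(line.toString))   // each element is a Hadoop Text holding one line
} finally {
  reader.close()
}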
Example 112
Source File: CatalogFileIndex.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.StructType private class PrunedInMemoryFileIndex( sparkSession: SparkSession, tableBasePath: Path, fileStatusCache: FileStatusCache, override val partitionSpec: PartitionSpec, override val metadataOpsTimeNs: Option[Long]) extends InMemoryFileIndex( sparkSession, partitionSpec.partitions.map(_.path), Map.empty, Some(partitionSpec.partitionColumns), fileStatusCache)
Example 113
Source File: BasicWriteStatsTracker.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.FileNotFoundException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.{SparkContext, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.util.SerializableConfiguration class BasicWriteJobStatsTracker( serializableHadoopConf: SerializableConfiguration, @transient val metrics: Map[String, SQLMetric]) extends WriteJobStatsTracker { override def newTaskInstance(): WriteTaskStatsTracker = { new BasicWriteTaskStatsTracker(serializableHadoopConf.value) } override def processStats(stats: Seq[WriteTaskStats]): Unit = { val sparkContext = SparkContext.getActive.get var numPartitions: Long = 0L var numFiles: Long = 0L var totalNumBytes: Long = 0L var totalNumOutput: Long = 0L val basicStats = stats.map(_.asInstanceOf[BasicWriteTaskStats]) basicStats.foreach { summary => numPartitions += summary.numPartitions numFiles += summary.numFiles totalNumBytes += summary.numBytes totalNumOutput += summary.numRows } metrics(BasicWriteJobStatsTracker.NUM_FILES_KEY).add(numFiles) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_BYTES_KEY).add(totalNumBytes) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_ROWS_KEY).add(totalNumOutput) metrics(BasicWriteJobStatsTracker.NUM_PARTS_KEY).add(numPartitions) val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList) } } object BasicWriteJobStatsTracker { private val NUM_FILES_KEY = "numFiles" private val NUM_OUTPUT_BYTES_KEY = "numOutputBytes" private val NUM_OUTPUT_ROWS_KEY = "numOutputRows" private val NUM_PARTS_KEY = "numParts" def metrics: Map[String, SQLMetric] = { val sparkContext = SparkContext.getActive.get Map( NUM_FILES_KEY -> SQLMetrics.createMetric(sparkContext, "number of written files"), NUM_OUTPUT_BYTES_KEY -> SQLMetrics.createMetric(sparkContext, "bytes of written output"), NUM_OUTPUT_ROWS_KEY -> SQLMetrics.createMetric(sparkContext, "number of output rows"), NUM_PARTS_KEY -> SQLMetrics.createMetric(sparkContext, "number of dynamic part") ) } }
Example 114
Source File: HadoopFileWholeTextReader.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.Closeable import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.input.WholeTextFileRecordReader class HadoopFileWholeTextReader(file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable { private val iterator = { val fileSplit = new CombineFileSplit( Array(new Path(new URI(file.filePath))), Array(file.start), Array(file.length), // TODO: Implement Locality Array.empty[String]) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) val reader = new WholeTextFileRecordReader(fileSplit, hadoopAttemptContext, 0) reader.initialize(fileSplit, hadoopAttemptContext) new RecordReaderIterator(reader) } override def hasNext: Boolean = iterator.hasNext override def next(): Text = iterator.next() override def close(): Unit = iterator.close() }
Example 115
Source File: DataWritingCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker import org.apache.spark.sql.execution.datasources.FileFormatWriter import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.util.SerializableConfiguration def logicalPlanOutputWithNames( query: LogicalPlan, names: Seq[String]): Seq[Attribute] = { // Save the output attributes to a variable to avoid duplicated function calls. val outputAttributes = query.output assert(outputAttributes.length == names.length, "The length of provided names doesn't match the length of output attributes.") outputAttributes.zip(names).map { case (attr, outputName) => attr.withName(outputName) } } }
Example 116
Source File: FileStreamSink.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FileFormat, FileFormatWriter} import org.apache.spark.util.SerializableConfiguration object FileStreamSink extends Logging { // The name of the subdirectory that is used to store metadata about which files are valid. val metadataDir = "_spark_metadata" class FileStreamSink( sparkSession: SparkSession, path: String, fileFormat: FileFormat, partitionColumnNames: Seq[String], options: Map[String, String]) extends Sink with Logging { private val basePath = new Path(path) private val logPath = new Path(basePath, FileStreamSink.metadataDir) private val fileLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, logPath.toUri.toString) private val hadoopConf = sparkSession.sessionState.newHadoopConf() private def basicWriteJobStatsTracker: BasicWriteJobStatsTracker = { val serializableHadoopConf = new SerializableConfiguration(hadoopConf) new BasicWriteJobStatsTracker(serializableHadoopConf, BasicWriteJobStatsTracker.metrics) } override def addBatch(batchId: Long, data: DataFrame): Unit = { if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) { logInfo(s"Skipping already committed batch $batchId") } else { val committer = FileCommitProtocol.instantiate( className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass, jobId = batchId.toString, outputPath = path) committer match { case manifestCommitter: ManifestFileCommitProtocol => manifestCommitter.setupManifestOptions(fileLog, batchId) case _ => // Do nothing } // Get the actual partition columns as attributes after matching them by name with // the given columns names. val partitionColumns: Seq[Attribute] = partitionColumnNames.map { col => val nameEquality = data.sparkSession.sessionState.conf.resolver data.logicalPlan.output.find(f => nameEquality(f.name, col)).getOrElse { throw new RuntimeException(s"Partition column $col not found in schema ${data.schema}") } } val qe = data.queryExecution FileFormatWriter.write( sparkSession = sparkSession, plan = qe.executedPlan, fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec(path, Map.empty, qe.analyzed.output), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = None, statsTrackers = Seq(basicWriteJobStatsTracker), options = options) } } override def toString: String = s"FileSink[$path]" }
Example 117
Source File: StreamMetadata.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import java.util.ConcurrentModificationException import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileAlreadyExistsException, FSDataInputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.streaming.CheckpointFileManager.CancellableFSDataOutputStream import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: CancellableFSDataOutputStream = null try { val fileManager = CheckpointFileManager.create(metadataFile.getParent, hadoopConf) output = fileManager.createAtomic(metadataFile, overwriteIfPossible = false) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case e: FileAlreadyExistsException => if (output != null) { output.cancel() } throw new ConcurrentModificationException( s"Multiple streaming queries are concurrently using $metadataFile", e) case e: Throwable => if (output != null) { output.cancel() } logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } } }
Example 118
Source File: StreamMetadataSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.File import java.util.UUID import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.streaming.StreamTest class StreamMetadataSuite extends StreamTest { test("writing and reading") { withTempDir { dir => val id = UUID.randomUUID.toString val metadata = StreamMetadata(id) val file = new Path(new File(dir, "test").toString) StreamMetadata.write(metadata, file, hadoopConf) val readMetadata = StreamMetadata.read(file, hadoopConf) assert(readMetadata.nonEmpty) assert(readMetadata.get.id === id) } } test("read Spark 2.1.0 format") { // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0 assert( readForResource("query-metadata-logs-version-2.1.0.txt") === StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e")) } private def readForResource(fileName: String): StreamMetadata = { val input = getClass.getResource(s"/structured-streaming/$fileName") StreamMetadata.read(new Path(input.toString), hadoopConf).get } private val hadoopConf = new Configuration() }
Example 119
Source File: ExcelOutputWriter.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import java.math.BigDecimal import java.sql.Date import java.sql.Timestamp import java.text.DateFormat import java.text.SimpleDateFormat import java.util.Calendar import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.NullWritable import org.apache.hadoop.io.ArrayWritable import org.apache.hadoop.mapreduce.RecordWriter import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.fs.Path import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow } import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.types._ import org.zuinnote.hadoop.office.format.common.dao.SpreadSheetCellDAO import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration import org.zuinnote.hadoop.office.format.common.util.msexcel.MSExcelUtil import org.zuinnote.hadoop.office.format.mapreduce._ import org.apache.commons.logging.LogFactory import org.apache.commons.logging.Log import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration import java.util.Locale import java.text.DecimalFormat import org.zuinnote.hadoop.office.format.common.converter.ExcelConverterSimpleSpreadSheetCellDAO import java.text.NumberFormat // NOTE: This class is instantiated and used on executor side only, no need to be serializable. private[excel] class ExcelOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext, options: Map[String, String]) extends OutputWriter { def write(row: Row): Unit = { // check useHeader if (useHeader) { val headers = row.schema.fieldNames var i = 0 for (x <- headers) { val headerColumnSCD = new SpreadSheetCellDAO(x, "", "", MSExcelUtil.getCellAddressA1Format(currentRowNum, i), defaultSheetName) recordWriter.write(NullWritable.get(), headerColumnSCD) i += 1 } currentRowNum += 1 useHeader = false } // for each value in the row if (row.size>0) { var currentColumnNum = 0; val simpleObject = new Array[AnyRef](row.size) for (i <- 0 to row.size - 1) { // for each element of the row val obj = row.get(i) if ((obj.isInstanceOf[Seq[String]]) && (obj.asInstanceOf[Seq[String]].length==5)) { val formattedValue = obj.asInstanceOf[Seq[String]](0) val comment = obj.asInstanceOf[Seq[String]](1) val formula = obj.asInstanceOf[Seq[String]](2) val address = obj.asInstanceOf[Seq[String]](3) val sheetName = obj.asInstanceOf[Seq[String]](4) simpleObject(i) = new SpreadSheetCellDAO(formattedValue,comment,formula,address,sheetName) } else { simpleObject(i)=obj.asInstanceOf[AnyRef] } } // convert row to spreadsheetcellDAO val spreadSheetCellDAORow = simpleConverter.getSpreadSheetCellDAOfromSimpleDataType(simpleObject, defaultSheetName, currentRowNum) // write it for (x<- spreadSheetCellDAORow) { recordWriter.write(NullWritable.get(), x) } } currentRowNum += 1 } override def close(): Unit = { recordWriter.close(context) currentRowNum = 0; } }
Example 120
Source File: HadoopFileExcelReader.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import java.io.Closeable import java.net.URI import org.apache.spark.sql.execution.datasources._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.ArrayWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{ FileSplit, LineRecordReader } import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.sql.execution.datasources.RecordReaderIterator import org.zuinnote.hadoop.office.format.mapreduce.ExcelFileInputFormat import org.zuinnote.hadoop.office.format.mapreduce.ExcelRecordReader import org.apache.commons.logging.LogFactory import org.apache.commons.logging.Log class HadoopFileExcelReader( file: PartitionedFile, conf: Configuration) extends Iterator[ArrayWritable] with Closeable { val LOG = LogFactory.getLog(classOf[HadoopFileExcelReader]) private var reader: RecordReader[Text, ArrayWritable] = null private val iterator = { val fileSplit = new FileSplit( new Path(new URI(file.filePath)), file.start, file.length, Array.empty) // todo: implement locality (replace Array.empty with the locations) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) val inputFormat = new ExcelFileInputFormat() reader = inputFormat.createRecordReader(fileSplit, hadoopAttemptContext) reader.initialize(fileSplit, hadoopAttemptContext) new RecordReaderIterator(reader) } def getReader: RecordReader[Text, ArrayWritable] = reader override def hasNext: Boolean = iterator.hasNext override def next(): ArrayWritable = iterator.next() override def close(): Unit = { if (reader != null) { reader.close() } } }
Example 121
Source File: ExcelOutputWriterFactory.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.sql.execution.datasources.{ OutputWriter, OutputWriterFactory } import org.apache.spark.sql.types.StructType import org.zuinnote.hadoop.office.format.mapreduce.ExcelFileOutputFormat import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration private[excel] class ExcelOutputWriterFactory(options: Map[String, String]) extends OutputWriterFactory { def newInstance( path: String, bucketId: Option[Int], dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { new ExcelOutputWriter(path, dataSchema, context, options) } def newInstance( path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { new ExcelOutputWriter(path, dataSchema, context, options) } def getFileExtension(context: TaskAttemptContext): String = { val conf = context.getConfiguration(); val defaultConf = conf.get(HadoopOfficeWriteConfiguration.CONF_MIMETYPE, ExcelFileOutputFormat.DEFAULT_MIMETYPE); conf.set(HadoopOfficeWriteConfiguration.CONF_MIMETYPE, defaultConf); ExcelFileOutputFormat.getSuffix(conf.get(HadoopOfficeWriteConfiguration.CONF_MIMETYPE)) } }
Example 122
Source File: HadoopBundleFileSystem.scala From mleap with Apache License 2.0 | 5 votes |
package ml.bundle.hdfs import java.io.File import java.net.URI import java.nio.file.{Files, Paths} import com.typesafe.config.Config import ml.combust.bundle.fs.BundleFileSystem import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Try import scala.collection.JavaConverters._ object HadoopBundleFileSystem { lazy val defaultSchemes: Seq[String] = Seq("hdfs") def createHadoopConfiguration(config: Config): Configuration = { val options: Map[String, String] = if(config.hasPath("options")) { config.getConfig("options").entrySet().asScala.map { entry => (entry.getKey, entry.getValue.unwrapped().toString) }.toMap } else { Map() } val c = new Configuration() for ((key, value) <- options) { c.set(key, value) } c } def createSchemes(config: Config): Seq[String] = if (config.hasPath("schemes")) { config.getStringList("schemes").asScala } else { Seq("hdfs") } } class HadoopBundleFileSystem(fs: FileSystem, override val schemes: Seq[String] = HadoopBundleFileSystem.defaultSchemes) extends BundleFileSystem { def this(config: Config) = { this(FileSystem.get(HadoopBundleFileSystem.createHadoopConfiguration(config)), HadoopBundleFileSystem.createSchemes(config)) } override def load(uri: URI): Try[File] = Try { val tmpDir = Files.createTempDirectory("hdfs-bundle") val tmpFile = Paths.get(tmpDir.toString, "bundle.zip") fs.copyToLocalFile(new Path(uri.toString), new Path(tmpFile.toString)) tmpFile.toFile } override def save(uri: URI, localFile: File): Unit = { fs.copyFromLocalFile(new Path(localFile.toString), new Path(uri.toString)) } }
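A usage sketch based on the class above (the HDFS URI and local paths are placeholders); the spec that follows exercises the same save/load round trip against a local FileSystem:

import java.io.File
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

val fs = FileSystem.get(new Configuration())
val bundleFs = new HadoopBundleFileSystem(fs)

// Upload a local bundle, then pull it back down into a temporary file
bundleFs.save(new URI("hdfs:///models/bundle.zip"), new File("/tmp/bundle.zip"))   // placeholder paths
val localCopy: File = bundleFs.load(new URI("hdfs:///models/bundle.zip")).get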
Example 123
Source File: HadoopBundleFileSystemSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.bundle.hdfs import java.net.URI import java.nio.file.{Files, Paths} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.FunSpec class HadoopBundleFileSystemSpec extends FunSpec { private val fs = FileSystem.get(new Configuration()) private val bundleFs = new HadoopBundleFileSystem(fs) describe("scheme") { it("returns hdfs") { assert(bundleFs.schemes == Seq("hdfs")) } } describe("load") { it("loads a file from hadoop and saves to a local file") { val testFile = Files.createTempFile("HadoopBundleFileSystemSpec", ".txt") Files.write(testFile.toAbsolutePath, "HELLO".getBytes()) val loadedFile = bundleFs.load(testFile.toUri).get val contents = new String(Files.readAllBytes(loadedFile.toPath)) assert(contents == "HELLO") } } describe("save") { it("saves local file to HDFS") { val testFile = Files.createTempFile("HadoopBundleFileSystemSpec", ".txt") Files.write(testFile.toAbsolutePath, "HELLO".getBytes()) val tmpDir = Files.createTempDirectory("HadoopBundleFileSystemSpec") val tmpFile = new URI(s"file://$tmpDir/test.txt") bundleFs.save(tmpFile, testFile.toFile) val contents = new String(Files.readAllBytes(Paths.get(tmpFile))) assert(contents == "HELLO") } } }
Example 124
Source File: OptionsParsing.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path object OptionsParsing { def parse(args: Array[String], hadoopConfiguration: Configuration): Config = { val parser = new scopt.OptionParser[Config]("") { opt[Unit]("i") .action((_, c) => c.copyOptions(_.copy(ignoreErrors = true))) .text("Ignore failures") opt[String]("log") .action((log, c) => c.copyOptions(_.copy(log = Some(new URI(log))))) .text("Write logs to a URI") opt[Unit]("dryrun") .action((_, c) => c.copyOptions(_.copy(dryRun = true))) .text("Perform a trial run with no changes made") opt[Unit]("verbose") .action((_, c) => c.copyOptions(_.copy(verbose = true))) .text("Run in verbose mode") opt[Unit]("overwrite") .action((_, c) => c.copyOptions(_.copy(overwrite = true))) .text("Overwrite destination") opt[Unit]("update") .action((_, c) => c.copyOptions(_.copy(update = true))) .text("Overwrite if source and destination differ in size, or checksum") opt[String]("filters") .action((f, c) => c.copyOptions(_.withFiltersFromFile(new URI(f), hadoopConfiguration))) .text("The path to a file containing a list of pattern strings, one string per line, such that paths matching the pattern will be excluded from the copy.") opt[Unit]("delete") .action((_, c) => c.copyOptions(_.copy(delete = true))) .text("Delete the files existing in the dst but not in src") opt[Int]("numListstatusThreads") .action((i, c) => c.copyOptions(_.copy(numListstatusThreads = i))) .text("Number of threads to use for building file listing") opt[Unit]("consistentPathBehaviour") .action((_, c) => c.copyOptions(_.copy(consistentPathBehaviour = true))) .text("Revert the path behaviour when using overwrite or update to the path behaviour of non-overwrite/non-update") opt[Int]("maxFilesPerTask") .action((i, c) => c.copyOptions(_.copy(maxFilesPerTask = i))) .text("Maximum number of files to copy in a single Spark task") opt[Long]("maxBytesPerTask") .action((i, c) => c.copyOptions(_.copy(maxBytesPerTask = i))) .text("Maximum number of bytes to copy in a single Spark task") help("help").text("prints this usage text") arg[String]("[source_path...] <target_path>") .unbounded() .minOccurs(2) .action((u, c) => c.copy(URIs = c.URIs :+ new URI(u))) } parser.parse(args, Config()) match { case Some(config) => config.options.validateOptions() config case _ => throw new RuntimeException("Failed to parse arguments") } } } case class Config(options: SparkDistCPOptions = SparkDistCPOptions(), URIs: Seq[URI] = Seq.empty) { def copyOptions(f: SparkDistCPOptions => SparkDistCPOptions): Config = { this.copy(options = f(options)) } def sourceAndDestPaths: (Seq[Path], Path) = { URIs.reverse match { case d :: s :: ts => ((s :: ts).reverse.map(u => new Path(u)), new Path(d)) case _ => throw new RuntimeException("Incorrect number of URIs") } } }
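A sketch of driving the parser above, e.g. from a main method (the flag values and URIs are illustrative):

import org.apache.hadoop.conf.Configuration

val args = Array(
  "--overwrite",
  "--numListstatusThreads", "10",
  "hdfs:///data/src", "hdfs:///data/dest")   // illustrative arguments

val config = OptionsParsing.parse(args, new Configuration())
val (sourcePaths, targetPath) = config.sourceAndDestPaths
println(s"Copying ${sourcePaths.mkString(",")} to $targetPath with ${config.options}")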
Example 125
Source File: TestSpec.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata import java.io.ByteArrayInputStream import java.nio.file.Files import com.coxautodata.objects.SerializableFileStatus import com.coxautodata.utils.FileListing import org.apache.commons.io.{FileUtils, IOUtils} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.scalatest.{BeforeAndAfterEach, FunSpec, Matchers} trait TestSpec extends FunSpec with Matchers with BeforeAndAfterEach { var testingBaseDir: java.nio.file.Path = _ var testingBaseDirName: String = _ var testingBaseDirPath: Path = _ var localFileSystem: LocalFileSystem = _ override def beforeEach(): Unit = { super.beforeEach() testingBaseDir = Files.createTempDirectory("test_output") testingBaseDirName = testingBaseDir.toString localFileSystem = FileSystem.getLocal(new Configuration()) testingBaseDirPath = localFileSystem.makeQualified(new Path(testingBaseDirName)) } override def afterEach(): Unit = { super.afterEach() FileUtils.deleteDirectory(testingBaseDir.toFile) } def createFile(relativePath: Path, content: Array[Byte]): SerializableFileStatus = { val path = new Path(testingBaseDirPath, relativePath) localFileSystem.mkdirs(path.getParent) val in = new ByteArrayInputStream(content) val out = localFileSystem.create(path) IOUtils.copy(in, out) in.close() out.close() SerializableFileStatus(localFileSystem.getFileStatus(path)) } def fileStatusToResult(f: SerializableFileStatus): FileListing = { FileListing(f.getPath.toString, if (f.isFile) Some(f.getLen) else None) } }
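A hedged sketch of a suite that mixes in the TestSpec trait above; the suite name and file path are illustrative only, and the assertions rely solely on helpers defined by the trait.

package com.coxautodata

import com.coxautodata.utils.FileListing
import org.apache.hadoop.fs.Path

// Hypothetical suite exercising the createFile/fileStatusToResult helpers.
class TestSpecUsageSpec extends TestSpec {
  it("creates a file under the temporary base directory and describes it") {
    val status = createFile(new Path("data/part-00000"), "HELLO".getBytes)
    assert(status.getLen == 5L)
    assert(fileStatusToResult(status) == FileListing(status.getPath.toString, Some(5L)))
  }
}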
Example 126
Source File: L6-18Cassandra.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.Text import java.nio.ByteBuffer import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat import org.apache.cassandra.hadoop.ConfigHelper import org.apache.cassandra.thrift.ColumnOrSuperColumn import org.apache.cassandra.thrift.Column import org.apache.cassandra.utils.ByteBufferUtil import org.apache.cassandra.thrift.Mutation import java.util.Arrays object CassandraSinkApp { def main(args: Array[String]) { if (args.length != 6) { System.err.println( "Usage: CassandraSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, cassandraHost, cassandraPort, keyspace, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .foreachRDD(rdd => { val jobConf = new Configuration() ConfigHelper.setOutputRpcPort(jobConf, cassandraPort) ConfigHelper.setOutputInitialAddress(jobConf, cassandraHost) ConfigHelper.setOutputColumnFamily(jobConf, keyspace, columnFamilyName) ConfigHelper.setOutputPartitioner(jobConf, "Murmur3Partitioner") rdd.map(rec => { val c = new Column() c.setName(ByteBufferUtil.bytes(columnName)) c.setValue(ByteBufferUtil.bytes(rec._2 / (windowSize / batchInterval))) c.setTimestamp(System.currentTimeMillis) val m = new Mutation() m.setColumn_or_supercolumn(new ColumnOrSuperColumn()) m.column_or_supercolumn.setColumn(c) (ByteBufferUtil.bytes(rec._1), Arrays.asList(m)) }).saveAsNewAPIHadoopFile(keyspace, classOf[ByteBuffer], classOf[List[Mutation]], classOf[ColumnFamilyOutputFormat], jobConf) }) ssc.start() ssc.awaitTermination() } }
Example 127
Source File: L6-14HBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.mapreduce.TableOutputFormat import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.io.Text import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object HBaseSinkApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: HBaseSinkApp <appname> <hbaseMaster> <tableName> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .foreachRDD(rdd => { val hbaseConf = HBaseConfiguration.create() hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName) hbaseConf.set("hbase.master", hbaseMaster) val jobConf = new Configuration(hbaseConf) jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName) rdd.map(rec => { val put = new Put(rec._1.getBytes) put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) (rec._1, put) }).saveAsNewAPIHadoopDataset(jobConf) }) ssc.start() ssc.awaitTermination() } }
Example 128
Source File: DistCpTransformation.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.dsl.transformations import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.tools.{DistCp, DistCpOptions} import org.schedoscope.dsl.View import org.schedoscope.scheduler.driver.{DriverRunState, MapreduceDriver} import scala.collection.JavaConverters._ object DistCpTransformation { def copyToView(sourceView: View, targetView: View): DistCpTransformation = { val target = targetView.fullPath.split("/").dropRight(1).mkString("/") DistCpTransformation(targetView, List(sourceView.fullPath), target) } def copyToDirToView(sourcePath: String, targetView: View): DistCpTransformation = { val target = targetView.fullPath.split("/").drop(1).mkString("/") DistCpTransformation(targetView, List(sourcePath), target) } def copyToFileToView(sourceFile: String, targetView: View): DistCpTransformation = { DistCpTransformation(targetView, List(sourceFile), targetView.fullPath) } } case class DistCpTransformation(v: View, var sources: List[String], var target: String, deleteViewPath: Boolean = false, config: Configuration = new Configuration()) extends MapreduceBaseTransformation { var directoriesToDelete = if (deleteViewPath) List(v.fullPath) else List() override def stringsToChecksum: List[String] = target :: sources override def fileResourcesToChecksum = List() override val cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation] = (_, __, completionRunState) => completionRunState lazy val job: Job = { val distCp = new DistCp(config, distCpOptions) val createJob = distCp.getClass.getDeclaredMethod("createJob") createJob.setAccessible(true) val job = createJob.invoke(distCp).asInstanceOf[Job] val prepareFileListing = distCp.getClass.getDeclaredMethod("prepareFileListing", job.getClass) prepareFileListing.setAccessible(true) prepareFileListing.invoke(distCp, job) job } def distCpOptions: DistCpOptions = if (configuration.nonEmpty) { DistCpConfiguration .fromConfig(configuration.toMap) .toDistCpOptions(sources.map(new Path(_)), new Path(target)) } else { val s = sources.map(new Path(_)).asJava new DistCpOptions(s, new Path(target)) } }
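A hedged sketch of how the companion helpers above might be called from a view definition; the helper object and both views are hypothetical placeholders.

package org.schedoscope.dsl.transformations

import org.schedoscope.dsl.View

// Hypothetical helper showing how a view definition might request the copy.
object DistCpTransformationUsage {
  // 'stagingView' and 'reportView' are placeholder Schedoscope views defined elsewhere.
  def copyStagingToReport(stagingView: View, reportView: View): DistCpTransformation =
    // Copies the HDFS directory of the staging view next to the report view's own path.
    DistCpTransformation.copyToView(stagingView, reportView)
}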
Example 129
Source File: WholeFileReader.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import java.io.InputStream import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor} import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext} class WholeFileReader extends RecordReader[NullWritable, Text] { private val key = NullWritable.get() private val value = new Text() private var split: FileSplit = _ private var conf: Configuration = _ private var path: Path = _ private var done: Boolean = false override def getProgress: Float = ??? override def nextKeyValue(): Boolean = { if (done){ false } else { val fs = path.getFileSystem(conf) var is: FSDataInputStream = null var in: InputStream = null var decompressor: Decompressor = null try { is = fs.open(split.getPath) val codec = new CompressionCodecFactory(conf).getCodec(path) if (codec != null) { decompressor = CodecPool.getDecompressor(codec) in = codec.createInputStream(is, decompressor) } else { in = is } val result = IOUtils.toByteArray(in) value.clear() value.set(result) done = true true } finally { if (in != null) { IOUtils.closeQuietly(in) } if (decompressor != null) { CodecPool.returnDecompressor(decompressor) } } } } override def getCurrentValue: Text = value override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext): Unit = { this.split = inputSplit.asInstanceOf[FileSplit] this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext) this.path = this.split.getPath } override def getCurrentKey: NullWritable = key override def close() {} }
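The reader above only becomes usable once it is returned from an InputFormat. A minimal sketch follows, assuming a hypothetical WholeFileInputFormat (the project may ship its own) that disables splitting so each file is handed to the reader as a single record.

package magellan.mapreduce

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

// Hypothetical InputFormat wiring WholeFileReader into the new MapReduce API.
class WholeFileInputFormat extends FileInputFormat[NullWritable, Text] {

  // Each file is read as a single record, so splitting must be disabled.
  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def createRecordReader(
      split: InputSplit,
      context: TaskAttemptContext): RecordReader[NullWritable, Text] = new WholeFileReader()
}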
Example 130
Source File: ShxReaderSuite.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import magellan.TestSparkContext import magellan.io.PolygonReader import org.apache.commons.io.EndianUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{ArrayWritable, LongWritable, Text} import org.scalatest.FunSuite class ShxReaderSuite extends FunSuite with TestSparkContext { test("Read shx file") { val path = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shx").getPath val conf = new Configuration() conf.set("mapreduce.input.fileinputformat.split.maxsize", "10000") val data = sc.newAPIHadoopFile( path, classOf[ShxInputFormat], classOf[Text], classOf[ArrayWritable], conf ).map { case (txt: Text, splits: ArrayWritable) => val fileName = txt.toString val s = splits.get() val size = s.length var i = 0 val v = Array.fill(size)(0L) while (i < size) { v.update(i, s(i).asInstanceOf[LongWritable].get()) i += 1 } (fileName, v) } assert(data.count() === 1) val (fileName, splits) = data.first() assert(fileName === "tl_2016_us_state") // the offsets should be correct val firstOffset = splits(0) val secondOffset = splits(1) // skipping to the first offset in the Shapefile should allow me to read the first polygon val shpFilePath = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shp").getPath val fs = FileSystem.get(sc.hadoopConfiguration) var dis = fs.open(new Path(shpFilePath)) // skip firstOffset # of bytes dis.seek(firstOffset) // skip record number assert(dis.readInt() === 1) // read content length var contentLength = 16 * (dis.readInt() + 4) // extract the shape type var shapeType = EndianUtils.swapInteger(dis.readInt()) // expect a Polygon assert(shapeType === 5) // the first polygon's content should follow from here val polygonReader = new PolygonReader() val polygon = polygonReader.readFields(dis) assert(polygon != null) // seek to the second offset dis.seek(secondOffset) assert(dis.readInt() === 2) } }
Example 131
Source File: TikaParquetParser.scala From project-matt with MIT License | 5 votes |
package org.datafy.aws.app.matt.extras import java.io.{File, FileOutputStream, IOException, InputStream} import java.util import scala.collection.JavaConverters._ import org.xml.sax.{ContentHandler, SAXException} import org.apache.tika.metadata.Metadata import org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE import org.apache.tika.mime.MediaType import org.apache.tika.parser.{AbstractParser, ParseContext} import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.hadoop.ParquetReader import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.util.HadoopInputFile import org.apache.parquet.tools.json.JsonRecordFormatter import org.apache.parquet.tools.read.{SimpleReadSupport, SimpleRecord} import org.apache.tika.exception.TikaException import org.apache.tika.sax.XHTMLContentHandler import scala.util.Random class TikaParquetParser extends AbstractParser { // make some stuff here final val PARQUET_RAW = MediaType.application("x-parquet") private val SUPPORTED_TYPES: Set[MediaType] = Set(PARQUET_RAW) def getSupportedTypes(context: ParseContext): util.Set[MediaType] = { SUPPORTED_TYPES.asJava } @throws(classOf[IOException]) @throws(classOf[SAXException]) @throws(classOf[TikaException]) def parse(stream: InputStream, handler: ContentHandler, metadata: Metadata, context: ParseContext): Unit = { // create temp file from stream val fileNamePrefix = Random.alphanumeric.take(5).mkString val tempFile = File.createTempFile(s"parquet-${fileNamePrefix}", ".parquet") IOUtils.copy(stream, new FileOutputStream(tempFile)) val conf = new Configuration() val path = new Path(tempFile.getAbsolutePath) val parquetMetadata = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) var defaultReader: ParquetReader[SimpleRecord] = null val columns = parquetMetadata.getFileMetaData.getSchema.getFields metadata.set(CONTENT_TYPE, PARQUET_RAW.toString) metadata.set("Total Number of Columns", columns.size.toString) metadata.set("Parquet Column Names", columns.toString) val xhtml = new XHTMLContentHandler(handler, metadata) xhtml.startDocument() xhtml.startElement("p") // ::TODO:: ensure parquet reader reads all files not only file row try { defaultReader = ParquetReader.builder(new SimpleReadSupport(), new Path(tempFile.getAbsolutePath)).build() if(defaultReader.read() != null) { val values: SimpleRecord = defaultReader.read() val jsonFormatter = JsonRecordFormatter.fromSchema(parquetMetadata.getFileMetaData.getSchema) val textContent: String = jsonFormatter.formatRecord(values) xhtml.characters(textContent) xhtml.endElement("p") xhtml.endDocument() } } catch { case e: Throwable => e.printStackTrace() if (defaultReader != null) { try { defaultReader.close() } catch{ case _: Throwable => } } } finally { if (tempFile != null) tempFile.delete() } } }
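A hedged sketch of driving the parser above directly with Tika's SAX handlers; the driver object and the Parquet file path are placeholders.

package org.datafy.aws.app.matt.extras

import java.io.FileInputStream

import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext
import org.apache.tika.sax.BodyContentHandler

// Hypothetical driver feeding a local Parquet file through the custom parser.
object TikaParquetParserUsage {
  def main(args: Array[String]): Unit = {
    val parser = new TikaParquetParser()
    val metadata = new Metadata()
    val handler = new BodyContentHandler(-1) // -1 removes the default write limit
    val stream = new FileInputStream("/tmp/sample.parquet") // placeholder path
    try {
      parser.parse(stream, handler, metadata, new ParseContext())
      println(metadata.get("Total Number of Columns"))
      println(handler.toString.take(200))
    } finally {
      stream.close()
    }
  }
}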
Example 132
Source File: TikaHadoopOrcParser.scala From project-matt with MIT License | 5 votes |
package org.datafy.aws.app.matt.extras import java.io.{File, FileOutputStream, IOException, InputStream} import java.util import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.serde2.objectinspector.StructField import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.orc.OrcFile import org.apache.orc.OrcFile.ReaderOptions import org.apache.orc.Reader import org.apache.orc.RecordReader import org.apache.tika.exception.TikaException import org.apache.tika.metadata.Metadata import org.apache.tika.mime.MediaType import org.apache.tika.parser.{AbstractParser, ParseContext} import org.xml.sax.{ContentHandler, SAXException} import scala.util.Random class TikaHadoopOrcParser extends AbstractParser { final val ORC_RAW = MediaType.application("x-orc") private val SUPPORTED_TYPES: Set[MediaType] = Set(ORC_RAW) def getSupportedTypes(context: ParseContext): util.Set[MediaType] = { SUPPORTED_TYPES.asJava } @throws(classOf[IOException]) @throws(classOf[SAXException]) @throws(classOf[TikaException]) def parse(stream: InputStream, handler: ContentHandler, metadata: Metadata, context: ParseContext): Unit = { // create temp file from stream try { val fileNamePrefix = Random.alphanumeric.take(5).mkString val tempFile = File.createTempFile(s"orc-${fileNamePrefix}", ".orc") IOUtils.copy(stream, new FileOutputStream(tempFile)) val path = new Path(tempFile.getAbsolutePath) val conf = new Configuration() val orcReader = OrcFile.createReader(path, new ReaderOptions(conf)) val records: RecordReader = orcReader.rows() val storeRecord = null val firstBlockKey = null } catch { case e: Throwable => e.printStackTrace() } // val fields = } }
Example 133
Source File: WorkbookReader.scala From spark-excel with Apache License 2.0 | 5 votes |
package com.crealytics.spark.excel import java.io.InputStream import com.crealytics.spark.excel.Utils.MapIncluding import com.github.pjfanning.xlsx.StreamingReader import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.poi.ss.usermodel.{Workbook, WorkbookFactory} trait WorkbookReader { protected def openWorkbook(): Workbook def withWorkbook[T](f: Workbook => T): T = { val workbook = openWorkbook() val res = f(workbook) workbook.close() res } def sheetNames: Seq[String] = { withWorkbook( workbook => for (sheetIx <- (0 until workbook.getNumberOfSheets())) yield { workbook.getSheetAt(sheetIx).getSheetName() } ) } } object WorkbookReader { val WithLocationMaxRowsInMemoryAndPassword = MapIncluding(Seq("path"), optionally = Seq("maxRowsInMemory", "workbookPassword")) def apply(parameters: Map[String, String], hadoopConfiguration: Configuration): WorkbookReader = { def readFromHadoop(location: String) = { val path = new Path(location) FileSystem.get(path.toUri, hadoopConfiguration).open(path) } parameters match { case WithLocationMaxRowsInMemoryAndPassword(Seq(location), Seq(Some(maxRowsInMemory), passwordOption)) => new StreamingWorkbookReader(readFromHadoop(location), passwordOption, maxRowsInMemory.toInt) case WithLocationMaxRowsInMemoryAndPassword(Seq(location), Seq(None, passwordOption)) => new DefaultWorkbookReader(readFromHadoop(location), passwordOption) } } } class DefaultWorkbookReader(inputStreamProvider: => InputStream, workbookPassword: Option[String]) extends WorkbookReader { protected def openWorkbook(): Workbook = workbookPassword .fold(WorkbookFactory.create(inputStreamProvider))( password => WorkbookFactory.create(inputStreamProvider, password) ) } class StreamingWorkbookReader(inputStreamProvider: => InputStream, workbookPassword: Option[String], maxRowsInMem: Int) extends WorkbookReader { override protected def openWorkbook(): Workbook = { val builder = StreamingReader .builder() .rowCacheSize(maxRowsInMem) .bufferSize(4096) workbookPassword .fold(builder)(password => builder.password(password)) .open(inputStreamProvider) } }
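A hedged sketch of the factory above listing sheet names from a workbook on HDFS; the driver object and the workbook location are placeholders.

package com.crealytics.spark.excel

import org.apache.hadoop.conf.Configuration

// Hypothetical driver; supplying maxRowsInMemory selects the StreamingWorkbookReader above.
object WorkbookReaderUsage {
  def main(args: Array[String]): Unit = {
    val reader = WorkbookReader(
      Map("path" -> "hdfs:///data/report.xlsx", "maxRowsInMemory" -> "100"),
      new Configuration())
    reader.sheetNames.foreach(println)
  }
}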
Example 134
Source File: FilterHelper.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.utils import org.apache.hadoop.conf.Configuration import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate} import org.apache.parquet.hadoop.ParquetInputFormat import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.datasources.parquet.ParquetFiltersWrapper import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType object FilterHelper { def tryToPushFilters( sparkSession: SparkSession, requiredSchema: StructType, filters: Seq[Filter]): Option[FilterPredicate] = { tryToPushFilters(sparkSession.sessionState.conf, requiredSchema, filters) } def tryToPushFilters( conf: SQLConf, requiredSchema: StructType, filters: Seq[Filter]): Option[FilterPredicate] = { if (conf.parquetFilterPushDown) { filters // Collects all converted Parquet filter predicates. Notice that not all predicates can be // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` // is used here. .flatMap(ParquetFiltersWrapper.createFilter(conf, requiredSchema, _)) .reduceOption(FilterApi.and) } else { None } } def setFilterIfExist(configuration: Configuration, pushed: Option[FilterPredicate]): Unit = { pushed match { case Some(filters) => ParquetInputFormat.setFilterPredicate(configuration, filters) case _ => // do nothing } } }
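A hedged sketch chaining the two helpers above, pushing supported predicates into a Hadoop Configuration for Parquet; the schema and filters mirror those used in the test suite further below.

package org.apache.spark.sql.execution.datasources.oap.utils

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.{EqualTo, GreaterThan}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Hypothetical driver object; pushdown is controlled by spark.sql.parquet.filterPushdown.
object FilterHelperUsage {
  def main(args: Array[String]): Unit = {
    val schema = new StructType()
      .add(StructField("a", IntegerType))
      .add(StructField("b", StringType))
    val filters = Seq(GreaterThan("a", 1), EqualTo("b", "2"))

    val pushed = FilterHelper.tryToPushFilters(SQLConf.get, schema, filters)
    val hadoopConf = new Configuration()
    // No-op when nothing could be pushed down.
    FilterHelper.setFilterIfExist(hadoopConf, pushed)
  }
}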
Example 135
Source File: BitmapReaderV2.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.execution.datasources.oap.filecache.BitmapFiberId import org.apache.spark.sql.execution.datasources.oap.index.impl.IndexFileReaderImpl import org.apache.spark.sql.execution.datasources.oap.utils.{BitmapUtils, OapBitmapWrappedFiberCache} import org.apache.spark.sql.types.StructType private[oap] class BitmapReaderV2( fileReader: IndexFileReaderImpl, intervalArray: ArrayBuffer[RangeInterval], internalLimit: Int, keySchema: StructType, conf: Configuration) extends BitmapReader(fileReader, intervalArray, keySchema, conf) with Iterator[Int] { @transient private var bmRowIdIterator: Iterator[Int] = _ private var bmWfcSeq: Seq[OapBitmapWrappedFiberCache] = _ private var empty: Boolean = _ override def hasNext: Boolean = if (!empty && bmRowIdIterator.hasNext) { true } else { clearCache() false } override def next(): Int = bmRowIdIterator.next() override def toString: String = "BitmapReaderV2" override def clearCache(): Unit = { super.clearCache() if (bmWfcSeq != null) { bmWfcSeq.foreach(wfc => wfc.release) } } private def getDesiredWfcSeq(): Seq[OapBitmapWrappedFiberCache] = { val keySeq = readBmUniqueKeyList(bmUniqueKeyListCache) intervalArray.flatMap{ case range if !range.isNullPredicate => val (startIdx, endIdx) = getKeyIdx(keySeq, range) if (startIdx == -1 || endIdx == -1) { Seq.empty } else { (startIdx until (endIdx + 1)).map(idx => { val curIdxOffset = getIdxOffset(bmOffsetListCache, 0L, idx) val entrySize = getIdxOffset(bmOffsetListCache, 0L, idx + 1) - curIdxOffset val entryFiber = BitmapFiberId(() => fileReader.readFiberCache(curIdxOffset, entrySize), fileReader.getName, BitmapIndexSectionId.entryListSection, idx) new OapBitmapWrappedFiberCache(fiberCacheManager.get(entryFiber)) }) } case range if range.isNullPredicate => val nullListCache = new OapBitmapWrappedFiberCache(fiberCacheManager.get(bmNullListFiber)) if (nullListCache.size != 0) { Seq(nullListCache) } else { Seq.empty } } } def initRowIdIterator(): Unit = { initDesiredSegments() bmWfcSeq = getDesiredWfcSeq if (bmWfcSeq.nonEmpty) { val iterator = BitmapUtils.iterator(bmWfcSeq) bmRowIdIterator = if (internalLimit > 0) iterator.take(internalLimit) else iterator empty = false } else { empty = true } } }
Example 136
Source File: BPlusTreeScanner.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.execution.datasources.oap._ import org.apache.spark.sql.execution.datasources.oap.statistics.StatsAnalysisResult // we scan the index from the smallest to the largest, // this will scan the B+ Tree (index) leaf node. private[oap] class BPlusTreeScanner(idxMeta: IndexMeta) extends IndexScanner(idxMeta) { override def toString(): String = "BPlusTreeScanner" @transient var recordReader: BTreeIndexRecordReader = _ // Set by analyzeStatistics() private var _totalRows: Long = 0 override def totalRows(): Long = _totalRows def initialize(dataPath: Path, conf: Configuration): IndexScanner = { assert(keySchema ne null) val indexPath = IndexUtils.getIndexFilePath( conf, dataPath, meta.name, meta.time) logDebug("Loading Index File: " + indexPath) logDebug("\tFile Size: " + indexPath.getFileSystem(conf).getFileStatus(indexPath).getLen) recordReader = BTreeIndexRecordReader(conf, keySchema, indexPath) recordReader.initialize(indexPath, intervalArray) // For some case, analyzeStatistics will be skipped, so we have to get totalRows here as well. _totalRows = recordReader.totalRows() this } override protected def analyzeStatistics( indexPath: Path, conf: Configuration): StatsAnalysisResult = { var recordReader = BTreeIndexRecordReader(conf, keySchema, indexPath) try { val result = recordReader.analyzeStatistics(keySchema, intervalArray) _totalRows = recordReader.totalRows() result } finally { if (recordReader != null) { recordReader.close() recordReader = null } } } override def hasNext: Boolean = recordReader.hasNext override def next(): Int = recordReader.next() }
Example 137
Source File: BitMapScanner.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.execution.datasources.OapException import org.apache.spark.sql.execution.datasources.oap.IndexMeta import org.apache.spark.sql.execution.datasources.oap.index.OapIndexProperties.IndexVersion import org.apache.spark.sql.execution.datasources.oap.index.impl.IndexFileReaderImpl import org.apache.spark.sql.execution.datasources.oap.statistics.StatsAnalysisResult private[oap] case class BitMapScanner(idxMeta: IndexMeta) extends IndexScanner(idxMeta) { private var _totalRows: Long = 0 @transient private var bmRowIdIterator: Iterator[Int] = _ override def hasNext: Boolean = bmRowIdIterator.hasNext override def next(): Int = bmRowIdIterator.next override def totalRows(): Long = _totalRows // TODO: If the index file is not changed, bypass the repetitive initialization for queries. override def initialize(dataPath: Path, conf: Configuration): IndexScanner = { assert(keySchema ne null) // Currently OAP index type supports the column with one single field. assert(keySchema.fields.length == 1) val indexPath = IndexUtils.getIndexFilePath( conf, dataPath, meta.name, meta.time) val fileReader = IndexFileReaderImpl(conf, indexPath) val bitmapReader = IndexUtils.readVersion(fileReader) match { case Some(version) => IndexVersion(version) match { case IndexVersion.OAP_INDEX_V1 => val reader = new BitmapReaderV1( fileReader, intervalArray, internalLimit, keySchema, conf) reader.initRowIdIterator bmRowIdIterator = reader reader case IndexVersion.OAP_INDEX_V2 => val reader = new BitmapReaderV2( fileReader, intervalArray, internalLimit, keySchema, conf) reader.initRowIdIterator bmRowIdIterator = reader reader } case None => throw new OapException("not a valid index file") } _totalRows = bitmapReader.totalRows fileReader.close() this } override protected def analyzeStatistics( idxPath: Path, conf: Configuration): StatsAnalysisResult = { val fileReader = IndexFileReaderImpl(conf, idxPath) val reader = BitmapReader(fileReader, intervalArray, keySchema, conf) _totalRows = reader.totalRows try { reader.analyzeStatistics() } finally { fileReader.close() } } override def toString: String = "BitMapScanner" }
Example 138
Source File: IndexFileWriterImpl.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index.impl import java.io.OutputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.execution.datasources.oap.index.IndexFileWriter private[index] case class IndexFileWriterImpl( configuration: Configuration, indexPath: Path) extends IndexFileWriter { protected override val os: OutputStream = indexPath.getFileSystem(configuration).create(indexPath, true) // Give RecordWriter a chance which file it's writing to. override def getName: String = indexPath.toString override def tempRowIdWriter: IndexFileWriter = { val tempFileName = new Path(indexPath.getParent, indexPath.getName + ".id") IndexFileWriterImpl(configuration, tempFileName) } override def writeRowId(tempWriter: IndexFileWriter): Unit = { val path = new Path(tempWriter.getName) val is = path.getFileSystem(configuration).open(path) val length = path.getFileSystem(configuration).getFileStatus(path).getLen val bufSize = configuration.getInt("io.file.buffer.size", 4096) val bytes = new Array[Byte](bufSize) var remaining = length while (remaining > 0) { val readSize = math.min(bufSize, remaining).toInt is.readFully(bytes, 0, readSize) os.write(bytes, 0, readSize) remaining -= readSize } is.close() path.getFileSystem(configuration).delete(path, false) } }
package org.apache.spark.sql.execution.datasources.oap.index.impl import java.io.OutputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.execution.datasources.oap.index.IndexFileWriter private[index] case class IndexFileWriterImpl( configuration: Configuration, indexPath: Path) extends IndexFileWriter { protected override val os: OutputStream = indexPath.getFileSystem(configuration).create(indexPath, true) // Give RecordWriter a chance to know which file it's writing to. override def getName: String = indexPath.toString override def tempRowIdWriter: IndexFileWriter = { val tempFileName = new Path(indexPath.getParent, indexPath.getName + ".id") IndexFileWriterImpl(configuration, tempFileName) } override def writeRowId(tempWriter: IndexFileWriter): Unit = { val path = new Path(tempWriter.getName) val is = path.getFileSystem(configuration).open(path) val length = path.getFileSystem(configuration).getFileStatus(path).getLen val bufSize = configuration.getInt("io.file.buffer.size", 4096) val bytes = new Array[Byte](bufSize) var remaining = length while (remaining > 0) { val readSize = math.min(bufSize, remaining).toInt is.readFully(bytes, 0, readSize) os.write(bytes, 0, readSize) remaining -= readSize } is.close() path.getFileSystem(configuration).delete(path, false) } }
Example 139
Source File: StatisticsType.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.statistics import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.types.StructType private[oap] object StatisticsType { val TYPE_MIN_MAX: Int = 0 val TYPE_SAMPLE_BASE: Int = 1 val TYPE_PART_BY_VALUE: Int = 2 val TYPE_BLOOM_FILTER: Int = 3 def unapply(t: Int): Option[StructType => StatisticsReader] = t match { case TYPE_MIN_MAX => Some(new MinMaxStatisticsReader(_)) case TYPE_SAMPLE_BASE => Some(new SampleBasedStatisticsReader(_)) case TYPE_PART_BY_VALUE => Some(new PartByValueStatisticsReader(_)) case TYPE_BLOOM_FILTER => Some(new BloomFilterStatisticsReader(_)) case _ => None } def unapply(name: String): Option[(StructType, Configuration) => StatisticsWriter] = name match { case "MINMAX" => Some((schema: StructType, conf: Configuration) => new MinMaxStatisticsWriter(schema, conf)) case "SAMPLE" => Some((schema: StructType, conf: Configuration) => new SampleBasedStatisticsWriter(schema, conf)) case "PARTBYVALUE" => Some((schema: StructType, conf: Configuration) => new PartByValueStatisticsWriter(schema, conf)) case "BLOOM" => Some((schema: StructType, conf: Configuration) => new BloomFilterStatisticsWriter(schema, conf)) case _ => None } }
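A hedged sketch of how the two extractors above might resolve readers and writers from a type code or a configured name; the driver object and 'keySchema' are assumptions standing in for the OAP index key schema.

package org.apache.spark.sql.execution.datasources.oap.statistics

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.types.{IntegerType, StructType}

// Hypothetical sketch of the pattern-matching style used with StatisticsType.
object StatisticsTypeUsage {
  def main(args: Array[String]): Unit = {
    val keySchema = new StructType().add("key", IntegerType) // assumed index key schema
    val conf = new Configuration()

    // Resolve a reader from the numeric type code stored in an index file.
    val reader = StatisticsType.TYPE_MIN_MAX match {
      case StatisticsType(build) => Some(build(keySchema))
      case _ => None
    }

    // Resolve a writer from a statistics name found in configuration.
    val writer = "MINMAX" match {
      case StatisticsType(build) => Some(build(keySchema, conf))
      case _ => None
    }

    println(s"reader defined: ${reader.isDefined}, writer defined: ${writer.isDefined}")
  }
}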
Example 140
Source File: ParquetFiberDataLoader.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import java.io.IOException import java.time.ZoneId import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.ParquetFiberDataReader import org.apache.parquet.hadoop.api.InitContext import org.apache.parquet.hadoop.utils.Collections3 import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache import org.apache.spark.sql.execution.datasources.parquet.{ParquetReadSupportWrapper, VectorizedColumnReader} import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.oap.OapRuntime import org.apache.spark.sql.types._ private[oap] case class ParquetFiberDataLoader( configuration: Configuration, reader: ParquetFiberDataReader, blockId: Int) { @throws[IOException] def loadSingleColumn: FiberCache = { val footer = reader.getFooter val fileSchema = footer.getFileMetaData.getSchema val fileMetadata = footer.getFileMetaData.getKeyValueMetaData val readContext = new ParquetReadSupportWrapper() .init(new InitContext(configuration, Collections3.toSetMultiMap(fileMetadata), fileSchema)) val requestedSchema = readContext.getRequestedSchema val sparkRequestedSchemaString = configuration.get(ParquetReadSupportWrapper.SPARK_ROW_REQUESTED_SCHEMA) val sparkSchema = StructType.fromString(sparkRequestedSchemaString) assert(sparkSchema.length == 1, s"Only can get single column every time " + s"by loadSingleColumn, the columns = ${sparkSchema.mkString}") val dataType = sparkSchema.fields(0).dataType // Notes: rowIds is IntegerType in oap index. val rowCount = reader.getFooter.getBlocks.get(blockId).getRowCount.toInt val columnDescriptor = requestedSchema.getColumns.get(0) val originalType = requestedSchema.asGroupType.getFields.get(0).getOriginalType val blockMetaData = footer.getBlocks.get(blockId) val fiberData = reader.readFiberData(blockMetaData, columnDescriptor) val columnReader = new VectorizedColumnReader(columnDescriptor, originalType, fiberData.getPageReader(columnDescriptor), ZoneId.systemDefault, true) if (OapRuntime.getOrCreate.fiberCacheManager.dataCacheCompressEnable) { ParquetDataFiberCompressedWriter.dumpToCache( columnReader, rowCount, dataType) } else { val column = new OnHeapColumnVector(rowCount, dataType) columnReader.readBatch(rowCount, column) ParquetDataFiberWriter.dumpToCache( column.asInstanceOf[OnHeapColumnVector], rowCount) } } }
Example 141
Source File: CodecFactory.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream} import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.compress.{CodecPool, CompressionCodec} import org.apache.hadoop.util.ReflectionUtils import org.apache.parquet.format.{CompressionCodec => ParquetCodec} import org.apache.parquet.hadoop.metadata.CompressionCodecName // This is a simple version of parquet's CodeFactory. // TODO: [linhong] Need change this into Scala Code style private[oap] class CodecFactory(conf: Configuration) { private val compressors = new mutable.HashMap[ParquetCodec, BytesCompressor] private val decompressors = new mutable.HashMap[ParquetCodec, BytesDecompressor] private val codecByName = new mutable.HashMap[String, CompressionCodec] private def getCodec(codecString: String): Option[CompressionCodec] = { codecByName.get(codecString) match { case Some(codec) => Some(codec) case None => val codecName = CompressionCodecName.valueOf(codecString) val codecClass = codecName.getHadoopCompressionCodecClass if (codecClass == null) { None } else { val codec = ReflectionUtils.newInstance(codecClass, conf).asInstanceOf[CompressionCodec] codecByName.put(codecString, codec) Some(codec) } } } def getCompressor(codec: ParquetCodec): BytesCompressor = { compressors.getOrElseUpdate(codec, new BytesCompressor(getCodec(codec.name))) } def getDecompressor(codec: ParquetCodec): BytesDecompressor = { decompressors.getOrElseUpdate(codec, new BytesDecompressor(getCodec(codec.name))) } def release(): Unit = { compressors.values.foreach(_.release()) compressors.clear() decompressors.values.foreach(_.release()) decompressors.clear() } } private[oap] class BytesCompressor(compressionCodec: Option[CompressionCodec]) { private lazy val compressedOutBuffer = new ByteArrayOutputStream() private lazy val compressor = compressionCodec match { case Some(codec) => CodecPool.getCompressor(codec) case None => null } def compress(bytes: Array[Byte]): Array[Byte] = { compressionCodec match { case Some(codec) => compressedOutBuffer.reset() // null compressor for non-native gzip if (compressor != null) { compressor.reset() } val cos = codec.createOutputStream(compressedOutBuffer, compressor) cos.write(bytes) cos.finish() cos.close() compressedOutBuffer.toByteArray case None => bytes } } def release(): Unit = CodecPool.returnCompressor(compressor) } private[oap] class BytesDecompressor(compressionCodec: Option[CompressionCodec]) { private lazy val decompressor = compressionCodec match { case Some(codec) => CodecPool.getDecompressor(codec) case None => null } def decompress(bytes: Array[Byte], uncompressedSize: Int): Array[Byte] = { compressionCodec match { case Some(codec) => decompressor.reset() val cis = codec.createInputStream(new ByteArrayInputStream(bytes), decompressor) val decompressed = new Array[Byte](uncompressedSize) new DataInputStream(cis).readFully(decompressed) decompressed case None => bytes } } def release(): Unit = CodecPool.returnDecompressor(decompressor) }
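A hedged compress/decompress round-trip sketch using the factory above; it assumes the Snappy codec classes are on the classpath, as they normally are with a Spark/Hadoop distribution, and the driver object is a placeholder.

package org.apache.spark.sql.execution.datasources.oap.io

import java.nio.charset.StandardCharsets

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.format.CompressionCodec

// Hypothetical driver performing a round trip through compressor and decompressor.
object CodecFactoryUsage {
  def main(args: Array[String]): Unit = {
    val factory = new CodecFactory(new Configuration())
    val bytes = "hello oap".getBytes(StandardCharsets.UTF_8)

    val compressed = factory.getCompressor(CompressionCodec.SNAPPY).compress(bytes)
    val restored = factory.getDecompressor(CompressionCodec.SNAPPY)
      .decompress(compressed, bytes.length)

    assert(restored.sameElements(bytes))
    factory.release()
  }
}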
Example 142
Source File: OrcDataFileMeta.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FSDataInputStream import org.apache.hadoop.fs.Path import org.apache.orc.OrcFile import org.apache.orc.Reader import org.apache.orc.mapred.OrcInputFormat private[oap] class OrcDataFileMeta(val path: Path, val configuration: Configuration) extends DataFileMeta { val fs = path.getFileSystem(configuration) private val readerOptions = OrcFile.readerOptions(configuration).filesystem(fs) private val fileReader = OrcFile.createReader(path, readerOptions) val length = fs.getFileStatus(path).getLen // val options: Reader.Options = OrcInputFormat.buildOptions(configuration, fileReader, 0, length) // Record reader from ORC row batch. // val recordReader = fileReader.rows(options) def getOrcFileReader(): Reader = fileReader val listStripeInformation = fileReader.getStripes() def numberOfRows: Long = fileReader.getNumberOfRows() override def len: Long = fileReader.getContentLength() override def getGroupCount: Int = fileReader.getStripes().size() override def getFieldCount: Int = fileReader.getSchema().getFieldNames().size() // Not used by orc data file. override def fin: FSDataInputStream = null }
Example 143
Source File: ParquetDataFileMeta.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, Path} import org.apache.hadoop.util.StringUtils import org.apache.parquet.hadoop.OapParquetFileReader import org.apache.parquet.hadoop.metadata.ParquetFooter private[oap] class ParquetDataFileMeta(val footer: ParquetFooter) extends DataFileMeta { require(footer != null, "footer of ParquetDataFileMeta should not be null.") override def fin: FSDataInputStream = null override def len: Long = 0 override def getGroupCount: Int = footer.getBlocks.size() override def getFieldCount: Int = footer.getFileMetaData.getSchema.getColumns.size() } private[oap] object ParquetDataFileMeta { def apply(conf: Configuration, pathString: String): ParquetDataFileMeta = { val path = new Path(StringUtils.unEscapeString(pathString)) new ParquetDataFileMeta(OapParquetFileReader.readParquetFooter(conf, path)) } }
Example 144
Source File: ParquetReadSupportWrapper.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import java.util.{Map => JMap} import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.hadoop.api.ReadSupport.ReadContext import org.apache.parquet.io.api.RecordMaterializer import org.apache.parquet.schema._ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow override def prepareForRead( conf: Configuration, keyValueMetaData: JMap[String, String], fileSchema: MessageType, readContext: ReadContext): RecordMaterializer[InternalRow] = { readSupport.prepareForRead(conf, keyValueMetaData, fileSchema, readContext) } } object ParquetReadSupportWrapper { // Proxy ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA value. val SPARK_ROW_REQUESTED_SCHEMA: String = ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA }
Example 145
Source File: FilterHelperSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.utils import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.ParquetInputFormat import org.apache.spark.SparkFunSuite import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ class FilterHelperSuite extends SparkFunSuite { val conf = SQLConf.get test("Pushed And Set") { val requiredSchema = new StructType() .add(StructField("a", IntegerType)) .add(StructField("b", StringType)) val filters = Seq(GreaterThan("a", 1), EqualTo("b", "2")) val expected = s"""and(gt(a, 1), eq(b, Binary{"2"}))""" conf.setConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED, true) val pushed = FilterHelper.tryToPushFilters(conf, requiredSchema, filters) assert(pushed.isDefined) assert(pushed.get.toString.equals(expected)) val config = new Configuration() FilterHelper.setFilterIfExist(config, pushed) val humanReadable = config.get(ParquetInputFormat.FILTER_PREDICATE + ".human.readable") assert(humanReadable.nonEmpty) assert(humanReadable.equals(expected)) } test("Not Pushed") { val requiredSchema = new StructType() .add(StructField("a", IntegerType)) .add(StructField("b", StringType)) val filters = Seq(GreaterThan("a", 1), EqualTo("b", "2")) conf.setConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED, false) val pushed = FilterHelper.tryToPushFilters(conf, requiredSchema, filters) assert(pushed.isEmpty) val config = new Configuration() FilterHelper.setFilterIfExist(config, pushed) assert(config.get(ParquetInputFormat.FILTER_PREDICATE) == null) assert(config.get(ParquetInputFormat.FILTER_PREDICATE + ".human.readable") == null) } }
Example 146
Source File: OapBitmapWrappedFiberCacheSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.utils import java.io.{ByteArrayOutputStream, DataOutputStream, FileOutputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, Path} import org.roaringbitmap.RoaringBitmap import org.apache.spark.sql.QueryTest import org.apache.spark.sql.execution.datasources.OapException import org.apache.spark.sql.execution.datasources.oap.filecache.{BitmapFiberId, FiberCache} import org.apache.spark.sql.oap.OapRuntime import org.apache.spark.sql.test.oap.SharedOapContext import org.apache.spark.util.Utils // Below are used to test the functionality of OapBitmapWrappedFiberCache class. class OapBitmapWrappedFiberCacheSuite extends QueryTest with SharedOapContext { private def loadRbFile(fin: FSDataInputStream, offset: Long, size: Int): FiberCache = OapRuntime.getOrCreate.fiberCacheManager.toIndexFiberCache(fin, offset, size) test("test the functionality of OapBitmapWrappedFiberCache class") { val CHUNK_SIZE = 1 << 16 val dataForRunChunk = (1 to 9).toSeq val dataForArrayChunk = Seq(1, 3, 5, 7, 9) val dataForBitmapChunk = (1 to 10000).filter(_ % 2 == 1) val dataCombination = dataForBitmapChunk ++ dataForArrayChunk ++ dataForRunChunk val dataArray = Array(dataForRunChunk, dataForArrayChunk, dataForBitmapChunk, dataCombination) dataArray.foreach(dataIdx => { val dir = Utils.createTempDir() val rb = new RoaringBitmap() dataIdx.foreach(rb.add) val rbFile = dir.getAbsolutePath + "rb.bin" rb.runOptimize() val rbFos = new FileOutputStream(rbFile) val rbBos = new ByteArrayOutputStream() val rbDos = new DataOutputStream(rbBos) rb.serialize(rbDos) rbBos.writeTo(rbFos) rbBos.close() rbDos.close() rbFos.close() val rbPath = new Path(rbFile.toString) val conf = new Configuration() val fin = rbPath.getFileSystem(conf).open(rbPath) val rbFileSize = rbPath.getFileSystem(conf).getFileStatus(rbPath).getLen val rbFiber = BitmapFiberId( () => loadRbFile(fin, 0L, rbFileSize.toInt), rbPath.toString, 0, 0) val rbWfc = new OapBitmapWrappedFiberCache( OapRuntime.getOrCreate.fiberCacheManager.get(rbFiber)) rbWfc.init val chunkLength = rbWfc.getTotalChunkLength val length = dataIdx.size / CHUNK_SIZE assert(chunkLength == (length + 1)) val chunkKeys = rbWfc.getChunkKeys assert(chunkKeys(0).toInt == 0) rbWfc.setOffset(0) val chunk = rbWfc.getIteratorForChunk(0) chunk match { case RunChunkIterator(rbWfc) => assert(chunk == RunChunkIterator(rbWfc)) case ArrayChunkIterator(rbWfc, 0) => assert(chunk == ArrayChunkIterator(rbWfc, 0)) case BitmapChunkIterator(rbWfc) => assert(chunk == BitmapChunkIterator(rbWfc)) case _ => throw new OapException("unexpected chunk in OapBitmapWrappedFiberCache.") } rbWfc.release fin.close dir.delete }) } }
Example 147
Source File: CodecFactorySuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.conf.Configuration import org.apache.parquet.format.CompressionCodec import org.scalacheck.{Arbitrary, Gen, Properties} import org.scalacheck.Prop.forAllNoShrink import org.scalatest.prop.Checkers import org.apache.spark.SparkFunSuite import org.apache.spark.sql.execution.datasources.oap.adapter.PropertiesAdapter class CodecFactoryCheck extends Properties("CodecFactory") { private val codecFactory = new CodecFactory(new Configuration()) private val gen = Gen.sized { size => for { codec <- Arbitrary.arbitrary[CompressionCodec] times <- Gen.posNum[Int] bytes <- Gen.containerOfN[Array, Byte](size * 100, Arbitrary.arbitrary[Byte]) } yield (codec, times, bytes) } property("compress/decompress") = forAllNoShrink(gen) { // Array[Array[Byte]] means one group of fibers' data case (codec, times, bytes) => val compressor = codecFactory.getCompressor(codec) val decompressor = codecFactory.getDecompressor(codec) (0 until times).forall(_ => decompressor.decompress(compressor.compress(bytes), bytes.length) .sameElements(bytes)) } implicit lazy val arbCompressionCodec: Arbitrary[CompressionCodec] = { Arbitrary(genCompressionCodec) } private lazy val genCompressionCodec: Gen[CompressionCodec] = Gen.oneOf( CompressionCodec.UNCOMPRESSED, CompressionCodec.GZIP, CompressionCodec.SNAPPY, CompressionCodec.LZO) } class CodecFactorySuite extends SparkFunSuite with Checkers { test("Check CodecFactory Compress/Decompress") { check(PropertiesAdapter.getProp(new CodecFactoryCheck())) } }
Example 148
Source File: TestDataFile.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType private[oap] case class TestDataFile(path: String, schema: StructType, configuration: Configuration) extends DataFile { override def iterator( requiredIds: Array[Int], filters: Seq[Filter]): OapCompletionIterator[Any] = new OapCompletionIterator(Iterator.empty, {}) override def iteratorWithRowIds( requiredIds: Array[Int], rowIds: Array[Int], filters: Seq[Filter]): OapCompletionIterator[Any] = new OapCompletionIterator(Iterator.empty, {}) override def totalRows(): Long = 0 override def getDataFileMeta(): DataFileMeta = throw new UnsupportedOperationException override def cache(groupId: Int, fiberId: Int): FiberCache = throw new UnsupportedOperationException }
Example 149
Source File: DataFileSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.QueryTest import org.apache.spark.sql.execution.datasources.OapException import org.apache.spark.sql.execution.datasources.oap.OapFileFormat import org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase import org.apache.spark.sql.test.oap.SharedOapContext import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils class DataFileSuite extends QueryTest with SharedOapContext { override def beforeEach(): Unit = { val path = Utils.createTempDir().getAbsolutePath } // Override afterEach because OapDataFile will open a InputStream for OapDataFileMeta // but no method to manual close it and we can not to check open streams. override def afterEach(): Unit = {} test("apply and cache") { val data = (0 to 10).map(i => (i, (i + 'a').toChar.toString)) val schema = new StructType() val config = new Configuration() withTempPath { dir => val df = spark.createDataFrame(data) df.repartition(1).write.format("oap").save(dir.getAbsolutePath) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) val datafile = DataFile(file, schema, OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME, config) assert(datafile.path == file) assert(datafile.schema == schema) assert(datafile.configuration == config) } withTempPath { dir => val df = spark.createDataFrame(data) df.repartition(1).write.parquet(dir.getAbsolutePath) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) val datafile = DataFile(file, schema, OapFileFormat.PARQUET_DATA_FILE_CLASSNAME, config) assert(datafile.path == file) assert(datafile.schema == schema) assert(datafile.configuration == config) } withTempPath { dir => val df = spark.createDataFrame(data) df.repartition(1).write.format("orc").save(dir.getAbsolutePath) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) val datafile = DataFile(file, schema, OapFileFormat.ORC_DATA_FILE_CLASSNAME, config) assert(datafile.path == file) assert(datafile.schema == schema) assert(datafile.configuration == config) } // DataFile object is global. After OrcDataFile is added, then need to change to 3 if // we run the whole tests. assert(DataFile.cachedConstructorCount == 3) intercept[OapException] { DataFile("nofile", schema, "NotExistClass", config) assert(DataFile.cachedConstructorCount == 2) } } test("DataFile equals") { val data = (0 to 10).map(i => (i, (i + 'a').toChar.toString)) val schema = new StructType() val config = new Configuration() withTempPath { dir => val df = spark.createDataFrame(data) df.repartition(1).write.parquet(dir.getAbsolutePath) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) val datafile1 = DataFile(file, schema, OapFileFormat.PARQUET_DATA_FILE_CLASSNAME, config) val datafile2 = DataFile(file, schema, OapFileFormat.PARQUET_DATA_FILE_CLASSNAME, config) assert(datafile1.equals(datafile2)) assert(datafile1.hashCode() == datafile2.hashCode()) } withTempPath { dir => val df = spark.createDataFrame(data) df.repartition(1).write.format("oap").save(dir.getAbsolutePath) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) val datafile1 = DataFile(file, schema, OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME, config) val datafile2 = DataFile(file, schema, OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME, config) assert(datafile1.equals(datafile2)) assert(datafile1.hashCode() == datafile2.hashCode()) } } }
Example 150
Source File: SharedOapContext.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test.oap import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.{OapExtensions, SparkSession} import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec, SparkPlan} import org.apache.spark.sql.execution.datasources.oap.{IndexType, OapFileFormat} import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.oap.{OapDriverRuntime, OapRuntime} import org.apache.spark.sql.test.OapSharedSQLContext trait SharedOapContext extends SharedOapContextBase { protected override def createSparkSession: SparkSession = { SparkSession.cleanupAnyExistingSession() val session = SparkSession.builder() .master("local[2]") .appName("test-oap-context") .config(oapSparkConf).getOrCreate() OapRuntime.getOrCreate.asInstanceOf[OapDriverRuntime].setTestSession(session) session } } protected def withFileSystem(f: FileSystem => Unit): Unit = { var fs: FileSystem = null try { fs = FileSystem.get(configuration) f(fs) } finally { if (fs != null) { fs.close() } } } } case class TestPartition(key: String, value: String) case class TestIndex( tableName: String, indexName: String, partitions: TestPartition*)
Example 151
Source File: ArrowFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.arrow import scala.collection.JavaConverters._ import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat.UnsafeItr import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions} import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._ import org.apache.arrow.dataset.scanner.ScanOptions import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileStatus import org.apache.hadoop.mapreduce.Job import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils import org.apache.spark.sql.sources.{DataSourceRegister, Filter} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap; class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable { val batchSize = 4096 def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = { ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava)) } override def inferSchema( sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { convert(files, options) } override def prepareWrite( sparkSession: SparkSession, job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = { throw new UnsupportedOperationException("Write is not supported for Arrow source") } override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true override def buildReaderWithPartitionValues(sparkSession: SparkSession, dataSchema: StructType, partitionSchema: StructType, requiredSchema: StructType, filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { (file: PartitionedFile) => { val sqlConf = sparkSession.sessionState.conf; val enableFilterPushDown = sqlConf.arrowFilterPushDown val factory = ArrowUtils.makeArrowDiscovery( file.filePath, new ArrowOptions( new CaseInsensitiveStringMap( options.asJava).asScala.toMap)) // todo predicate validation / pushdown val dataset = factory.finish(); val filter = if (enableFilterPushDown) { ArrowFilters.translateFilters(filters) } else { org.apache.arrow.dataset.filter.Filter.EMPTY } val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray, filter, batchSize) val scanner = dataset.newScan(scanOptions) val itrList = scanner .scan() .iterator() .asScala .map(task => task.scan()) .toList val itr = itrList .toIterator .flatMap(itr => itr.asScala) .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema)) new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]] } } override def shortName(): String = "arrow" } object ArrowFileFormat { class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] { override def hasNext: Boolean = delegate.hasNext override def next(): T = delegate.next() } }
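Because the class above registers the short name "arrow" through DataSourceRegister, it can be addressed from the DataFrame reader. A hedged sketch, assuming the Arrow data source jar is on the session classpath; the input path is a placeholder.

import org.apache.spark.sql.SparkSession

// Hypothetical driver reading Arrow files through the registered short name.
object ArrowReadUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("arrow-read-example")
      .getOrCreate()

    // Placeholder path to a directory of Arrow files.
    val df = spark.read.format("arrow").load("/tmp/arrow-data")
    df.show(10)
    spark.stop()
  }
}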
Example 152
Source File: WithHDFSSupport.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileUtil import org.apache.hadoop.hdfs.MiniDFSCluster import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, Suite} trait WithHDFSSupport extends BeforeAndAfterAll { self: Suite => protected var sparkSession: SparkSession = _ private var hdfsCluster: MiniDFSCluster = _ protected var hdfsURI: String = _ private def cleanupAnyExistingSession(): Unit = { val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession) if (session.isDefined) { session.get.sessionState.catalog.reset() session.get.stop() SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } } override protected def beforeAll(): Unit = { super.beforeAll() cleanupAnyExistingSession() val baseDir = new File("./target/hdfs/").getAbsoluteFile() FileUtil.fullyDelete(baseDir) val conf = new Configuration() conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath()) val builder = new MiniDFSCluster.Builder(conf) hdfsCluster = builder.build() hdfsURI = s"hdfs://localhost:${hdfsCluster.getNameNodePort()}/" sparkSession = SparkSession.builder() .master("local") .appName(this.getClass.getCanonicalName) .enableHiveSupport() .config("spark.hadoop.fs.defaultFS", hdfsURI) .config("spark.ui.enabled", "false") .getOrCreate() } override protected def afterAll(): Unit = { try { sparkSession.sessionState.catalog.reset() sparkSession.stop() SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } finally { sparkSession = null } System.clearProperty("spark.driver.port") hdfsCluster.shutdown(true) super.afterAll() } }
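A hedged sketch of a suite that mixes in the trait above to get a mini-DFS-backed SparkSession; the suite name and output path are illustrative.

package com.hortonworks.spark.atlas

import org.scalatest.FunSuite

// Hypothetical suite relying on sparkSession and hdfsURI provided by WithHDFSSupport.
class HdfsBackedExampleSuite extends FunSuite with WithHDFSSupport {
  test("writes and reads a Parquet table on the mini HDFS cluster") {
    val spark = sparkSession
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")
    // fs.defaultFS points at hdfsURI, so this path lands on the mini cluster.
    df.write.mode("overwrite").parquet("/tmp/example")

    assert(spark.read.parquet("/tmp/example").count() == 2)
  }
}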
Example 153
Source File: HadoopConfig.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} import scala.language.implicitConversions import org.apache.hadoop.conf.Configuration import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.util.Constants._ class HadoopConfig(config: UserConfig) { def withHadoopConf(conf: Configuration): UserConfig = { config.withBytes(HADOOP_CONF, serializeHadoopConf(conf)) } def hadoopConf: Configuration = deserializeHadoopConf(config.getBytes(HADOOP_CONF).get) private def serializeHadoopConf(conf: Configuration): Array[Byte] = { val out = new ByteArrayOutputStream() val dataOut = new DataOutputStream(out) conf.write(dataOut) dataOut.close() out.toByteArray } private def deserializeHadoopConf(bytes: Array[Byte]): Configuration = { val in = new ByteArrayInputStream(bytes) val dataIn = new DataInputStream(in) val result = new Configuration() result.readFields(dataIn) dataIn.close() result } } object HadoopConfig { def empty: HadoopConfig = new HadoopConfig(UserConfig.empty) def apply(config: UserConfig): HadoopConfig = new HadoopConfig(config) implicit def userConfigToHadoopConfig(userConf: UserConfig): HadoopConfig = { HadoopConfig(userConf) } }
Example 154
Source File: SequenceFileIO.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import org.apache.hadoop.conf.Configuration import org.slf4j.Logger import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.cluster.client.ClientContext import org.apache.gearpump.cluster.main.{ArgumentsParser, CLIOption, ParseResult} import org.apache.gearpump.streaming.partitioner.ShufflePartitioner import org.apache.gearpump.streaming.examples.fsio.HadoopConfig._ import org.apache.gearpump.streaming.{Processor, StreamApplication} import org.apache.gearpump.util.Graph._ import org.apache.gearpump.util.{AkkaApp, Graph, LogUtil} object SequenceFileIO extends AkkaApp with ArgumentsParser { private val LOG: Logger = LogUtil.getLogger(getClass) override val options: Array[(String, CLIOption[Any])] = Array( "source" -> CLIOption[Int]("<sequence file reader number>", required = false, defaultValue = Some(1)), "sink" -> CLIOption[Int]("<sequence file writer number>", required = false, defaultValue = Some(1)), "input" -> CLIOption[String]("<input file path>", required = true), "output" -> CLIOption[String]("<output file directory>", required = true) ) def application(config: ParseResult): StreamApplication = { val spoutNum = config.getInt("source") val boltNum = config.getInt("sink") val input = config.getString("input") val output = config.getString("output") val appConfig = UserConfig.empty.withString(SeqFileStreamProducer.INPUT_PATH, input) .withString(SeqFileStreamProcessor.OUTPUT_PATH, output) val hadoopConfig = appConfig.withHadoopConf(new Configuration()) val partitioner = new ShufflePartitioner() val streamProducer = Processor[SeqFileStreamProducer](spoutNum) val streamProcessor = Processor[SeqFileStreamProcessor](boltNum) val app = StreamApplication("SequenceFileIO", Graph(streamProducer ~ partitioner ~> streamProcessor), hadoopConfig) app } override def main(akkaConf: Config, args: Array[String]): Unit = { val config = parse(args) val context = ClientContext(akkaConf) val appId = context.submit(application(config)) context.close() } }
Example 155
Source File: HadoopConfigSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import org.apache.hadoop.conf.Configuration import org.scalatest.{Matchers, WordSpec} import org.apache.gearpump.cluster.UserConfig class HadoopConfigSpec extends WordSpec with Matchers { "HadoopConfig" should { "serialize and deserialize hadoop configuration properly" in { val hadoopConf = new Configuration() val key = "test_key" val value = "test_value" hadoopConf.set(key, value) val user = UserConfig.empty import org.apache.gearpump.streaming.examples.fsio.HadoopConfig._ assert(user.withHadoopConf(hadoopConf).hadoopConf.get(key) == value) } } }
Example 156
Source File: SeqFileStreamProcessorSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.io.File import java.time.Instant import scala.collection.mutable.ArrayBuffer import akka.actor.ActorSystem import akka.testkit.TestProbe import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile.Reader import org.apache.hadoop.io.{SequenceFile, Text} import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.prop.PropertyChecks import org.scalatest.{BeforeAndAfter, Matchers, PropSpec} import org.apache.gearpump.Message import org.apache.gearpump.cluster.{TestUtil, UserConfig} import org.apache.gearpump.streaming.task.TaskId import org.apache.gearpump.streaming.{MockUtil, Processor} class SeqFileStreamProcessorSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfter { val kvPairs = new ArrayBuffer[(String, String)] val outputDirectory = "SeqFileStreamProcessor_Test" val sequenceFilePath = new Path(outputDirectory + File.separator + TaskId(0, 0)) val hadoopConf = new Configuration() val fs = FileSystem.get(hadoopConf) val textClass = new Text().getClass val _key = new Text() val _value = new Text() val kvGenerator = for { key <- Gen.alphaStr value <- Gen.alphaStr } yield (key, value) before { implicit val system1 = ActorSystem("SeqFileStreamProcessor", TestUtil.DEFAULT_CONFIG) val system2 = ActorSystem("Reporter", TestUtil.DEFAULT_CONFIG) val watcher = TestProbe()(system1) val conf = HadoopConfig(UserConfig.empty.withString(SeqFileStreamProcessor.OUTPUT_PATH, outputDirectory)).withHadoopConf(new Configuration()) val context = MockUtil.mockTaskContext val processorDescription = Processor.ProcessorToProcessorDescription(id = 0, Processor[SeqFileStreamProcessor](1)) val taskId = TaskId(0, 0) when(context.taskId).thenReturn(taskId) val processor = new SeqFileStreamProcessor(context, conf) processor.onStart(Instant.EPOCH) forAll(kvGenerator) { kv => val (key, value) = kv kvPairs.append((key, value)) processor.onNext(Message(key + "++" + value)) } processor.onStop() } property("SeqFileStreamProcessor should write the key-value pairs to a sequence file") { val reader = new SequenceFile.Reader(hadoopConf, Reader.file(sequenceFilePath)) kvPairs.foreach { kv => val (key, value) = kv if (value.length > 0 && reader.next(_key, _value)) { assert(_key.toString == key && _value.toString == value) } } reader.close() } after { fs.deleteOnExit(new Path(outputDirectory)) } }
Example 157
Source File: SeqFileStreamProducerSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.time.Instant import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile.Writer import org.apache.hadoop.io.{SequenceFile, Text} import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.prop.PropertyChecks import org.scalatest.{BeforeAndAfter, Matchers, PropSpec} import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.MockUtil import org.apache.gearpump.streaming.MockUtil._ class SeqFileStreamProducerSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfter { val kvPairs = new ArrayBuffer[(String, String)] val inputFile = "SeqFileStreamProducer_Test" val sequenceFilePath = new Path(inputFile) val hadoopConf = new Configuration() val fs = FileSystem.get(hadoopConf) val textClass = new Text().getClass val _key = new Text() val _value = new Text() val kvGenerator = for { key <- Gen.alphaStr value <- Gen.alphaStr } yield (key, value) before { fs.deleteOnExit(sequenceFilePath) val writer = SequenceFile.createWriter(hadoopConf, Writer.file(sequenceFilePath), Writer.keyClass(textClass), Writer.valueClass(textClass)) forAll(kvGenerator) { kv => _key.set(kv._1) _value.set(kv._2) kvPairs.append((kv._1, kv._2)) writer.append(_key, _value) } writer.close() } property("SeqFileStreamProducer should read the key-value pairs from " + "a sequence file and deliver them") { val conf = HadoopConfig(UserConfig.empty.withString(SeqFileStreamProducer.INPUT_PATH, inputFile)).withHadoopConf(new Configuration()) val context = MockUtil.mockTaskContext val producer = new SeqFileStreamProducer(context, conf) producer.onStart(Instant.EPOCH) producer.onNext(Message("start")) val expected = kvPairs.map(kv => kv._1 + "++" + kv._2).toSet verify(context).output(argMatch[Message](msg => expected.contains(msg.value.asInstanceOf[String]))) } after { fs.deleteOnExit(sequenceFilePath) } }
Example 158
Source File: WindowAverageApp.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.state import akka.actor.ActorSystem import org.apache.hadoop.conf.Configuration import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.cluster.client.ClientContext import org.apache.gearpump.cluster.main.{ArgumentsParser, CLIOption, ParseResult} import org.apache.gearpump.streaming.partitioner.HashPartitioner import org.apache.gearpump.streaming.examples.state.processor.{NumberGeneratorProcessor, WindowAverageProcessor} import org.apache.gearpump.streaming.hadoop.HadoopCheckpointStoreFactory import org.apache.gearpump.streaming.state.impl.{PersistentStateConfig, WindowConfig} import org.apache.gearpump.streaming.{Processor, StreamApplication} import org.apache.gearpump.util.Graph.Node import org.apache.gearpump.util.{AkkaApp, Graph} object WindowAverageApp extends AkkaApp with ArgumentsParser { override val options: Array[(String, CLIOption[Any])] = Array( "gen" -> CLIOption("<how many gen tasks>", required = false, defaultValue = Some(1)), "window" -> CLIOption("<how many window tasks>", required = false, defaultValue = Some(1)), "window_size" -> CLIOption("<window size in milliseconds>", required = false, defaultValue = Some(5000)), "window_step" -> CLIOption("<window step in milliseconds>", required = false, defaultValue = Some(5000)) ) def application(config: ParseResult)(implicit system: ActorSystem): StreamApplication = { val windowSize = config.getInt("window_size") val windowStep = config.getInt("window_step") val checkpointStoreFactory = new HadoopCheckpointStoreFactory("MessageCount", new Configuration) val taskConfig = UserConfig.empty. withBoolean(PersistentStateConfig.STATE_CHECKPOINT_ENABLE, true) .withLong(PersistentStateConfig.STATE_CHECKPOINT_INTERVAL_MS, 1000L) .withValue(PersistentStateConfig.STATE_CHECKPOINT_STORE_FACTORY, checkpointStoreFactory) .withValue(WindowConfig.NAME, WindowConfig(windowSize, windowStep)) val gen = Processor[NumberGeneratorProcessor](config.getInt("gen")) val count = Processor[WindowAverageProcessor](config.getInt("window"), taskConf = taskConfig) val partitioner = new HashPartitioner() val app = StreamApplication("WindowAverage", Graph(gen ~ partitioner ~> count), UserConfig.empty) app } override def main(akkaConf: Config, args: Array[String]): Unit = { val config = parse(args) val context = ClientContext(akkaConf) implicit val system = context.system val appId = context.submit(application(config)) context.close() } }
Example 159
Source File: MessageCountApp.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.state import java.util.Properties import akka.actor.ActorSystem import org.apache.gearpump.streaming.kafka.util.KafkaConfig import org.apache.hadoop.conf.Configuration import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.cluster.client.ClientContext import org.apache.gearpump.cluster.main.{ArgumentsParser, CLIOption, ParseResult} import org.apache.gearpump.streaming.partitioner.HashPartitioner import org.apache.gearpump.streaming.examples.state.processor.CountProcessor import org.apache.gearpump.streaming.hadoop.HadoopCheckpointStoreFactory import org.apache.gearpump.streaming.hadoop.lib.rotation.FileSizeRotation import org.apache.gearpump.streaming.kafka.{KafkaStoreFactory, KafkaSink, KafkaSource} import org.apache.gearpump.streaming.sink.DataSinkProcessor import org.apache.gearpump.streaming.source.DataSourceProcessor import org.apache.gearpump.streaming.state.impl.PersistentStateConfig import org.apache.gearpump.streaming.{Processor, StreamApplication} import org.apache.gearpump.util.Graph.Node import org.apache.gearpump.util.{AkkaApp, Graph} object MessageCountApp extends AkkaApp with ArgumentsParser { val SOURCE_TASK = "sourceTask" val COUNT_TASK = "countTask" val SINK_TASK = "sinkTask" val SOURCE_TOPIC = "sourceTopic" val SINK_TOPIC = "sinkTopic" val ZOOKEEPER_CONNECT = "zookeeperConnect" val BROKER_LIST = "brokerList" val DEFAULT_FS = "defaultFS" override val options: Array[(String, CLIOption[Any])] = Array( SOURCE_TASK -> CLIOption[Int]("<how many kafka source tasks>", required = false, defaultValue = Some(1)), COUNT_TASK -> CLIOption("<how many count tasks>", required = false, defaultValue = Some(1)), SINK_TASK -> CLIOption[Int]("<how many kafka sink tasks>", required = false, defaultValue = Some(1)), SOURCE_TOPIC -> CLIOption[String]("<kafka source topic>", required = true), SINK_TOPIC -> CLIOption[String]("<kafka sink topic>", required = true), ZOOKEEPER_CONNECT -> CLIOption[String]("<Zookeeper connect string, e.g. localhost:2181/kafka>", required = true), BROKER_LIST -> CLIOption[String]("<Kafka broker list, e.g. localhost:9092>", required = true), DEFAULT_FS -> CLIOption[String]("<name of the default file system, e.g. hdfs://localhost:9000>", required = true) ) def application(config: ParseResult)(implicit system: ActorSystem): StreamApplication = { val appName = "MessageCount" val hadoopConfig = new Configuration hadoopConfig.set("fs.defaultFS", config.getString(DEFAULT_FS)) val checkpointStoreFactory = new HadoopCheckpointStoreFactory("MessageCount", hadoopConfig, // Rotates on 1KB new FileSizeRotation(1000)) val taskConfig = UserConfig.empty .withBoolean(PersistentStateConfig.STATE_CHECKPOINT_ENABLE, true) .withLong(PersistentStateConfig.STATE_CHECKPOINT_INTERVAL_MS, 1000L) .withValue(PersistentStateConfig.STATE_CHECKPOINT_STORE_FACTORY, checkpointStoreFactory) val properties = new Properties properties.put(KafkaConfig.ZOOKEEPER_CONNECT_CONFIG, config.getString(ZOOKEEPER_CONNECT)) val brokerList = config.getString(BROKER_LIST) properties.put(KafkaConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList) properties.put(KafkaConfig.CHECKPOINT_STORE_NAME_PREFIX_CONFIG, appName) val kafkaStoreFactory = new KafkaStoreFactory(properties) val sourceTopic = config.getString(SOURCE_TOPIC) val kafkaSource = new KafkaSource(sourceTopic, properties) kafkaSource.setCheckpointStore(kafkaStoreFactory) val sourceProcessor = DataSourceProcessor(kafkaSource, config.getInt(SOURCE_TASK)) val countProcessor = Processor[CountProcessor](config.getInt(COUNT_TASK), taskConf = taskConfig) val kafkaSink = new KafkaSink(config.getString(SINK_TOPIC), properties) val sinkProcessor = DataSinkProcessor(kafkaSink, config.getInt(SINK_TASK)) val partitioner = new HashPartitioner() val graph = Graph(sourceProcessor ~ partitioner ~> countProcessor ~ partitioner ~> sinkProcessor) val app = StreamApplication(appName, graph, UserConfig.empty) app } def main(akkaConf: Config, args: Array[String]): Unit = { val config = parse(args) val context = ClientContext(akkaConf) implicit val system = context.system val appId = context.submit(application(config)) context.close() } }
Example 160
Source File: DFSJarStore.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.jarstore.dfs import java.io.{InputStream, OutputStream} import org.apache.gearpump.util.Constants import org.apache.gearpump.jarstore.JarStore import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import com.typesafe.config.Config import org.apache.hadoop.fs.permission.{FsAction, FsPermission} override def getFile(fileName: String): InputStream = { val filePath = new Path(rootPath, fileName) val fs = filePath.getFileSystem(new Configuration()) fs.open(filePath) } private def createDirIfNotExists(path: Path): Unit = { val fs = path.getFileSystem(new Configuration()) if (!fs.exists(path)) { fs.mkdirs(path, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)) } } }
Example 161
Source File: HadoopCheckpointStoreFactory.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.hadoop import java.io.{ObjectInputStream, ObjectOutputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.gearpump.streaming.hadoop.lib.HadoopUtil import org.apache.gearpump.streaming.hadoop.lib.rotation.{FileSizeRotation, Rotation} import org.apache.gearpump.streaming.transaction.api.{CheckpointStore, CheckpointStoreFactory} object HadoopCheckpointStoreFactory { val VERSION = 1 } class HadoopCheckpointStoreFactory( dir: String, @transient private var hadoopConfig: Configuration, rotation: Rotation = new FileSizeRotation(128 * Math.pow(2, 20).toLong)) extends CheckpointStoreFactory { import org.apache.gearpump.streaming.hadoop.HadoopCheckpointStoreFactory._ private def readObject(in: ObjectInputStream): Unit = { in.defaultReadObject() hadoopConfig = new Configuration(false) hadoopConfig.readFields(in) } override def getCheckpointStore(name: String): CheckpointStore = { val dirPath = new Path(dir + Path.SEPARATOR + s"v$VERSION", name) val fs = HadoopUtil.getFileSystemForPath(dirPath, hadoopConfig) new HadoopCheckpointStore(dirPath, fs, hadoopConfig, rotation) } }
Example 162
Source File: HadoopCheckpointStoreReader.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.hadoop.lib import java.io.EOFException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.gearpump.Time.MilliSeconds class HadoopCheckpointStoreReader( path: Path, hadoopConfig: Configuration) extends Iterator[(MilliSeconds, Array[Byte])] { private val stream = HadoopUtil.getInputStream(path, hadoopConfig) private var nextTimeStamp: Option[MilliSeconds] = None private var nextData: Option[Array[Byte]] = None override def hasNext: Boolean = { if (nextTimeStamp.isDefined) { true } else { try { nextTimeStamp = Some(stream.readLong()) val length = stream.readInt() val buffer = new Array[Byte](length) stream.readFully(buffer) nextData = Some(buffer) true } catch { case e: EOFException => close() false case e: Exception => close() throw e } } } override def next(): (MilliSeconds, Array[Byte]) = { val timeAndData = for { time <- nextTimeStamp data <- nextData } yield (time, data) nextTimeStamp = None nextData = None timeAndData.get } def close(): Unit = { stream.close() } }
Example 163
Source File: HadoopCheckpointStoreWriter.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.hadoop.lib import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.gearpump.Time.MilliSeconds class HadoopCheckpointStoreWriter(path: Path, hadoopConfig: Configuration) { private lazy val stream = HadoopUtil.getOutputStream(path, hadoopConfig) def write(timestamp: MilliSeconds, data: Array[Byte]): Long = { stream.writeLong(timestamp) stream.writeInt(data.length) stream.write(data) stream.hflush() stream.getPos() } def close(): Unit = { stream.close() } }
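This writer pairs with the reader from the previous example; a small round-trip sketch, assuming a local placeholder path that does not yet exist:

import org.apache.gearpump.streaming.hadoop.lib.{HadoopCheckpointStoreReader, HadoopCheckpointStoreWriter}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object CheckpointRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val hadoopConf = new Configuration()
    val path = new Path("file:///tmp/checkpoint-sketch.store") // placeholder path

    // Write one timestamped record, then read it back through the iterator API.
    val writer = new HadoopCheckpointStoreWriter(path, hadoopConf)
    writer.write(0L, Array[Byte](1, 2, 3))
    writer.close()

    val reader = new HadoopCheckpointStoreReader(path, hadoopConf)
    assert(reader.hasNext)
    val (timestamp, data) = reader.next()
    assert(timestamp == 0L && data.sameElements(Array[Byte](1, 2, 3)))
    reader.close()
  }
}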
Example 164
Source File: HadoopUtil.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.hadoop.lib import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.security.UserGroupInformation import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.util.{Constants, FileUtils} private[hadoop] object HadoopUtil { def getOutputStream(path: Path, hadoopConfig: Configuration): FSDataOutputStream = { val dfs = getFileSystemForPath(path, hadoopConfig) val stream: FSDataOutputStream = { if (dfs.isFile(path)) { dfs.append(path) } else { dfs.create(path) } } stream } def getInputStream(path: Path, hadoopConfig: Configuration): FSDataInputStream = { val dfs = getFileSystemForPath(path, hadoopConfig) val stream = dfs.open(path) stream } def getFileSystemForPath(path: Path, hadoopConfig: Configuration): FileSystem = { // For local file systems, return the raw local file system, such calls to flush() // actually flushes the stream. val fs = path.getFileSystem(hadoopConfig) fs match { case localFs: LocalFileSystem => localFs.getRawFileSystem case _ => fs } } def login(userConfig: UserConfig, configuration: Configuration): Unit = { if (UserGroupInformation.isSecurityEnabled) { val principal = userConfig.getString(Constants.GEARPUMP_KERBEROS_PRINCIPAL) val keytabContent = userConfig.getBytes(Constants.GEARPUMP_KEYTAB_FILE) if (principal.isEmpty || keytabContent.isEmpty) { val errorMsg = s"HDFS is security enabled, user should provide kerberos principal in " + s"${Constants.GEARPUMP_KERBEROS_PRINCIPAL} " + s"and keytab file in ${Constants.GEARPUMP_KEYTAB_FILE}" throw new Exception(errorMsg) } val keytabFile = File.createTempFile("login", ".keytab") FileUtils.writeByteArrayToFile(keytabFile, keytabContent.get) keytabFile.setExecutable(false) keytabFile.setWritable(false) keytabFile.setReadable(true, true) UserGroupInformation.setConfiguration(configuration) UserGroupInformation.loginUserFromKeytab(principal.get, keytabFile.getAbsolutePath) keytabFile.delete() } } }
Example 165
Source File: HadoopCheckpointStoreIntegrationSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.hadoop import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.mock.MockitoSugar import org.scalatest.prop.PropertyChecks import org.scalatest.{Matchers, PropSpec} import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.MockUtil import org.apache.gearpump.streaming.hadoop.lib.HadoopUtil import org.apache.gearpump.streaming.hadoop.lib.rotation.FileSizeRotation import org.apache.gearpump.streaming.task.TaskId class HadoopCheckpointStoreIntegrationSpec extends PropSpec with PropertyChecks with MockitoSugar with Matchers { property("HadoopCheckpointStore should persist and recover checkpoints") { val fileSizeGen = Gen.chooseNum[Int](100, 1000) forAll(fileSizeGen) { (fileSize: Int) => val userConfig = UserConfig.empty val taskContext = MockUtil.mockTaskContext val hadoopConfig = new Configuration() when(taskContext.appId).thenReturn(0) when(taskContext.taskId).thenReturn(TaskId(0, 0)) val rootDirName = "test" val rootDir = new Path(rootDirName + Path.SEPARATOR + s"v${HadoopCheckpointStoreFactory.VERSION}") val subDirName = "app0-task0_0" val subDir = new Path(rootDir, subDirName) val fs = HadoopUtil.getFileSystemForPath(rootDir, hadoopConfig) fs.delete(rootDir, true) fs.exists(rootDir) shouldBe false val checkpointStoreFactory = new HadoopCheckpointStoreFactory( rootDirName, hadoopConfig, new FileSizeRotation(fileSize)) val checkpointStore = checkpointStoreFactory.getCheckpointStore(subDirName) checkpointStore.persist(0L, Array(0.toByte)) val tempFile = new Path(subDir, "checkpoints-0.store") fs.exists(tempFile) shouldBe true checkpointStore.persist(1L, Array.fill(fileSize)(0.toByte)) fs.exists(tempFile) shouldBe false fs.exists(new Path(subDir, "checkpoints-0-1.store")) shouldBe true checkpointStore.persist(2L, Array(0.toByte)) val newTempFile = new Path(subDir, "checkpoints-2.store") fs.exists(newTempFile) shouldBe true for (i <- 0 to 2) { val optCp = checkpointStore.recover(i) optCp should not be empty } fs.exists(newTempFile) shouldBe false fs.exists(new Path(subDir, "checkpoints-2-2.store")) shouldBe true checkpointStore.close() fs.delete(rootDir, true) fs.close() } } }
Example 166
Source File: HBaseSinkSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.external.hbase import akka.actor.ActorSystem import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.external.hbase.HBaseSink.{HBaseWriter, HBaseWriterFactory} import org.apache.gearpump.streaming.MockUtil import org.apache.gearpump.streaming.task.TaskContext import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase.TableName import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.util.Bytes import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.mock.MockitoSugar import org.scalatest.prop.PropertyChecks import org.scalatest.{Matchers, PropSpec} class HBaseSinkSpec extends PropSpec with PropertyChecks with Matchers with MockitoSugar { property("HBaseSink should invoke HBaseWriter for writing message to HBase") { val hbaseWriter = mock[HBaseWriter] val hbaseWriterFactory = mock[HBaseWriterFactory] implicit val system: ActorSystem = MockUtil.system val userConfig = UserConfig.empty val tableName = "hbase" when(hbaseWriterFactory.getHBaseWriter(userConfig, tableName)) .thenReturn(hbaseWriter) val hbaseSink = new HBaseSink(userConfig, tableName, hbaseWriterFactory) hbaseSink.open(MockUtil.mockTaskContext) forAll(Gen.alphaStr) { (value: String) => val message = Message(value) hbaseSink.write(message) verify(hbaseWriter, atLeastOnce()).put(value) } hbaseSink.close() verify(hbaseWriter).close() } property("HBaseWriter should insert a row successfully") { val table = mock[Table] val config = mock[Configuration] val connection = mock[Connection] val taskContext = mock[TaskContext] val map = Map[String, String]("HBASESINK" -> "hbasesink", "TABLE_NAME" -> "hbase.table.name", "COLUMN_FAMILY" -> "hbase.table.column.family", "COLUMN_NAME" -> "hbase.table.column.name", "HBASE_USER" -> "hbase.user", "GEARPUMP_KERBEROS_PRINCIPAL" -> "gearpump.kerberos.principal", "GEARPUMP_KEYTAB_FILE" -> "gearpump.keytab.file" ) val userConfig = new UserConfig(map) val tableName = "hbase" val row = "row" val group = "group" val name = "name" val value = "3.0" when(connection.getTable(TableName.valueOf(tableName))).thenReturn(table) val put = new Put(Bytes.toBytes(row)) put.addColumn(Bytes.toBytes(group), Bytes.toBytes(name), Bytes.toBytes(value)) val hbaseWriter = new HBaseWriter(connection, tableName) hbaseWriter.insert(Bytes.toBytes(row), Bytes.toBytes(group), Bytes.toBytes(name), Bytes.toBytes(value)) verify(table).put(MockUtil.argMatch[Put](_.getRow sameElements Bytes.toBytes(row))) } }
Example 167
Source File: MiniClusterUtils.scala From incubator-livy with Apache License 2.0 | 5 votes |
package org.apache.livy.test.framework import java.io._ import java.nio.charset.StandardCharsets.UTF_8 import java.util.Properties import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration trait MiniClusterUtils { protected def saveProperties(props: Map[String, String], dest: File): Unit = { val jprops = new Properties() props.foreach { case (k, v) => jprops.put(k, v) } val tempFile = new File(dest.getAbsolutePath() + ".tmp") val out = new OutputStreamWriter(new FileOutputStream(tempFile), UTF_8) try { jprops.store(out, "Configuration") } finally { out.close() } tempFile.renameTo(dest) } protected def loadProperties(file: File): Map[String, String] = { val in = new InputStreamReader(new FileInputStream(file), UTF_8) val props = new Properties() try { props.load(in) } finally { in.close() } props.asScala.toMap } protected def saveConfig(conf: Configuration, dest: File): Unit = { val redacted = new Configuration(conf) // This setting references a test class that is not available when using a real Spark // installation, so remove it from client configs. redacted.unset("net.topology.node.switch.mapping.impl") val out = new FileOutputStream(dest) try { redacted.writeXml(out) } finally { out.close() } } }
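A small sketch of round-tripping a configuration map through these helpers; the destination file is a placeholder:

import java.io.File

import org.apache.livy.test.framework.MiniClusterUtils

object MiniClusterUtilsSketch extends MiniClusterUtils {
  def main(args: Array[String]): Unit = {
    val dest = new File("/tmp/livy-sketch.properties") // placeholder path
    // Properties are staged to a .tmp file and renamed, then read back as a Map.
    saveProperties(Map("spark.master" -> "local[*]"), dest)
    val loaded = loadProperties(dest)
    assert(loaded("spark.master") == "local[*]")
  }
}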
Example 168
Source File: HDFSCluster.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hdfs.MiniDFSCluster import org.apache.hadoop.test.PathUtils class HDFSCluster extends HDFSClusterLike trait HDFSClusterLike { @transient private var hdfsCluster: MiniDFSCluster = null def startHDFS() = { println("Starting HDFS Cluster...") val baseDir = new File(PathUtils.getTestDir(getClass()), "miniHDFS") val conf = new Configuration() conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath()) val builder = new MiniDFSCluster.Builder(conf) hdfsCluster = builder.nameNodePort(8020).format(true).build() hdfsCluster.waitClusterUp() } def getNameNodeURI(): String = { "hdfs://localhost:" + hdfsCluster.getNameNodePort() } def shutdownHDFS(): Unit = { hdfsCluster.shutdown() } }
Example 169
Source File: HDFSClusterTest.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import java.io.{ BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter} import com.holdenkarau.spark.testing.{RDDComparisons, SharedSparkContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.rdd.RDD import org.scalatest.FunSuite class HDFSClusterTest extends FunSuite with SharedSparkContext with RDDComparisons { var hdfsCluster: HDFSCluster = null override def beforeAll(): Unit = { super.beforeAll() hdfsCluster = new HDFSCluster hdfsCluster.startHDFS() } test("get the namenode uri") { val nameNodeURI = hdfsCluster.getNameNodeURI() assert(nameNodeURI == "hdfs://localhost:8020") } test("read and write from spark to hdfs") { val list = List(1, 2, 3, 4, 5) val numRDD: RDD[Int] = sc.parallelize(list) val path = hdfsCluster.getNameNodeURI() + "/myRDD" numRDD.saveAsTextFile(path) val loadedRDD: RDD[Int] = sc.textFile(path).map(_.toInt) assertRDDEquals(numRDD, loadedRDD) } test("test creating local file to hdfs") { val path = new Path(hdfsCluster.getNameNodeURI() + "/myfile") val fs = FileSystem.get(path.toUri, new Configuration()) val writer = new BufferedWriter(new OutputStreamWriter(fs.create(path))) val writtenString = "hello, it's me" writer.write(writtenString) writer.close() val reader = new BufferedReader(new InputStreamReader(fs.open(path))) val readString = reader.readLine() reader.close() assert(writtenString == readString) } override def afterAll() { hdfsCluster.shutdownHDFS() super.afterAll() } }
Example 170
Source File: OrcFileOperator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.flatMap(getFileReader(_, conf)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
Example 171
Source File: HiveExternalCatalogSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.types.StructType class HiveExternalCatalogSuite extends ExternalCatalogSuite { private val externalCatalog: HiveExternalCatalog = { val catalog = new HiveExternalCatalog(new SparkConf, new Configuration) catalog.client.reset() catalog } protected override val utils: CatalogTestUtils = new CatalogTestUtils { override val tableInputFormat: String = "org.apache.hadoop.mapred.SequenceFileInputFormat" override val tableOutputFormat: String = "org.apache.hadoop.mapred.SequenceFileOutputFormat" override def newEmptyCatalog(): ExternalCatalog = externalCatalog } protected override def resetState(): Unit = { externalCatalog.client.reset() } import utils._ test("list partitions by filter") { val catalog = newBasicCatalog() val selectedPartitions = catalog.listPartitionsByFilter("db2", "tbl2", Seq('a.int === 1)) assert(selectedPartitions.length == 1) assert(selectedPartitions.head.spec == part1.spec) } test("SPARK-18647: do not put provider in table properties for Hive serde table") { val catalog = newBasicCatalog() val hiveTable = CatalogTable( identifier = TableIdentifier("hive_tbl", Some("db1")), tableType = CatalogTableType.MANAGED, storage = storageFormat, schema = new StructType().add("col1", "int").add("col2", "string"), provider = Some("hive")) catalog.createTable(hiveTable, ignoreIfExists = false) val rawTable = externalCatalog.client.getTable("db1", "hive_tbl") assert(!rawTable.properties.contains(HiveExternalCatalog.DATASOURCE_PROVIDER)) assert(externalCatalog.getTable("db1", "hive_tbl").provider == Some(DDLUtils.HIVE_PROVIDER)) } }
Example 172
Source File: HiveClientBuilder.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.util.VersionInfo import org.apache.spark.SparkConf import org.apache.spark.util.Utils private[client] class HiveClientBuilder { private val sparkConf = new SparkConf() // In order to speed up test execution during development or in Jenkins, you can specify the path // of an existing Ivy cache: private val ivyPath: Option[String] = { sys.env.get("SPARK_VERSIONS_SUITE_IVY_PATH").orElse( Some(new File(sys.props("java.io.tmpdir"), "hive-ivy-cache").getAbsolutePath)) } private def buildConf() = { lazy val warehousePath = Utils.createTempDir() lazy val metastorePath = Utils.createTempDir() metastorePath.delete() Map( "javax.jdo.option.ConnectionURL" -> s"jdbc:derby:;databaseName=$metastorePath;create=true", "hive.metastore.warehouse.dir" -> warehousePath.toString) } def buildClient(version: String, hadoopConf: Configuration): HiveClient = { IsolatedClientLoader.forVersion( hiveMetastoreVersion = version, hadoopVersion = VersionInfo.getVersion, sparkConf = sparkConf, hadoopConf = hadoopConf, config = buildConf(), ivyPath = ivyPath).createClient() } }
Example 173
Source File: HiveClientSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.conf.HiveConf import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal} import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.types.IntegerType class HiveClientSuite extends SparkFunSuite { private val clientBuilder = new HiveClientBuilder private val tryDirectSqlKey = HiveConf.ConfVars.METASTORE_TRY_DIRECT_SQL.varname test(s"getPartitionsByFilter returns all partitions when $tryDirectSqlKey=false") { val testPartitionCount = 5 val storageFormat = CatalogStorageFormat( locationUri = None, inputFormat = None, outputFormat = None, serde = None, compressed = false, properties = Map.empty) val hadoopConf = new Configuration() hadoopConf.setBoolean(tryDirectSqlKey, false) val client = clientBuilder.buildClient(HiveUtils.hiveExecutionVersion, hadoopConf) client.runSqlHive("CREATE TABLE test (value INT) PARTITIONED BY (part INT)") val partitions = (1 to testPartitionCount).map { part => CatalogTablePartition(Map("part" -> part.toString), storageFormat) } client.createPartitions( "default", "test", partitions, ignoreIfExists = false) val filteredPartitions = client.getPartitionsByFilter(client.getTable("default", "test"), Seq(EqualTo(AttributeReference("part", IntegerType)(), Literal(3)))) assert(filteredPartitions.size == testPartitionCount) } }
Example 174
Source File: CompressionCodecs.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress._ import org.apache.spark.util.Utils object CompressionCodecs { private val shortCompressionCodecNames = Map( "none" -> null, "uncompressed" -> null, "bzip2" -> classOf[BZip2Codec].getName, "deflate" -> classOf[DeflateCodec].getName, "gzip" -> classOf[GzipCodec].getName, "lz4" -> classOf[Lz4Codec].getName, "snappy" -> classOf[SnappyCodec].getName) def setCodecConfiguration(conf: Configuration, codec: String): Unit = { if (codec != null) { conf.set("mapreduce.output.fileoutputformat.compress", "true") conf.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.toString) conf.set("mapreduce.output.fileoutputformat.compress.codec", codec) conf.set("mapreduce.map.output.compress", "true") conf.set("mapreduce.map.output.compress.codec", codec) } else { // This infers the option `compression` is set to `uncompressed` or `none`. conf.set("mapreduce.output.fileoutputformat.compress", "false") conf.set("mapreduce.map.output.compress", "false") } } }
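A short illustration of what the helper does to a Hadoop Configuration, sketched with the gzip codec; the sketch is placed under an org.apache.spark.sql package in case the object is package-private, since Spark normally reaches it through its internal option handling:

package org.apache.spark.sql.sketch

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.sql.catalyst.util.CompressionCodecs

object CompressionCodecsSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    // Turns on block-compressed output for both the job output and the map output.
    CompressionCodecs.setCodecConfiguration(conf, classOf[GzipCodec].getName)
    assert(conf.get("mapreduce.output.fileoutputformat.compress") == "true")
    assert(conf.get("mapreduce.output.fileoutputformat.compress.codec") == classOf[GzipCodec].getName)
  }
}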
Example 175
Source File: HadoopFileLinesReader.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.Closeable import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader} import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl class HadoopFileLinesReader( file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable { private val iterator = { val fileSplit = new FileSplit( new Path(new URI(file.filePath)), file.start, file.length, // TODO: Implement Locality Array.empty) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) val reader = new LineRecordReader() reader.initialize(fileSplit, hadoopAttemptContext) new RecordReaderIterator(reader) } override def hasNext: Boolean = iterator.hasNext override def next(): Text = iterator.next() override def close(): Unit = iterator.close() }
Example 176
Source File: StreamMetadata.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: FSDataOutputStream = null try { val fs = FileSystem.get(hadoopConf) output = fs.create(metadataFile) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case NonFatal(e) => logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } finally { IOUtils.closeQuietly(output) } } }
Example 177
Source File: StreamMetadataSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.File import java.util.UUID import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.streaming.StreamTest class StreamMetadataSuite extends StreamTest { test("writing and reading") { withTempDir { dir => val id = UUID.randomUUID.toString val metadata = StreamMetadata(id) val file = new Path(new File(dir, "test").toString) StreamMetadata.write(metadata, file, hadoopConf) val readMetadata = StreamMetadata.read(file, hadoopConf) assert(readMetadata.nonEmpty) assert(readMetadata.get.id === id) } } test("read Spark 2.1.0 format") { // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0 assert( readForResource("query-metadata-logs-version-2.1.0.txt") === StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e")) } private def readForResource(fileName: String): StreamMetadata = { val input = getClass.getResource(s"/structured-streaming/$fileName") StreamMetadata.read(new Path(input.toString), hadoopConf).get } private val hadoopConf = new Configuration() }
Example 178
Source File: HBaseCredentialProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import scala.reflect.runtime.universe import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging private[security] class HBaseCredentialProvider extends ServiceCredentialProvider with Logging { override def serviceName: String = "hbase" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val obtainToken = mirror.classLoader. loadClass("org.apache.hadoop.hbase.security.token.TokenUtil"). getMethod("obtainToken", classOf[Configuration]) logDebug("Attempting to fetch HBase security token.") val token = obtainToken.invoke(null, hbaseConf(hadoopConf)) .asInstanceOf[Token[_ <: TokenIdentifier]] logInfo(s"Get token from HBase: ${token.toString}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => logDebug(s"Failed to get token from service $serviceName", e) } None } override def credentialsRequired(hadoopConf: Configuration): Boolean = { hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos" } private def hbaseConf(conf: Configuration): Configuration = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val confCreate = mirror.classLoader. loadClass("org.apache.hadoop.hbase.HBaseConfiguration"). getMethod("create", classOf[Configuration]) confCreate.invoke(null, conf).asInstanceOf[Configuration] } catch { case NonFatal(e) => logDebug("Fail to invoke HBaseConfiguration", e) conf } } }
Example 179
Source File: HDFSCredentialProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).flatMap { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val hdfsToken = creds.getAllTokens.asScala .find(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) hdfsToken.map { t => val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 180
Source File: HDFSCredentialProviderSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, PrivateMethodTester} import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} class HDFSCredentialProviderSuite extends SparkFunSuite with PrivateMethodTester with Matchers { private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer) private def getTokenRenewer( hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = { hdfsCredentialProvider invokePrivate _getTokenRenewer(conf) } private var hdfsCredentialProvider: HDFSCredentialProvider = null override def beforeAll() { super.beforeAll() if (hdfsCredentialProvider == null) { hdfsCredentialProvider = new HDFSCredentialProvider() } } override def afterAll() { if (hdfsCredentialProvider != null) { hdfsCredentialProvider = null } super.afterAll() } test("check token renewer") { val hadoopConf = new Configuration() hadoopConf.set("yarn.resourcemanager.address", "myrm:8033") hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:[email protected]") val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf) renewer should be ("yarn/myrm:[email protected]") } test("check token renewer default") { val hadoopConf = new Configuration() val caught = intercept[SparkException] { getTokenRenewer(hdfsCredentialProvider, hadoopConf) } assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer") } }
Example 181
Source File: FileBasedWriteAheadLogRandomReader.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.Closeable import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration private[streaming] class FileBasedWriteAheadLogRandomReader(path: String, conf: Configuration) extends Closeable { private val instream = HdfsUtils.getInputStream(path, conf) private var closed = (instream == null) // the file may be deleted as we're opening the stream def read(segment: FileBasedWriteAheadLogSegment): ByteBuffer = synchronized { assertOpen() instream.seek(segment.offset) val nextLength = instream.readInt() HdfsUtils.checkState(nextLength == segment.length, s"Expected message length to be ${segment.length}, but was $nextLength") val buffer = new Array[Byte](nextLength) instream.readFully(buffer) ByteBuffer.wrap(buffer) } override def close(): Unit = synchronized { closed = true instream.close() } private def assertOpen() { HdfsUtils.checkState(!closed, "Stream is closed. Create a new Reader to read from the file.") } }
Example 182
Source File: HdfsUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.{FileNotFoundException, IOException} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ private[streaming] object HdfsUtils { def getOutputStream(path: String, conf: Configuration): FSDataOutputStream = { val dfsPath = new Path(path) val dfs = getFileSystemForPath(dfsPath, conf) // If the file exists and we have append support, append instead of creating a new file val stream: FSDataOutputStream = { if (dfs.isFile(dfsPath)) { if (conf.getBoolean("hdfs.append.support", false) || dfs.isInstanceOf[RawLocalFileSystem]) { dfs.append(dfsPath) } else { throw new IllegalStateException("File exists and there is no append support!") } } else { dfs.create(dfsPath) } } stream } def getInputStream(path: String, conf: Configuration): FSDataInputStream = { val dfsPath = new Path(path) val dfs = getFileSystemForPath(dfsPath, conf) try { dfs.open(dfsPath) } catch { case _: FileNotFoundException => null case e: IOException => // If we are really unlucky, the file may be deleted as we're opening the stream. // This can happen as clean up is performed by daemon threads that may be left over from // previous runs. if (!dfs.isFile(dfsPath)) null else throw e } } def checkState(state: Boolean, errorMsg: => String) { if (!state) { throw new IllegalStateException(errorMsg) } } def checkFileExists(path: String, conf: Configuration): Boolean = { val hdpPath = new Path(path) val fs = getFileSystemForPath(hdpPath, conf) fs.isFile(hdpPath) } }
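A hedged write/read sketch against the local file system; because HdfsUtils is private[streaming], the sketch assumes it is compiled under the org.apache.spark.streaming package, and the placeholder path must not already exist:

package org.apache.spark.streaming.util

import org.apache.hadoop.conf.Configuration

object HdfsUtilsSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val path = "file:///tmp/hdfs-utils-sketch.bin" // placeholder, created on first write

    val out = HdfsUtils.getOutputStream(path, conf)
    out.writeInt(42)
    out.close()

    val in = HdfsUtils.getInputStream(path, conf)
    assert(in.readInt() == 42)
    in.close()

    assert(HdfsUtils.checkFileExists(path, conf))
  }
}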
Example 183
Source File: FileBasedWriteAheadLogWriter.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io._ import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.util.Utils def write(data: ByteBuffer): FileBasedWriteAheadLogSegment = synchronized { assertOpen() data.rewind() // Rewind to ensure all data in the buffer is retrieved val lengthToWrite = data.remaining() val segment = new FileBasedWriteAheadLogSegment(path, nextOffset, lengthToWrite) stream.writeInt(lengthToWrite) Utils.writeByteBuffer(data, stream: OutputStream) flush() nextOffset = stream.getPos() segment } override def close(): Unit = synchronized { closed = true stream.close() } private def flush() { stream.hflush() // Useful for local file system where hflush/sync does not work (HADOOP-7844) stream.getWrappedStream.flush() } private def assertOpen() { HdfsUtils.checkState(!closed, "Stream is closed. Create a new Writer to write to file.") } }
Example 184
Source File: FileBasedWriteAheadLogReader.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.{Closeable, EOFException, IOException} import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.internal.Logging private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration) extends Iterator[ByteBuffer] with Closeable with Logging { private val instream = HdfsUtils.getInputStream(path, conf) private var closed = (instream == null) // the file may be deleted as we're opening the stream private var nextItem: Option[ByteBuffer] = None override def hasNext: Boolean = synchronized { if (closed) { return false } if (nextItem.isDefined) { // handle the case where hasNext is called without calling next true } else { try { val length = instream.readInt() val buffer = new Array[Byte](length) instream.readFully(buffer) nextItem = Some(ByteBuffer.wrap(buffer)) logTrace("Read next item " + nextItem.get) true } catch { case e: EOFException => logDebug("Error reading next item, EOF reached", e) close() false case e: IOException => logWarning("Error while trying to read data. If the file was deleted, " + "this should be okay.", e) close() if (HdfsUtils.checkFileExists(path, conf)) { // If file exists, this could be a legitimate error throw e } else { // File was deleted. This can occur when the daemon cleanup thread takes time to // delete the file during recovery. false } case e: Exception => logWarning("Error while trying to read data from HDFS.", e) close() throw e } } } override def next(): ByteBuffer = synchronized { val data = nextItem.getOrElse { close() throw new IllegalStateException( "next called without calling hasNext or after hasNext returned false") } nextItem = None // Ensure the next hasNext call loads new data. data } override def close(): Unit = synchronized { if (!closed) { instream.close() } closed = true } }
Example 185
Source File: SerializableWritable.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.ObjectWritable import org.apache.hadoop.io.Writable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils @DeveloperApi class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable { def value: T = t override def toString: String = t.toString private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException { out.defaultWriteObject() new ObjectWritable(t).write(out) } private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException { in.defaultReadObject() val ow = new ObjectWritable() ow.setConf(new Configuration(false)) ow.readFields(in) t = ow.get().asInstanceOf[T] } }
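A minimal round-trip sketch using plain Java serialization, which exercises the custom writeObject/readObject methods shown above:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.io.Text
import org.apache.spark.SerializableWritable

object SerializableWritableSketch {
  def main(args: Array[String]): Unit = {
    val wrapped = new SerializableWritable(new Text("hello"))

    // Serialize the wrapper; the Text is written through ObjectWritable.
    val bytes = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(bytes)
    out.writeObject(wrapped)
    out.close()

    // Deserialize and recover the original Writable value.
    val in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray))
    val restored = in.readObject().asInstanceOf[SerializableWritable[Text]]
    assert(restored.value.toString == "hello")
  }
}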
Example 186
Source File: SerializableConfiguration.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.io.{ObjectInputStream, ObjectOutputStream} import org.apache.hadoop.conf.Configuration private[spark] class SerializableConfiguration(@transient var value: Configuration) extends Serializable { private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException { out.defaultWriteObject() value.write(out) } private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException { value = new Configuration(false) value.readFields(in) } }
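A typical pattern, sketched: wrap the Hadoop configuration once on the driver so a task closure can carry it to executors; since the class is private[spark], the sketch assumes it is compiled under an org.apache.spark subpackage, and the looked-up keys are arbitrary examples:

package org.apache.spark.sketch

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.SerializableConfiguration

object SerializableConfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("serializable-conf-sketch")
      .master("local[*]")
      .getOrCreate()

    // Wrap once on the driver; the wrapper is what the closure captures and ships.
    val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)

    val values = spark.sparkContext.parallelize(Seq("fs.defaultFS", "io.file.buffer.size")).map { key =>
      // Each task sees a deserialized copy of the original Configuration.
      key -> serializableConf.value.get(key)
    }.collect()

    values.foreach(println)
    spark.stop()
  }
}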
Example 187
Source File: WholeTextFileRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
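WholeTextFileRDD is internal; the public entry point that builds it is SparkContext.wholeTextFiles, which yields one (path, fileContent) pair per file. A brief usage sketch (the input directory is hypothetical):

import org.apache.spark.sql.SparkSession

object WholeTextFilesUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("whole-text-files").getOrCreate()

    // Each element is (file path, entire file content), one record per file.
    val files = spark.sparkContext.wholeTextFiles("/tmp/input-dir", minPartitions = 2)
    files.map { case (path, content) => s"$path -> ${content.length} chars" }
      .collect()
      .foreach(println)

    spark.stop()
  }
}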
Example 188
Source File: BinaryFileRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.StreamFileInputFormat

private[spark] class BinaryFileRDD[T](
    @transient private val sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(sc, jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
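Similarly, BinaryFileRDD backs the public SparkContext.binaryFiles API, which returns (path, PortableDataStream) pairs for whole-file binary reads. A brief usage sketch (hypothetical input directory):

import org.apache.spark.sql.SparkSession

object BinaryFilesUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("binary-files").getOrCreate()

    // Each element is (file path, PortableDataStream); toArray() materializes the file's bytes.
    val sizes = spark.sparkContext.binaryFiles("/tmp/binary-input", minPartitions = 2)
      .map { case (path, stream) => path -> stream.toArray().length }
      .collect()

    sizes.foreach { case (path, numBytes) => println(s"$path: $numBytes bytes") }
    spark.stop()
  }
}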
Example 189
Source File: SerializableWritable.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {
  def value = t

  override def toString = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration())
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
}
Example 190
Source File: SparkHadoopMapReduceUtil.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.mapreduce

import java.lang.{Boolean => JBoolean, Integer => JInteger}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

private[spark] trait SparkHadoopMapReduceUtil {
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.JobContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.JobContext") // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.TaskAttemptContext") // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }

  def newTaskAttemptID(
      jtIdentifier: String,
      jobId: Int,
      isMap: Boolean,
      taskId: Int,
      attemptId: Int) = {
    val klass = Class.forName("org.apache.hadoop.mapreduce.TaskAttemptID")
    try {
      // First, attempt to use the old-style constructor that takes a boolean isMap
      // (not available in YARN)
      val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], classOf[Boolean],
        classOf[Int], classOf[Int])
      ctor.newInstance(jtIdentifier, new JInteger(jobId), new JBoolean(isMap), new JInteger(taskId),
        new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
    } catch {
      case exc: NoSuchMethodException => {
        // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
        val taskTypeClass = Class.forName("org.apache.hadoop.mapreduce.TaskType")
          .asInstanceOf[Class[Enum[_]]]
        val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
          taskTypeClass, if (isMap) "MAP" else "REDUCE")
        val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
          classOf[Int], classOf[Int])
        ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
          new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
      }
    }
  }

  private def firstAvailableClass(first: String, second: String): Class[_] = {
    try {
      Class.forName(first)
    } catch {
      case e: ClassNotFoundException =>
        Class.forName(second)
    }
  }
}
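On Hadoop 2+ the reflection above resolves to the task.* implementation classes and the TaskType-based TaskAttemptID constructor. A direct, non-reflective sketch of the same construction, assuming a Hadoop 2.x or later dependency on the classpath (the job identifier is made up for illustration):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskType}
import org.apache.hadoop.mapreduce.task.{JobContextImpl, TaskAttemptContextImpl}

object MapReduceContextSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()

    // Equivalent of newJobContext on Hadoop 2+: construct JobContextImpl directly.
    val jobId = new JobID("local-job", 0)
    val jobContext = new JobContextImpl(conf, jobId)

    // Equivalent of newTaskAttemptID / newTaskAttemptContext with the TaskType constructor.
    val attemptId = new TaskAttemptID("local-job", 0, TaskType.MAP, 0, 0)
    val taskContext = new TaskAttemptContextImpl(conf, attemptId)

    println(jobContext.getJobID)
    println(taskContext.getTaskAttemptID)
  }
}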
Example 191
Source File: BinaryFileRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.hadoop.conf.{ Configurable, Configuration }
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._

import org.apache.spark.input.StreamFileInputFormat
import org.apache.spark.{ Partition, SparkContext }

private[spark] class BinaryFileRDD[T](
    sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    @transient conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = newJobContext(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 192
Source File: CarbonHiveMetastoreListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.hive

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.metastore.MetaStorePreEventListener
import org.apache.hadoop.hive.metastore.api.{FieldSchema, MetaException}
import org.apache.hadoop.hive.metastore.events._
import org.apache.hadoop.hive.metastore.events.PreEventContext.PreEventType._
import org.apache.spark.sql.types.{DataType, StructField, StructType}

class CarbonHiveMetastoreListener(conf: Configuration) extends MetaStorePreEventListener(conf) {

  override def onEvent(preEventContext: PreEventContext): Unit = {
    preEventContext.getEventType match {
      case CREATE_TABLE =>
        val table = preEventContext.asInstanceOf[PreCreateTableEvent].getTable
        val tableProps = table.getParameters
        if (tableProps != null &&
            (tableProps.get("spark.sql.sources.provider") == "org.apache.spark.sql.CarbonSource"
              || tableProps.get("spark.sql.sources.provider").equalsIgnoreCase("carbondata"))) {
          val numSchemaParts = tableProps.get("spark.sql.sources.schema.numParts")
          if (numSchemaParts != null && !numSchemaParts.isEmpty) {
            val parts = (0 until numSchemaParts.toInt).map { index =>
              val part = tableProps.get(s"spark.sql.sources.schema.part.${index}")
              if (part == null) {
                throw new MetaException(s"spark.sql.sources.schema.part.${index} is missing!")
              }
              part
            }
            // Stick all parts back to a single schema string.
            val schema = DataType.fromJson(parts.mkString).asInstanceOf[StructType]
            val hiveSchema = schema.map(toHiveColumn).asJava
            table.getSd.setCols(hiveSchema)
            table.getSd.setInputFormat("org.apache.carbondata.hive.MapredCarbonInputFormat")
            table.getSd.setOutputFormat("org.apache.carbondata.hive.MapredCarbonOutputFormat")
            val serdeInfo = table.getSd.getSerdeInfo
            serdeInfo.setSerializationLib("org.apache.carbondata.hive.CarbonHiveSerDe")
            val tablePath = serdeInfo.getParameters.get("tablePath")
            if (tablePath != null) {
              table.getSd.setLocation(tablePath)
            }
          }
        }
      case ALTER_TABLE =>
        val table = preEventContext.asInstanceOf[PreAlterTableEvent].getNewTable
        val tableProps = table.getParameters
        if (tableProps != null &&
            (tableProps.get("spark.sql.sources.provider") == "org.apache.spark.sql.CarbonSource"
              || tableProps.get("spark.sql.sources.provider").equalsIgnoreCase("carbondata"))) {
          val numSchemaParts = tableProps.get("spark.sql.sources.schema.numParts")
          if (numSchemaParts != null && !numSchemaParts.isEmpty) {
            val schemaParts = (0 until numSchemaParts.toInt).map { index =>
              val schemaPart = tableProps.get(s"spark.sql.sources.schema.part.$index")
              if (schemaPart == null) {
                throw new MetaException(s"spark.sql.sources.schema.part.$index is missing!")
              }
              schemaPart
            }
            // Stick all schemaParts back to a single schema string.
            val schema = DataType.fromJson(schemaParts.mkString).asInstanceOf[StructType]
            val hiveSchema = schema.map(toHiveColumn).asJava
            table.getSd.setCols(hiveSchema)
          }
        }
      case _ =>
        // do nothing
    }
  }

  private def toHiveColumn(c: StructField): FieldSchema = {
    val typeString = if (c.metadata.contains("HIVE_TYPE_STRING")) {
      c.metadata.getString("HIVE_TYPE_STRING")
    } else {
      c.dataType.catalogString
    }
    new FieldSchema(c.name, typeString, c.getComment().orNull)
  }
}
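To take effect, a MetaStorePreEventListener has to be registered with the metastore through the hive.metastore.pre.event.listeners property, which is usually set in hive-site.xml. A small hedged sketch of setting it programmatically on a Hadoop Configuration; the class name matches the listener above, everything else is illustrative:

import org.apache.hadoop.conf.Configuration

object RegisterMetastoreListener {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()

    // The metastore instantiates every listener listed here and calls onEvent before DDL events.
    conf.set("hive.metastore.pre.event.listeners",
      "org.apache.carbondata.hive.CarbonHiveMetastoreListener")

    println(conf.get("hive.metastore.pre.event.listeners"))
  }
}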
Example 193
Source File: CarbonLoadParams.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.management

import java.text.SimpleDateFormat
import java.util

import scala.collection.mutable

import org.apache.hadoop.conf.Configuration
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.command.UpdateTableModel
import org.apache.spark.sql.execution.datasources.LogicalRelation

import org.apache.carbondata.core.indexstore.PartitionSpec
import org.apache.carbondata.core.statusmanager.SegmentStatus
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.events.OperationContext
import org.apache.carbondata.processing.loading.model.CarbonLoadModel

case class CarbonLoadParams(
    sparkSession: SparkSession,
    tableName: String,
    sizeInBytes: Long,
    isOverwriteTable: Boolean,
    carbonLoadModel: CarbonLoadModel,
    hadoopConf: Configuration,
    logicalPartitionRelation: LogicalRelation,
    dateFormat : SimpleDateFormat,
    timeStampFormat : SimpleDateFormat,
    optionsOriginal: Map[String, String],
    finalPartition : Map[String, Option[String]],
    currPartitions: util.List[PartitionSpec],
    partitionStatus : SegmentStatus,
    var dataFrame: Option[DataFrame],
    scanResultRDD : Option[RDD[InternalRow]],
    updateModel: Option[UpdateTableModel],
    operationContext: OperationContext) {
}
Example 194
Source File: FileUtils.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import java.io.{File, IOException}

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkContext

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.datastore.filesystem.CarbonFile
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.DatabaseLocationProvider
import org.apache.carbondata.core.util.CarbonUtil
import org.apache.carbondata.events.{CreateDatabasePostExecutionEvent, OperationContext, OperationListenerBus}
import org.apache.carbondata.processing.exception.DataLoadingException

object FileUtils {

  def getPaths(inputPath: String): String = {
    getPaths(inputPath, FileFactory.getConfiguration)
  }

  def getPaths(inputPath: String, hadoopConf: Configuration): String = {
    if (inputPath == null || inputPath.isEmpty) {
      throw new DataLoadingException("Input file path cannot be empty.")
    } else {
      val stringBuild = new StringBuilder()
      val filePaths = inputPath.split(",").map(_.trim)
      for (i <- 0 until filePaths.size) {
        val filePath = CarbonUtil.checkAndAppendHDFSUrl(filePaths(i))
        val carbonFile = FileFactory.getCarbonFile(filePath, hadoopConf)
        if (!carbonFile.exists()) {
          throw new DataLoadingException(
            s"The input file does not exist: ${CarbonUtil.removeAKSK(filePaths(i))}")
        }
        getPathsFromCarbonFile(carbonFile, stringBuild, hadoopConf)
      }
      if (stringBuild.nonEmpty) {
        stringBuild.substring(0, stringBuild.size - 1)
      } else {
        throw new DataLoadingException("Please check your input path and make sure " +
          "that files end with '.csv' and content is not empty.")
      }
    }
  }

  def getSpaceOccupied(inputPath: String, hadoopConfiguration: Configuration): Long = {
    var size : Long = 0
    if (inputPath == null || inputPath.isEmpty) {
      size
    } else {
      val filePaths = inputPath.split(",")
      for (i <- 0 until filePaths.size) {
        val carbonFile = FileFactory.getCarbonFile(filePaths(i), hadoopConfiguration)
        size = size + carbonFile.getSize
      }
      size
    }
  }

  def createDatabaseDirectory(dbName: String, storePath: String, sparkContext: SparkContext) {
    val databasePath: String =
      storePath + File.separator + DatabaseLocationProvider.get().provide(dbName.toLowerCase)
    FileFactory.mkdirs(databasePath)
    val operationContext = new OperationContext
    val createDatabasePostExecutionEvent = new CreateDatabasePostExecutionEvent(dbName,
      databasePath, sparkContext)
    OperationListenerBus.getInstance.fireEvent(createDatabasePostExecutionEvent, operationContext)
  }
}
Example 195
Source File: TableLoader.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import java.util.Properties

import scala.collection.{immutable, mutable}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.command.management.CarbonLoadDataCommand

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.util.CarbonProperties

// scalastyle:off
object TableLoader {

  def extractOptions(propertiesFile: String): immutable.Map[String, String] = {
    val props = new Properties
    val path = new Path(propertiesFile)
    val fs = path.getFileSystem(FileFactory.getConfiguration)
    props.load(fs.open(path))
    val elments = props.entrySet().iterator()
    val map = new mutable.HashMap[String, String]()
    System.out.println("properties file:")
    while (elments.hasNext) {
      val elment = elments.next()
      System.out.println(s"${elment.getKey}=${elment.getValue}")
      map.put(elment.getKey.asInstanceOf[String], elment.getValue.asInstanceOf[String])
    }

    immutable.Map(map.toSeq: _*)
  }

  def extractStorePath(map: immutable.Map[String, String]): String = {
    map.get(CarbonCommonConstants.STORE_LOCATION) match {
      case Some(path) => path
      case None => throw new Exception(s"${CarbonCommonConstants.STORE_LOCATION} can't be empty")
    }
  }

  def loadTable(spark: SparkSession, dbName: Option[String], tableName: String, inputPaths: String,
      options: scala.collection.immutable.Map[String, String]): Unit = {
    CarbonLoadDataCommand(dbName, tableName, inputPaths, Nil, options, false).run(spark)
  }

  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      System.err.println("Usage: TableLoader <properties file> <table name> <input files>")
      System.exit(1)
    }
    System.out.println("parameter list:")
    args.foreach(System.out.println)

    val map = extractOptions(TableAPIUtil.escape(args(0)))
    val storePath = extractStorePath(map)
    System.out.println(s"${CarbonCommonConstants.STORE_LOCATION}:$storePath")

    val (dbName, tableName) = TableAPIUtil.parseSchemaName(TableAPIUtil.escape(args(1)))
    System.out.println(s"table name: $dbName.$tableName")

    val inputPaths = TableAPIUtil.escape(args(2))

    val spark = TableAPIUtil.spark(storePath, s"TableLoader: $dbName.$tableName")

    loadTable(spark, Option(dbName), tableName, inputPaths, map)
  }
}
Example 196
Source File: CarbonDeltaRowScanRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.rdd

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.conf.Configuration
import org.apache.spark.Partition
import org.apache.spark.sql.SparkSession

import org.apache.carbondata.converter.SparkDataTypeConverterImpl
import org.apache.carbondata.core.index.IndexFilter
import org.apache.carbondata.core.indexstore.PartitionSpec
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
import org.apache.carbondata.core.metadata.schema.table.{CarbonTable, TableInfo}
import org.apache.carbondata.core.mutate.CarbonUpdateUtil
import org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager
import org.apache.carbondata.core.util.DataTypeConverter
import org.apache.carbondata.hadoop.{CarbonMultiBlockSplit, CarbonProjection}
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat
import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport
import org.apache.carbondata.spark.InitInputMetrics

class CarbonDeltaRowScanRDD[T: ClassTag](
    @transient private val spark: SparkSession,
    @transient private val serializedTableInfo: Array[Byte],
    @transient private val tableInfo: TableInfo,
    @transient override val partitionNames: Seq[PartitionSpec],
    override val columnProjection: CarbonProjection,
    var filter: IndexFilter,
    identifier: AbsoluteTableIdentifier,
    inputMetricsStats: InitInputMetrics,
    override val dataTypeConverterClz: Class[_ <: DataTypeConverter] =
      classOf[SparkDataTypeConverterImpl],
    override val readSupportClz: Class[_ <: CarbonReadSupport[_]] =
      SparkReadSupport.readSupportClass,
    deltaVersionToRead: String)
  extends CarbonScanRDD[T](
    spark,
    columnProjection,
    filter,
    identifier,
    serializedTableInfo,
    tableInfo,
    inputMetricsStats,
    partitionNames,
    dataTypeConverterClz,
    readSupportClz) {

  override def internalGetPartitions: Array[Partition] = {
    val table = CarbonTable.buildFromTableInfo(getTableInfo)
    val updateStatusManager = new SegmentUpdateStatusManager(table, deltaVersionToRead)
    val parts = super.internalGetPartitions
    parts.map { p =>
      val partition = p.asInstanceOf[CarbonSparkPartition]
      val splits = partition.multiBlockSplit.getAllSplits.asScala.filter { s =>
        updateStatusManager.getDetailsForABlock(
          CarbonUpdateUtil.getSegmentBlockNameKey(s.getSegmentId, s.getBlockPath,
            table.isHivePartitionTable)) != null
      }.asJava
      new CarbonSparkPartition(partition.rddId, partition.index,
        new CarbonMultiBlockSplit(splits, partition.multiBlockSplit.getLocations))
    }.filter(p => p.multiBlockSplit.getAllSplits.size() > 0).zipWithIndex.map { case (p, index) =>
      new CarbonSparkPartition(p.rddId, index, p.multiBlockSplit)
    }.asInstanceOf[Array[Partition]]
  }

  override def createInputFormat(conf: Configuration): CarbonTableInputFormat[Object] = {
    val format = super.createInputFormat(conf)
    conf.set("updateDeltaVersion", deltaVersionToRead)
    conf.set("readDeltaOnly", "true")
    format
  }
}
Example 197
Source File: CarbonRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.rdd

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.conf.Configuration
import org.apache.spark.{Dependency, OneToOneDependency, Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.util.SparkSQLUtil

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.metadata.schema.table.TableInfo
import org.apache.carbondata.core.util._

abstract class CarbonRDDWithTableInfo[T: ClassTag](
    @transient private val ss: SparkSession,
    @transient private var deps: Seq[Dependency[_]],
    serializedTableInfo: Array[Byte]) extends CarbonRDD[T](ss, deps) {

  def this(@transient sparkSession: SparkSession, @transient oneParent: RDD[_],
      serializedTableInfo: Array[Byte]) = {
    this (sparkSession, List(new OneToOneDependency(oneParent)), serializedTableInfo)
  }

  def getTableInfo: TableInfo = TableInfo.deserialize(serializedTableInfo)
}
Example 198
Source File: TestDataLoadWithFileName.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.dataload

import scala.collection.JavaConverters._

import java.io.{File, FilenameFilter}

import org.apache.hadoop.conf.Configuration

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.reader.CarbonIndexFileReader
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.core.util.path.CarbonTablePath
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll

import org.apache.carbondata.core.index.Segment
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.{CarbonMetadata, SegmentFileStore}

class TestDataLoadWithFileName extends QueryTest with BeforeAndAfterAll {
  var originVersion = ""

  override def beforeAll() {
    originVersion =
      CarbonProperties.getInstance.getProperty(CarbonCommonConstants.CARBON_DATA_FILE_VERSION)
  }

  test("Check the file_name in carbonindex with v3 format") {
    CarbonProperties.getInstance.addProperty(CarbonCommonConstants.CARBON_DATA_FILE_VERSION, "3")
    sql("DROP TABLE IF EXISTS test_table_v3")
    sql(
      """
        | CREATE TABLE test_table_v3(id int, name string, city string, age int)
        | STORED AS carbondata
      """.stripMargin)
    val testData = s"$resourcesPath/sample.csv"
    sql(s"LOAD DATA LOCAL INPATH '$testData' into table test_table_v3")
    val indexReader = new CarbonIndexFileReader()
    val carbonTable = CarbonMetadata.getInstance().getCarbonTable("default", "test_table_v3")
    val segmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, "0")

    val carbonIndexPaths = if (FileFactory.isFileExist(segmentDir)) {
      new File(segmentDir)
        .listFiles(new FilenameFilter {
          override def accept(dir: File, name: String): Boolean = {
            name.endsWith(CarbonTablePath.getCarbonIndexExtension)
          }
        })
    } else {
      val segment = Segment.getSegment("0", carbonTable.getTablePath)
      val store = new SegmentFileStore(carbonTable.getTablePath, segment.getSegmentFileName)
      store.readIndexFiles(new Configuration(false))
      store.getIndexCarbonFiles.asScala.map(f => new File(f.getAbsolutePath)).toArray
    }
    for (carbonIndexPath <- carbonIndexPaths) {
      indexReader.openThriftReader(carbonIndexPath.getCanonicalPath)
      assert(indexReader.readIndexHeader().getVersion === 3)
      while (indexReader.hasNext) {
        val readBlockIndexInfo = indexReader.readBlockIndexInfo()
        assert(readBlockIndexInfo.getFile_name.startsWith(CarbonTablePath.getCarbonDataPrefix))
        assert(readBlockIndexInfo.getFile_name.endsWith(CarbonTablePath.getCarbonDataExtension))
      }
    }
  }

  override protected def afterAll() {
    sql("DROP TABLE IF EXISTS test_table_v1")
    sql("DROP TABLE IF EXISTS test_table_v2")
    sql("DROP TABLE IF EXISTS test_table_v3")
    CarbonProperties.getInstance.addProperty(CarbonCommonConstants.CARBON_DATA_FILE_VERSION,
      originVersion)
  }
}
Example 199
Source File: HadoopFileExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.{SaveMode, SparkSession}

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
import org.apache.carbondata.hadoop.CarbonProjection

// scalastyle:off println
object HadoopFileExample {

  def main(args: Array[String]): Unit = {
    val spark = ExampleUtils.createSparkSession("HadoopFileExample")
    val rootPath = new File(this.getClass.getResource("/").getPath
                            + "../../../..").getCanonicalPath
    val storeLocation: String = rootPath + "/examples/spark/target/store/default"
    exampleBody(spark, storeLocation)
    spark.close()
  }

  def exampleBody(spark : SparkSession, storeLocation : String): Unit = {
    import spark.implicits._
    val df = spark.sparkContext.parallelize(1 to 1000)
      .map(x => ("a", "b", x))
      .toDF("c1", "c2", "c3")

    df.write.format("carbondata")
      .option("tableName", "Hadoopfile_table")
      .option("compress", "true")
      .mode(SaveMode.Overwrite).save()

    // read two columns
    val projection = new CarbonProjection
    projection.addColumn("c1")  // column c1
    projection.addColumn("c3")  // column c3

    val conf = new Configuration()
    CarbonInputFormat.setColumnProjection(conf, projection)
    CarbonInputFormat.setDatabaseName(conf, "default")
    CarbonInputFormat.setTableName(conf, "Hadoopfile_table")

    val input = spark.sparkContext.newAPIHadoopFile(s"${storeLocation}/Hadoopfile_table",
      classOf[CarbonTableInputFormat[Array[Object]]],
      classOf[Void],
      classOf[Array[Object]],
      conf)
    val result = input.map(x => x._2.toList).collect
    result.foreach(x => println(x.mkString(", ")))

    // delete carbondata file
    ExampleUtils.cleanSampleCarbonFile(spark, "Hadoopfile_table")
  }
}
// scalastyle:on println
Example 200
Source File: FlinkExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples

import org.apache.flink.api.java.ExecutionEnvironment
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.Job

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.carbondata.hadoop.CarbonProjection
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}

// Write carbondata file by spark and read it by flink
// scalastyle:off println
object FlinkExample {

  def main(args: Array[String]): Unit = {
    // write carbondata file by spark
    val cc = ExampleUtils.createCarbonSession("FlinkExample")
    val path = ExampleUtils.writeSampleCarbonFile(cc, "carbon1")

    // read two columns by flink
    val projection = new CarbonProjection
    projection.addColumn("c1")  // column c1
    projection.addColumn("c3")  // column c3
    val conf = new Configuration()
    CarbonInputFormat.setColumnProjection(conf, projection)

    val env = ExecutionEnvironment.getExecutionEnvironment
    val ds = env.readHadoopFile(
      new CarbonTableInputFormat[Array[Object]],
      classOf[Void],
      classOf[Array[Object]],
      path,
      new Job(conf)
    )

    // print result
    val result = ds.collect()
    for (i <- 0 until result.size()) {
      println(result.get(i).f1.mkString(","))
    }

    // delete carbondata file
    ExampleUtils.cleanSampleCarbonFile(cc, "carbon1")
  }
}
// scalastyle:on println