org.apache.hadoop.conf.Configuration Scala Examples
The following examples show how to use org.apache.hadoop.conf.Configuration in Scala. Each snippet is taken from an open-source project; the originating project and license are noted above each example.
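Before the project examples, here is a minimal, self-contained sketch of the core Configuration API that all of the snippets below build on: creating a Configuration, loading a site file, and reading typed properties. The file path and property values are placeholders rather than settings taken from any of the projects.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object ConfigurationBasics {
  def main(args: Array[String]): Unit = {
    // Loads core-default.xml/core-site.xml from the classpath; new Configuration(false) would skip that.
    val conf = new Configuration()

    // Optionally add an explicit site file (placeholder path).
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"))

    // Set and read properties; the second argument of each getter is the default value.
    conf.set("fs.defaultFS", "hdfs://localhost:9000")
    val fsUri = conf.get("fs.defaultFS", "file:///")
    val compress = conf.getBoolean("mapreduce.output.fileoutputformat.compress", false)

    // Most Hadoop clients are constructed from a Configuration, e.g. a FileSystem handle.
    val fs = FileSystem.get(conf)
    println(s"fs.defaultFS = $fsUri, compress = $compress, scheme = ${fs.getUri.getScheme}")
  }
}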
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
Example 2
Source File: DirectOutputCommitter.scala From spark-snowflake with Apache License 2.0

package net.snowflake.spark.snowflake

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred._
import org.apache.hadoop.mapreduce.lib.output.{
  FileOutputCommitter,
  FileOutputFormat
}

class DirectOutputCommitter extends OutputCommitter {
  override def setupJob(jobContext: JobContext): Unit = {}

  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = {
    // We return true here to guard against implementations that do not handle false correctly.
    // The meaning of returning false is not entirely clear, so it's possible to be interpreted
    // as an error. Returning true just means that commitTask() will be called, which is a no-op.
    true
  }

  override def commitTask(taskContext: TaskAttemptContext): Unit = {}

  override def abortTask(taskContext: TaskAttemptContext): Unit = {}

  private def shouldCreateSuccessFile(conf: Configuration): Boolean = {
    conf.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)
  }
}
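How a committer like this gets wired in is not shown in the snippet above. The sketch below is one plausible way to do it for the old mapred API, by pointing the Hadoop property mapred.output.committer.class at the class; treat both the property name and the Spark wiring as assumptions to verify against the connector's documentation.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("direct-committer-demo").getOrCreate()

// Assumption: jobs using the old mapred API read the committer class from this property.
spark.sparkContext.hadoopConfiguration.set(
  "mapred.output.committer.class",
  "net.snowflake.spark.snowflake.DirectOutputCommitter")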
Example 3
Source File: OrcFileOperator.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.hive.orc

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader}
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.types.StructType

private[orc] object OrcFileOperator extends Logging {

  def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = {
    def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = {
      reader.getObjectInspector match {
        case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 =>
          logInfo(
            s"ORC file $path has empty schema, it probably contains no rows. " +
              "Trying to read another ORC file to figure out the schema.")
          false
        case _ => true
      }
    }

    val conf = config.getOrElse(new Configuration)
    val fs = {
      val hdfsPath = new Path(basePath)
      hdfsPath.getFileSystem(conf)
    }

    listOrcFiles(basePath, conf).iterator.map { path =>
      path -> OrcFile.createReader(fs, path)
    }.collectFirst {
      case (path, reader) if isWithNonEmptySchema(path, reader) => reader
    }
  }

  def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = {
    // Take the first file where we can open a valid reader if we can find one. Otherwise just
    // return None to indicate we can't infer the schema.
    paths.flatMap(getFileReader(_, conf)).headOption.map { reader =>
      val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
      val schema = readerInspector.getTypeName
      logDebug(s"Reading schema from file $paths, got Hive schema string: $schema")
      CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType]
    }
  }

  def getObjectInspector(
      path: String,
      conf: Option[Configuration]): Option[StructObjectInspector] = {
    getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector])
  }

  def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = {
    // TODO: Check if the paths coming in are already qualified and simplify.
    val origPath = new Path(pathStr)
    val fs = origPath.getFileSystem(conf)
    val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath)
      .filterNot(_.isDirectory)
      .map(_.getPath)
      .filterNot(_.getName.startsWith("_"))
      .filterNot(_.getName.startsWith("."))
    paths
  }
}
Example 4
Source File: HDFSCredentialProvider.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).map { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val t = creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .head val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 5
Source File: HBase.scala From AI with Apache License 2.0

package com.bigchange.hbase

import com.bigchange.util.HBaseUtil._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Result, _}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.spark.SparkContext

  def existRowKey(row: String, table: Table): Boolean = {
    val get = new Get(row.getBytes())
    val result = table.get(get)
    if (result.isEmpty) {
      warn("hbase table don't have this data,execute insert")
      return false
    }
    true
  }

  def getConfiguration = if (hBaseConfiguration == null) {
    warn("hbase setDefaultConfiguration....")
    setDefaultConfiguration
  } else hBaseConfiguration

  def setDefaultConfiguration = {
    hBaseConfiguration = HBaseConfiguration.create
    // Options that must be set for local testing; on a cluster they are picked up
    // automatically from the corresponding configuration files.
    hBaseConfiguration.set("fs.defaultFS", "hdfs://ns1"); // nameservice path
    hBaseConfiguration.set("dfs.nameservices", "ns1");
    hBaseConfiguration.set("dfs.ha.namenodes.ns1", "nn1,nn2"); // namenode names
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn1", "server3:9000"); // namenode RPC address
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn2", "server4:9000"); // namenode RPC address
    // Implementation class for automatic namenode failover
    hBaseConfiguration.set("dfs.client.failover.proxy.provider.ns1",
      "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider")
    hBaseConfiguration.set("hbase.rootdir", "hdfs://ns1/hbase")
    hBaseConfiguration.set("hbase.zookeeper.quorum", "server0,server1,server2")
    hBaseConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
    hBaseConfiguration
  }
}
Example 6
Source File: AvroParquetSourceTest.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet import java.nio.file.Paths import io.eels.component.parquet.avro.AvroParquetSource import io.eels.component.parquet.util.ParquetLogMute import io.eels.schema._ import org.apache.avro.SchemaBuilder import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.avro.AvroParquetWriter import org.scalatest.{Matchers, WordSpec} class AvroParquetSourceTest extends WordSpec with Matchers { ParquetLogMute() private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(conf) private val personFile = Paths.get(getClass.getResource("/io/eels/component/parquet/person.avro.pq").toURI) private val resourcesDir = personFile.getParent "AvroParquetSource" should { "read schema" in { val people = AvroParquetSource(personFile) people.schema shouldBe StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) } "read parquet files" in { val people = AvroParquetSource(personFile.toAbsolutePath()).toDataStream().toSet.map(_.values) people shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) } "read multiple parquet files using file expansion" in { import io.eels.FilePattern._ val people = AvroParquetSource(s"${resourcesDir.toUri.toString}/*.pq").toDataStream().toSet.map(_.values) people shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner"), Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) } // todo add merge to parquet source "merge schemas" ignore { try { fs.delete(new Path("merge1.pq"), false) } catch { case t: Throwable => } try { fs.delete(new Path("merge2.pq"), false) } catch { case t: Throwable => } val schema1 = SchemaBuilder.builder().record("schema1").fields().requiredString("a").requiredDouble("b").endRecord() val schema2 = SchemaBuilder.builder().record("schema2").fields().requiredInt("a").requiredBoolean("c").endRecord() val writer1 = AvroParquetWriter.builder[GenericRecord](new Path("merge1.pq")).withSchema(schema1).build() val record1 = new GenericData.Record(schema1) record1.put("a", "aaaaa") record1.put("b", 124.3) writer1.write(record1) writer1.close() val writer2 = AvroParquetWriter.builder[GenericRecord](new Path("merge2.pq")).withSchema(schema2).build() val record2 = new GenericData.Record(schema2) record2.put("a", 111) record2.put("c", true) writer2.write(record2) writer2.close() ParquetSource(new Path("merge*")).schema shouldBe StructType( Field("a", StringType, nullable = false), Field("b", DoubleType, nullable = false), Field("c", BooleanType, nullable = false) ) fs.delete(new Path(".merge1.pq.crc"), false) fs.delete(new Path(".merge2.pq.crc"), false) fs.delete(new Path("merge1.pq"), false) fs.delete(new Path("merge2.pq"), false) } } }
Example 7
Source File: CompressionCodecs.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.catalyst.util

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.SequenceFile.CompressionType
import org.apache.hadoop.io.compress._

import org.apache.spark.util.Utils

object CompressionCodecs {
  private val shortCompressionCodecNames = Map(
    "none" -> null,
    "uncompressed" -> null,
    "bzip2" -> classOf[BZip2Codec].getName,
    "deflate" -> classOf[DeflateCodec].getName,
    "gzip" -> classOf[GzipCodec].getName,
    "lz4" -> classOf[Lz4Codec].getName,
    "snappy" -> classOf[SnappyCodec].getName)

  def setCodecConfiguration(conf: Configuration, codec: String): Unit = {
    if (codec != null) {
      conf.set("mapreduce.output.fileoutputformat.compress", "true")
      conf.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.toString)
      conf.set("mapreduce.output.fileoutputformat.compress.codec", codec)
      conf.set("mapreduce.map.output.compress", "true")
      conf.set("mapreduce.map.output.compress.codec", codec)
    } else {
      // This infers the option `compression` is set to `uncompressed` or `none`.
      conf.set("mapreduce.output.fileoutputformat.compress", "false")
      conf.set("mapreduce.map.output.compress", "false")
    }
  }
}
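A small usage sketch, assuming the caller already holds a fully qualified codec class name (the map above resolves short names such as "gzip" to exactly that): the helper simply flips the standard MapReduce compression properties on the given Configuration.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.GzipCodec

val hadoopConf = new Configuration()
CompressionCodecs.setCodecConfiguration(hadoopConf, classOf[GzipCodec].getName)

// The job that picks up hadoopConf will now write block-compressed gzip output.
assert(hadoopConf.get("mapreduce.output.fileoutputformat.compress") == "true")
assert(hadoopConf.get("mapreduce.output.fileoutputformat.compress.codec") == classOf[GzipCodec].getName)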
Example 8
Source File: HadoopFileLinesReader.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 9
Source File: HBaseCredentialProvider.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.deploy.yarn.security import scala.reflect.runtime.universe import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging private[security] class HBaseCredentialProvider extends ServiceCredentialProvider with Logging { override def serviceName: String = "hbase" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val obtainToken = mirror.classLoader. loadClass("org.apache.hadoop.hbase.security.token.TokenUtil"). getMethod("obtainToken", classOf[Configuration]) logDebug("Attempting to fetch HBase security token.") val token = obtainToken.invoke(null, hbaseConf(hadoopConf)) .asInstanceOf[Token[_ <: TokenIdentifier]] logInfo(s"Get token from HBase: ${token.toString}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => logDebug(s"Failed to get token from service $serviceName", e) } None } override def credentialsRequired(hadoopConf: Configuration): Boolean = { hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos" } private def hbaseConf(conf: Configuration): Configuration = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val confCreate = mirror.classLoader. loadClass("org.apache.hadoop.hbase.HBaseConfiguration"). getMethod("create", classOf[Configuration]) confCreate.invoke(null, conf).asInstanceOf[Configuration] } catch { case NonFatal(e) => logDebug("Fail to invoke HBaseConfiguration", e) conf } } }
Example 10
Source File: YarnRMClient.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.deploy.yarn

import java.util.{List => JList}

import scala.collection.JavaConverters._
import scala.util.Try

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.yarn.api.records._
import org.apache.hadoop.yarn.client.api.AMRMClient
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.hadoop.yarn.webapp.util.WebAppUtils

import org.apache.spark.{SecurityManager, SparkConf}
import org.apache.spark.deploy.yarn.config._
import org.apache.spark.internal.Logging
import org.apache.spark.rpc.RpcEndpointRef
import org.apache.spark.util.Utils

  def getMaxRegAttempts(sparkConf: SparkConf, yarnConf: YarnConfiguration): Int = {
    val sparkMaxAttempts = sparkConf.get(MAX_APP_ATTEMPTS).map(_.toInt)
    val yarnMaxAttempts = yarnConf.getInt(
      YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)
    val retval: Int = sparkMaxAttempts match {
      case Some(x) => if (x <= yarnMaxAttempts) x else yarnMaxAttempts
      case None => yarnMaxAttempts
    }

    retval
  }
}
Example 11
Source File: HDFSCredentialProviderSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.deploy.yarn.security import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, PrivateMethodTester} import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} class HDFSCredentialProviderSuite extends SparkFunSuite with PrivateMethodTester with Matchers { private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer) private def getTokenRenewer( hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = { hdfsCredentialProvider invokePrivate _getTokenRenewer(conf) } private var hdfsCredentialProvider: HDFSCredentialProvider = null override def beforeAll() { super.beforeAll() if (hdfsCredentialProvider == null) { hdfsCredentialProvider = new HDFSCredentialProvider() } } override def afterAll() { if (hdfsCredentialProvider != null) { hdfsCredentialProvider = null } super.afterAll() } test("check token renewer") { val hadoopConf = new Configuration() hadoopConf.set("yarn.resourcemanager.address", "myrm:8033") hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:[email protected]") val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf) renewer should be ("yarn/myrm:[email protected]") } test("check token renewer default") { val hadoopConf = new Configuration() val caught = intercept[SparkException] { getTokenRenewer(hdfsCredentialProvider, hadoopConf) } assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer") } }
Example 12
Source File: FileBasedWriteAheadLogRandomReader.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.streaming.util

import java.io.Closeable
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration

private[streaming] class FileBasedWriteAheadLogRandomReader(path: String, conf: Configuration)
  extends Closeable {

  private val instream = HdfsUtils.getInputStream(path, conf)
  private var closed = (instream == null) // the file may be deleted as we're opening the stream

  def read(segment: FileBasedWriteAheadLogSegment): ByteBuffer = synchronized {
    assertOpen()
    instream.seek(segment.offset)
    val nextLength = instream.readInt()
    HdfsUtils.checkState(nextLength == segment.length,
      s"Expected message length to be ${segment.length}, but was $nextLength")
    val buffer = new Array[Byte](nextLength)
    instream.readFully(buffer)
    ByteBuffer.wrap(buffer)
  }

  override def close(): Unit = synchronized {
    closed = true
    instream.close()
  }

  private def assertOpen() {
    HdfsUtils.checkState(!closed, "Stream is closed. Create a new Reader to read from the file.")
  }
}
Example 13
Source File: HdfsUtils.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.streaming.util

import java.io.{FileNotFoundException, IOException}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._

private[streaming] object HdfsUtils {

  def getOutputStream(path: String, conf: Configuration): FSDataOutputStream = {
    val dfsPath = new Path(path)
    val dfs = getFileSystemForPath(dfsPath, conf)
    // If the file exists and we have append support, append instead of creating a new file
    val stream: FSDataOutputStream = {
      if (dfs.isFile(dfsPath)) {
        if (conf.getBoolean("hdfs.append.support", false) || dfs.isInstanceOf[RawLocalFileSystem]) {
          dfs.append(dfsPath)
        } else {
          throw new IllegalStateException("File exists and there is no append support!")
        }
      } else {
        dfs.create(dfsPath)
      }
    }
    stream
  }

  def getInputStream(path: String, conf: Configuration): FSDataInputStream = {
    val dfsPath = new Path(path)
    val dfs = getFileSystemForPath(dfsPath, conf)
    try {
      dfs.open(dfsPath)
    } catch {
      case _: FileNotFoundException =>
        null
      case e: IOException =>
        // If we are really unlucky, the file may be deleted as we're opening the stream.
        // This can happen as clean up is performed by daemon threads that may be left over from
        // previous runs.
        if (!dfs.isFile(dfsPath)) null else throw e
    }
  }

  def checkState(state: Boolean, errorMsg: => String) {
    if (!state) {
      throw new IllegalStateException(errorMsg)
    }
  }

  def checkFileExists(path: String, conf: Configuration): Boolean = {
    val hdpPath = new Path(path)
    val fs = getFileSystemForPath(hdpPath, conf)
    fs.isFile(hdpPath)
  }
}
Example 14
Source File: FileBasedWriteAheadLogWriter.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.streaming.util

import java.io._
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration

import org.apache.spark.util.Utils

  def write(data: ByteBuffer): FileBasedWriteAheadLogSegment = synchronized {
    assertOpen()
    data.rewind() // Rewind to ensure all data in the buffer is retrieved
    val lengthToWrite = data.remaining()
    val segment = new FileBasedWriteAheadLogSegment(path, nextOffset, lengthToWrite)
    stream.writeInt(lengthToWrite)
    Utils.writeByteBuffer(data, stream: OutputStream)
    flush()
    nextOffset = stream.getPos()
    segment
  }

  override def close(): Unit = synchronized {
    closed = true
    stream.close()
  }

  private def flush() {
    stream.hflush()
    // Useful for local file system where hflush/sync does not work (HADOOP-7844)
    stream.getWrappedStream.flush()
  }

  private def assertOpen() {
    HdfsUtils.checkState(!closed, "Stream is closed. Create a new Writer to write to file.")
  }
}
Example 15
Source File: FileBasedWriteAheadLogReader.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.streaming.util import java.io.{Closeable, EOFException, IOException} import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.internal.Logging private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration) extends Iterator[ByteBuffer] with Closeable with Logging { private val instream = HdfsUtils.getInputStream(path, conf) private var closed = (instream == null) // the file may be deleted as we're opening the stream private var nextItem: Option[ByteBuffer] = None override def hasNext: Boolean = synchronized { if (closed) { return false } if (nextItem.isDefined) { // handle the case where hasNext is called without calling next true } else { try { val length = instream.readInt() val buffer = new Array[Byte](length) instream.readFully(buffer) nextItem = Some(ByteBuffer.wrap(buffer)) logTrace("Read next item " + nextItem.get) true } catch { case e: EOFException => logDebug("Error reading next item, EOF reached", e) close() false case e: IOException => logWarning("Error while trying to read data. If the file was deleted, " + "this should be okay.", e) close() if (HdfsUtils.checkFileExists(path, conf)) { // If file exists, this could be a legitimate error throw e } else { // File was deleted. This can occur when the daemon cleanup thread takes time to // delete the file during recovery. false } case e: Exception => logWarning("Error while trying to read data from HDFS.", e) close() throw e } } } override def next(): ByteBuffer = synchronized { val data = nextItem.getOrElse { close() throw new IllegalStateException( "next called without calling hasNext or after hasNext returned false") } nextItem = None // Ensure the next hasNext call loads new data. data } override def close(): Unit = synchronized { if (!closed) { instream.close() } closed = true } }
Example 16
Source File: SerializableWritable.scala From drizzle-spark with Apache License 2.0

package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {

  def value: T = t

  override def toString: String = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration(false))
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
}
Example 17
Source File: SerializableConfiguration.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration

private[spark] class SerializableConfiguration(@transient var value: Configuration)
  extends Serializable {

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }
}
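A sketch of the usual pattern around this wrapper: the driver wraps its Hadoop configuration, broadcasts it, and executors rebuild FileSystem handles from broadcastConf.value.value. Note that the class above is private[spark], so code outside Spark's own packages would need its own copy of such a wrapper; the example below is illustrative only.

import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.SparkSession
import org.apache.spark.util.SerializableConfiguration

val spark = SparkSession.builder().appName("conf-broadcast").getOrCreate()

// Wrap the driver-side Hadoop configuration once and broadcast it to the executors.
val wrapped = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
val broadcastConf = spark.sparkContext.broadcast(wrapped)

spark.sparkContext.parallelize(1 to 4).foreachPartition { _ =>
  // Recreate a FileSystem handle on the executor from the shipped configuration.
  val fs = FileSystem.get(broadcastConf.value.value)
  // ... read or write files with fs ...
}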
Example 18
Source File: PortableDataStream.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.input

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

import scala.collection.JavaConverters._

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit}

  def toArray(): Array[Byte] = {
    val stream = open()
    try {
      ByteStreams.toByteArray(stream)
    } finally {
      Closeables.close(stream, true)
    }
  }

  def getPath(): String = path
}
Example 19
Source File: WholeTextFileRDD.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 20
Source File: BinaryFileRDD.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.StreamFileInputFormat

private[spark] class BinaryFileRDD[T](
    sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 21
Source File: SentenceTokenizer.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.dataset.text import java.io.FileInputStream import java.net.{URI, URL} import com.intel.analytics.bigdl.dataset.Transformer import scala.collection.Iterator import opennlp.tools.tokenize.{SimpleTokenizer, Tokenizer, TokenizerME, TokenizerModel} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} class SentenceTokenizer(tokenFile: Option[String] = None) extends Transformer[String, Array[String]] { var modelIn: FileInputStream = _ var model: TokenizerModel = _ var tokenizer: Tokenizer = _ def this(tokenFile: URL) { this(Some(tokenFile.getPath)) } def close(): Unit = { if (modelIn != null) { modelIn.close() } } override def apply(prev: Iterator[String]): Iterator[Array[String]] = prev.map(x => { if (tokenizer == null) { if (!tokenFile.isDefined) { tokenizer = SimpleTokenizer.INSTANCE } else { val src: Path = new Path(tokenFile.get) val fs = src.getFileSystem(new Configuration()) val in = fs.open(src) model = new TokenizerModel(in) tokenizer = new TokenizerME(model) } } val words = tokenizer.tokenize(x) words }) } object SentenceTokenizer { def apply(tokenFile: Option[String] = None): SentenceTokenizer = new SentenceTokenizer(tokenFile) def apply(tokenFile: URL): SentenceTokenizer = new SentenceTokenizer(tokenFile) }
Example 22
Source File: SentenceSplitter.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.dataset.text import java.io.FileInputStream import java.net.{URI, URL} import com.intel.analytics.bigdl.dataset.Transformer import opennlp.tools.sentdetect.{SentenceDetector, SentenceDetectorME, SentenceModel} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.collection.Iterator class SentenceSplitter(sentFile: Option[String] = None) extends Transformer[String, Array[String]] { var modelIn: FileInputStream = _ var model: SentenceModel = _ var sentenceDetector: SentenceDetector = _ def this(sentFileURL: URL) { this(Some(sentFileURL.getPath)) } def this(sentFile: String) { this(Some(sentFile)) } def close(): Unit = { if (modelIn != null) { modelIn.close() } } override def apply(prev: Iterator[String]): Iterator[Array[String]] = prev.map(x => { if (!sentFile.isDefined) { x.split('.') } else { if (sentenceDetector == null) { val src: Path = new Path(sentFile.get) val fs = src.getFileSystem(new Configuration()) val in = fs.open(src) model = new SentenceModel(in) sentenceDetector = new SentenceDetectorME(model) } sentenceDetector.sentDetect(x) } }) } object SentenceSplitter { def apply(sentFile: Option[String] = None): SentenceSplitter = new SentenceSplitter(sentFile) def apply(sentFileURL: URL): SentenceSplitter = new SentenceSplitter(sentFileURL) def apply(sentFile: String): SentenceSplitter = new SentenceSplitter(sentFile) }
Example 23
Source File: LocalSeqFileToBytes.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.dataset.image import com.intel.analytics.bigdl.dataset.DataSet.SeqFileFolder import com.intel.analytics.bigdl.dataset.{ByteRecord, LocalSeqFilePath, Transformer} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.SequenceFile.Reader import org.apache.hadoop.io.{SequenceFile, Text} import scala.collection.Iterator object LocalSeqFileToBytes { def apply(): LocalSeqFileToBytes = new LocalSeqFileToBytes() } class LocalSeqFileToBytes extends Transformer[LocalSeqFilePath, ByteRecord] { import org.apache.hadoop.fs.{Path => hPath} @transient private var key: Text = null @transient private var value: Text = null @transient private var reader: SequenceFile.Reader = null @transient private var oneRecordBuffer: ByteRecord = null override def apply(prev: Iterator[LocalSeqFilePath]): Iterator[ByteRecord] = { new Iterator[ByteRecord] { override def next(): ByteRecord = { if (oneRecordBuffer != null) { val res = oneRecordBuffer oneRecordBuffer = null return res } if (key == null) { key = new Text() } if (value == null) { value = new Text } if (reader == null || !reader.next(key, value)) { if (reader != null) { reader.close() } reader = new SequenceFile.Reader(new Configuration, Reader.file(new hPath(prev.next().path.toAbsolutePath.toString))) reader.next(key, value) } ByteRecord(value.copyBytes(), SeqFileFolder.readLabel(key).toFloat) } override def hasNext: Boolean = { if (oneRecordBuffer != null) { true } else if (reader == null) { prev.hasNext } else { if (reader.next(key, value)) { oneRecordBuffer = ByteRecord(value.copyBytes(), SeqFileFolder.readLabel(key).toFloat) return true } else { prev.hasNext } } } } } }
Example 24
Source File: BGRImgToLocalSeqFile.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.dataset.image import java.nio.ByteBuffer import java.nio.file.Path import com.intel.analytics.bigdl.dataset.Transformer import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path => hadoopPath} import org.apache.hadoop.io.{SequenceFile, Text} import scala.collection.Iterator object BGRImgToLocalSeqFile { def apply(blockSize: Int, baseFileName: Path, hasName: Boolean = false): BGRImgToLocalSeqFile = { new BGRImgToLocalSeqFile(blockSize, baseFileName, hasName) } } class BGRImgToLocalSeqFile(blockSize: Int, baseFileName: Path, hasName: Boolean = false) extends Transformer[(LabeledBGRImage, String), String] { private val conf: Configuration = new Configuration private var index = 0 private val preBuffer: ByteBuffer = ByteBuffer.allocate(4 * 2) override def apply(prev: Iterator[(LabeledBGRImage, String)]): Iterator[String] = { new Iterator[String] { override def hasNext: Boolean = prev.hasNext override def next(): String = { val fileName = baseFileName + s"_$index.seq" val path = new hadoopPath(fileName) val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(path), SequenceFile.Writer.keyClass(classOf[Text]), SequenceFile.Writer.valueClass(classOf[Text])) var i = 0 while (i < blockSize && prev.hasNext) { val (image, imageName) = prev.next() preBuffer.putInt(image.width()) preBuffer.putInt(image.height()) val imageByteData = image.convertToByte() val data: Array[Byte] = new Array[Byte](preBuffer.capacity + imageByteData.length) System.arraycopy(preBuffer.array, 0, data, 0, preBuffer.capacity) System.arraycopy(imageByteData, 0, data, preBuffer.capacity, imageByteData.length) preBuffer.clear val imageKey = if (hasName) s"${imageName}\n${image.label().toInt}" else s"${image.label().toInt}" writer.append(new Text(imageKey), new Text(data)) i += 1 } writer.close() index += 1 fileName } } } }
Example 25
Source File: COCOSeqFileGenerator.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.models.utils import com.intel.analytics.bigdl.dataset.segmentation.{COCODataset, COCOSerializeContext} import java.io.File import java.nio.file.{Files, Paths} import java.util.concurrent.atomic.AtomicInteger import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.SequenceFile.Writer import org.apache.hadoop.io.compress.BZip2Codec import org.apache.hadoop.io.{BytesWritable, SequenceFile} import scala.collection.parallel.ForkJoinTaskSupport import scopt.OptionParser object COCOSeqFileGenerator { case class COCOSeqFileGeneratorParams( folder: String = ".", metaPath: String = "instances_val2014.json", output: String = ".", parallel: Int = 1, blockSize: Int = 12800 ) private val parser = new OptionParser[COCOSeqFileGeneratorParams]("BigDL COCO " + "Sequence File Generator") { head("BigDL COCO Sequence File Generator") opt[String]('f', "folder") .text("where you put the COCO image files") .action((x, c) => c.copy(folder = x)) opt[String]('o', "output folder") .text("where you put the generated seq files") .action((x, c) => c.copy(output = x)) opt[Int]('p', "parallel") .text("parallel num") .action((x, c) => c.copy(parallel = x)) opt[Int]('b', "blockSize") .text("block size") .action((x, c) => c.copy(blockSize = x)) opt[String]('m', "metaPath") .text("metadata json file path") .action((x, c) => c.copy(metaPath = x)) } def main(args: Array[String]): Unit = { parser.parse(args, COCOSeqFileGeneratorParams()).foreach { param => println("Loading COCO metadata") val meta = COCODataset.load(param.metaPath, param.folder) println("Metadata loaded") val conf: Configuration = new Configuration val doneCount = new AtomicInteger(0) val tasks = meta.images.filter(img => { val path = img.path val valid = Files.exists(path) && !Files.isDirectory(path) if (!valid) { System.err.print(s"[Warning] The image file ${path.getFileName} does not exist.\n") } valid }).grouped(param.blockSize).zipWithIndex.toArray.par tasks.tasksupport = new ForkJoinTaskSupport( new scala.concurrent.forkjoin.ForkJoinPool(param.parallel)) tasks.foreach { case (imgs, blkId) => val outFile = new Path(param.output, s"coco-seq-$blkId.seq") val key = new BytesWritable val value = new BytesWritable val writer = SequenceFile.createWriter(conf, Writer.file(outFile), Writer.keyClass(key .getClass), Writer.valueClass(value.getClass), Writer.compression(SequenceFile .CompressionType.BLOCK, new BZip2Codec)) val context = new COCOSerializeContext imgs.foreach { img => context.clear() context.dump(img.fileName) img.dumpTo(context) context.dump(COCODataset.MAGIC_NUM) val keyBytes = context.toByteArray key.set(keyBytes, 0, keyBytes.length) val bytes = img.data value.set(bytes, 0, bytes.length) writer.append(key, value) val cnt = doneCount.incrementAndGet() if (cnt % 500 == 0) { System.err.print(s"\r$cnt / ${meta.images.length} = ${cnt.toFloat/meta.images.length}") } } writer.close() } System.err.print("\n") } } }
Example 26
Source File: FileReader.scala From BigDL with Apache License 2.0

package com.intel.analytics.bigdl.visualization.tensorboard

import java.io.{BufferedInputStream}
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.tensorflow.util.Event

import scala.collection.mutable.ArrayBuffer
import scala.util.matching.Regex

private[bigdl] object FileReader {
  val fileNameRegex = """bigdl.tfevents.*""".r

  def readScalar(file: Path, tag: String, fs: FileSystem): Array[(Long, Float, Double)] = {
    require(fs.isFile(file), s"FileReader: ${file} should be a file")
    val bis = new BufferedInputStream(fs.open(file))
    val longBuffer = new Array[Byte](8)
    val crcBuffer = new Array[Byte](4)
    val bf = new ArrayBuffer[(Long, Float, Double)]
    while (bis.read(longBuffer) > 0) {
      val l = ByteBuffer.wrap(longBuffer.reverse).getLong()
      bis.read(crcBuffer)
      // TODO: checksum
      // val crc1 = ByteBuffer.wrap(crcBuffer.reverse).getInt()
      val eventBuffer = new Array[Byte](l.toInt)
      bis.read(eventBuffer)
      val e = Event.parseFrom(eventBuffer)
      if (e.getSummary.getValueCount == 1 &&
        tag.equals(e.getSummary.getValue(0).getTag())) {
        bf.append((e.getStep, e.getSummary.getValue(0).getSimpleValue, e.getWallTime))
      }
      bis.read(crcBuffer)
      // val crc2 = ByteBuffer.wrap(crcBuffer.reverse).getInt()
    }
    bis.close()
    bf.toArray.sortWith(_._1 < _._1)
  }
}
Example 27
Source File: 2-CommonFunctions.scala From Azure-Databricks-NYC-Taxi-Workshop with MIT License
// Databricks notebook source import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.hadoop.conf.Configuration // COMMAND ---------- val prqShrinkageFactor = 0.19 //We found a saving in space of 81% with Parquet // COMMAND ---------- def analyzeTables(databaseAndTable: String) { println("Table: " + databaseAndTable) println("....refresh table") sql("REFRESH TABLE " + databaseAndTable) println("....analyze table") sql("ANALYZE TABLE " + databaseAndTable + " COMPUTE STATISTICS") println("....done") } // COMMAND ---------- def calcOutputFileCountTxtToPrq(srcDataFile: String, targetedFileSizeMB: Int): Int = { val fs = FileSystem.get(new Configuration()) val estFileCount: Int = Math.floor((fs.getContentSummary(new Path(srcDataFile)).getLength * prqShrinkageFactor) / (targetedFileSizeMB * 1024 * 1024)).toInt if(estFileCount == 0) 1 else estFileCount } // COMMAND ---------- // Get recursive file collection you can iterate on def getRecursiveFileCollection(directoryPath: String): Seq[String] = dbutils.fs.ls(directoryPath).map(directoryItem => { // Work around double encoding bug val directoryItemPath = directoryItem.path.replace("%25", "%").replace("%25", "%") if (directoryItem.isDir) getRecursiveFileCollection(directoryItemPath) else Seq[String](directoryItemPath) }).reduce(_ ++ _) // COMMAND ---------- //Delete residual files from job operation (_SUCCESS, _start*, _committed*) def recursivelyDeleteSparkJobFlagFiles(directoryPath: String) { getRecursiveFileCollection(directoryPath).foreach(directoryItemPath => { if (directoryItemPath.indexOf("parquet") == -1) { println("Deleting...." + directoryItemPath) dbutils.fs.rm(directoryItemPath) }}) } // COMMAND ---------- dbutils.notebook.exit("Pass")
Example 28
Source File: PostUrl.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.http import java.io.{BufferedReader, InputStreamReader} import java.net.URI import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.commons.httpclient.HttpClient import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.http.client.methods.HttpPost import org.apache.http.entity.StringEntity import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.sql.SparkSession class PostUrl extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) override val description: String = "Send a post request to the specified http" var url : String= _ var jsonPath : String = _ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val spark = pec.get[SparkSession]() //read json from hdfs val conf = new Configuration() val fs = FileSystem.get(URI.create(jsonPath),conf) val stream: FSDataInputStream = fs.open(new Path(jsonPath)) val bufferReader = new BufferedReader(new InputStreamReader(stream)) var lineTxt = bufferReader.readLine() val buffer = new StringBuffer() while (lineTxt != null ){ buffer.append(lineTxt.mkString) lineTxt=bufferReader.readLine() } // post val client = HttpClients.createDefault() val httpClient = new HttpClient() httpClient.getParams().setContentCharset("utf-8") val post = new HttpPost(url) post.addHeader("content-Type","application/json") post.setEntity(new StringEntity(buffer.toString)) val response = client.execute(post) val entity = response.getEntity val str = EntityUtils.toString(entity,"UTF-8") println("Code is " + str) } override def setProperties(map: Map[String, Any]): Unit = { url = MapUtil.get(map,key="url").asInstanceOf[String] jsonPath = MapUtil.get(map,key="jsonPath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val url = new PropertyDescriptor() .name("url") .displayName("Url") .defaultValue("") .description("http request address") .required(true) .example("http://master:8002/flow/start") val jsonPath = new PropertyDescriptor() .name("jsonPath") .displayName("JsonPath") .defaultValue("") .description("json parameter path for post request") .required(true) .example("hdfs://master:9000/work/flow.json") descriptor = url :: descriptor descriptor = jsonPath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/http/PostUrl.png") } override def getGroup(): List[String] = { List(StopGroup.HttpGroup.toString) } override def initialize(ctx: ProcessContext): Unit = { } }
Example 29
Source File: Pathway.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism import java.io.{BufferedReader, InputStreamReader, OutputStreamWriter} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.json.JSONObject class Pathway extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse Pathway data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/pathway").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/Pathway.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val inDf: DataFrame = in.read() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val configuration: Configuration = new Configuration() val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/pathwayCache/pathwayCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) var fdis: FSDataInputStream = null var br: BufferedReader = null var doc: JSONObject = null var hasAnotherSequence:Boolean = true inDf.collect().foreach(row => { pathStr = row.get(0).asInstanceOf[String] fdis = fs.open(new Path(pathStr)) br = new BufferedReader(new InputStreamReader(fdis)) var count = 0 while (hasAnotherSequence) { count += 1 doc = new JSONObject hasAnotherSequence = util.KeggPathway.process(br, doc) doc.write(hdfsWriter) hdfsWriter.write("\n") } br.close() fdis.close() }) hdfsWriter.close() val df: DataFrame = pec.get[SparkSession]().read.json(hdfsPathTemporary) df.schema.printTreeString() println(df.count) out.write(df) } }
Example 30
Source File: PDBData.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism import java.io._ import cn.piflow.bundle.microorganism.util.PDB import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator} import org.json.JSONObject class PDBData extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse PDB data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session = pec.get[SparkSession]() val inDf: DataFrame = in.read() val configuration: Configuration = new Configuration() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/PDBCache/PDBCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) var doc: JSONObject = null var pdb: PDB = null var count:Int=0 inDf.collect().foreach(row => { count += 1 pathStr = row.get(0).asInstanceOf[String] pdb = new PDB(pathStr,fs) doc = pdb.getDoc doc.write(hdfsWriter) hdfsWriter.write("\n") doc = null }) hdfsWriter.close() val df: DataFrame = session.read.json(hdfsPathTemporary) out.write(df) } def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/PDB").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/PDBData.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } }
Example 31
Source File: Ensembl.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism import java.io._ import cn.piflow.bundle.microorganism.util.ParserGff3Data import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator} import org.json.JSONObject class Ensembl extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse ensembl data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/ensembl").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/Ensembl.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session = pec.get[SparkSession]() val inDf: DataFrame = in.read() val configuration: Configuration = new Configuration() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/ensemblCache/ensemblCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) val parser: ParserGff3Data = new ParserGff3Data var fdis: FSDataInputStream =null var br: BufferedReader = null var doc: JSONObject = null var count:Int = 0 inDf.collect().foreach(row => { pathStr = row.get(0).asInstanceOf[String] fdis = fs.open(new Path(pathStr)) br = new BufferedReader(new InputStreamReader(fdis)) var eachStr:String=null while((eachStr = br.readLine()) != null && eachStr != null ){ doc = parser.parserGff3(eachStr) if(doc.toString.length > 2){ count += 1 doc.write(hdfsWriter) hdfsWriter.write("\n") } } br.close() fdis.close() }) hdfsWriter.close() out.write(session.read.json(hdfsPathTemporary)) } }
Example 32
Source File: MergeStrategySpec.scala From daf with BSD 3-Clause "New" or "Revised" License
package daf.filesystem import java.io.{ Closeable, InputStream } import java.util.Scanner import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FSDataInputStream, FSDataOutputStream, FileSystem, Path } import org.scalatest.{ BeforeAndAfterAll, Matchers, WordSpec } import scala.collection.convert.decorateAsScala._ import scala.util.{ Random, Try } class MergeStrategySpec extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val fileSystem = FileSystem.getLocal(new Configuration) private val numFiles = 10 private val baseDir = "test-dir".asHadoop private val workingDir = baseDir / f"merge-strategy-spec-${Random.nextInt(10000)}%05d" private def safely[A <: Closeable, U](f: A => U) = { stream: A => val attempt = Try { f(stream) } stream.close() attempt } private def readFile(path: Path) = safely[FSDataInputStream, Seq[String]] { _.scanner.asScala.toSeq } apply fileSystem.open(path) private def readFiles = Try { fileSystem.listStatus(workingDir).toSeq.flatMap { status => readFile(status.getPath).get } } private def openFiles = Try { fileSystem.listStatus(workingDir).toSeq.map { status => fileSystem.open(status.getPath) } } private def createFile(fileName: String) = safely[FSDataOutputStream, Unit] { stream => Random.alphanumeric.grouped(200).take(10).map { randomSplits(_) }.foreach { row => stream.writeUTF { row.mkString("", ",", "\n") } } } apply fileSystem.create { workingDir / fileName } private def randomSplits(chars: Stream[Char], strings: Seq[String] = Seq.empty): Seq[String] = chars.splitAt { Random.nextInt(10) + 5 } match { case (head, tail) if tail.isEmpty => head.drop(1).mkString +: strings case (head, tail) => randomSplits(tail, head.mkString +: strings) } private def createWorkingDir = Try { fileSystem.mkdirs(workingDir) } private def createFiles = Try { 0 until numFiles foreach { index => createFile(s"test-file-$index").get } // this is relatively nasty, and should be handled in a `traverse` } private def prepareData = for { _ <- createWorkingDir _ <- createFiles } yield () private def purgeData = Try { fileSystem.delete(workingDir, true) } override def beforeAll() = prepareData.get override def afterAll() = purgeData.get "MergeStrategies info" when { "given compressed format files" must { "throw an exception" in { an[IllegalArgumentException] must be thrownBy MergeStrategies.find { FileInfo(workingDir / "test-file-0", 0, FileDataFormats.raw, FileCompressionFormats.gzip) } } } "given data as csv" must { "drop one line and merge the rest" in { safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt => for { merged <- attempt expected <- readFiles } merged.size should be { expected.size - numFiles + 1 } } apply MergeStrategies.csv.merge { openFiles.get } } } "given data as json" must { "just merge the files into one" in { safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt => for { merged <- attempt expected <- readFiles } merged.size should be { expected.size } } apply MergeStrategies.json.merge { openFiles.get } } } } }
Example 33
Source File: HiveUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.hive.common import java.io.File import java.nio.file.Paths import com.webank.wedatasphere.linkis.common.conf.{Configuration => CommonConfiguration} import com.webank.wedatasphere.linkis.engine.hive.exception.HadoopConfSetFailedException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.conf import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.Driver object HiveUtils { def jarOfClass(cls: Class[_]):Option[String] = { val uri = cls.getResource("/" + cls.getName.replace('.', '/') + ".class") if (uri != null) { val uriStr = uri.toString if (uriStr.startsWith("jar:file:")) { Some(uriStr.substring("jar:file:".length, uriStr.indexOf("!"))) } else { None } } else { None } } def getHiveConf:HiveConf = { val confDir:File = new File(CommonConfiguration.hadoopConfDir) if (!confDir.exists() || confDir.isFile){ throw HadoopConfSetFailedException(41001, "hadoop conf set failed, reason: conf dir does not exist") } val hadoopConf:Configuration = new Configuration() hadoopConf.addResource(new Path(Paths.get(CommonConfiguration.hadoopConfDir, "core-site.xml").toAbsolutePath.toFile.getAbsolutePath)) hadoopConf.addResource(new Path(Paths.get(CommonConfiguration.hadoopConfDir, "hdfs-site.xml").toAbsolutePath.toFile.getAbsolutePath)) hadoopConf.addResource(new Path(Paths.get(CommonConfiguration.hadoopConfDir, "yarn-site.xml").toAbsolutePath.toFile.getAbsolutePath)) new conf.HiveConf(hadoopConf, classOf[Driver]) } def msDurationToString(ms: Long): String = { val second = 1000 val minute = 60 * second val hour = 60 * minute ms match { case t if t < second => "%d ms".format(t) case t if t < minute => "%.1f s".format(t.toFloat / second) case t if t < hour => "%.1f m".format(t.toFloat / minute) case t => "%.2f h".format(t.toFloat / hour) } } def main(args: Array[String]): Unit = { jarOfClass(classOf[Driver]).foreach(println) } }
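A minimal sketch showing how the two self-contained helpers above might be called; the only assumption is that the Hive Driver class is on the classpath, as in the listing's own main method.

import com.webank.wedatasphere.linkis.engine.hive.common.HiveUtils
import org.apache.hadoop.hive.ql.Driver

object HiveUtilsExample {
  def main(args: Array[String]): Unit = {
    // Locate the jar that contains the Hive Driver class, if it was loaded from a jar.
    HiveUtils.jarOfClass(classOf[Driver]).foreach(jar => println(s"hive-exec jar: $jar"))
    // Render a few durations with the humanised format of msDurationToString.
    Seq(250L, 12500L, 250000L, 7200000L).foreach(ms => println(HiveUtils.msDurationToString(ms)))
  }
}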
Example 34
Source File: HDFSUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.hadoop.common.utils import java.io.File import java.nio.file.Paths import java.security.PrivilegedExceptionAction import com.webank.wedatasphere.linkis.common.conf.Configuration.hadoopConfDir import com.webank.wedatasphere.linkis.hadoop.common.conf.HadoopConf._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.UserGroupInformation object HDFSUtils { def getConfiguration(user: String): Configuration = getConfiguration(user, hadoopConfDir) def getConfiguration(user: String, hadoopConfDir: String): Configuration = { val confPath = new File(hadoopConfDir) if(!confPath.exists() || confPath.isFile) { throw new RuntimeException(s"Create hadoop configuration failed, path $hadoopConfDir not exists.") } val conf = new Configuration() conf.addResource(new Path(Paths.get(hadoopConfDir, "core-site.xml").toAbsolutePath.toFile.getAbsolutePath)) conf.addResource(new Path(Paths.get(hadoopConfDir, "hdfs-site.xml").toAbsolutePath.toFile.getAbsolutePath)) conf.addResource(new Path(Paths.get(hadoopConfDir, "yarn-site.xml").toAbsolutePath.toFile.getAbsolutePath)) conf } def getHDFSRootUserFileSystem: FileSystem = getHDFSRootUserFileSystem(getConfiguration(HADOOP_ROOT_USER.getValue)) def getHDFSRootUserFileSystem(conf: org.apache.hadoop.conf.Configuration): FileSystem = getHDFSUserFileSystem(HADOOP_ROOT_USER.getValue, conf) def getHDFSUserFileSystem(userName: String): FileSystem = getHDFSUserFileSystem(userName, getConfiguration(userName)) def getHDFSUserFileSystem(userName: String, conf: org.apache.hadoop.conf.Configuration): FileSystem = getUserGroupInformation(userName) .doAs(new PrivilegedExceptionAction[FileSystem]{ def run = FileSystem.get(conf) }) def getUserGroupInformation(userName: String): UserGroupInformation ={ if(KERBEROS_ENABLE.getValue) { val path = new File(KEYTAB_FILE.getValue , userName + ".keytab").getPath val user = getKerberosUser(userName) UserGroupInformation.setConfiguration(getConfiguration(userName)) UserGroupInformation.loginUserFromKeytabAndReturnUGI(user, path) } else { UserGroupInformation.createRemoteUser(userName) } } def getKerberosUser(userName: String): String = { var user = userName if(KEYTAB_HOST_ENABLED.getValue){ user = user+ "/" + KEYTAB_HOST.getValue } user } }
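A hedged usage sketch for HDFSUtils: it assumes a Linkis deployment with a valid hadoopConfDir and a hypothetical proxy user named "hadoop", and it only calls the methods shown above.

import com.webank.wedatasphere.linkis.hadoop.common.utils.HDFSUtils
import org.apache.hadoop.fs.Path

object HDFSUtilsExample {
  def main(args: Array[String]): Unit = {
    // Build a FileSystem that impersonates the (hypothetical) "hadoop" user.
    val fs = HDFSUtils.getHDFSUserFileSystem("hadoop")
    // List the user's home directory and print each entry's path.
    fs.listStatus(new Path("/user/hadoop")).foreach(status => println(status.getPath))
    fs.close()
  }
}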
Example 35
Source File: Credentials.scala From spark-select with Apache License 2.0 | 5 votes |
package io.minio.spark.select import java.net.URI // For BasicAWSCredentials import com.amazonaws.auth.AWSCredentials import com.amazonaws.auth.AWSCredentialsProvider import com.amazonaws.auth.BasicAWSCredentials import com.amazonaws.auth.BasicSessionCredentials import com.amazonaws.auth.DefaultAWSCredentialsProviderChain import org.apache.hadoop.conf.Configuration private[spark] object Credentials { private def staticCredentialsProvider(credentials: AWSCredentials): AWSCredentialsProvider = { new AWSCredentialsProvider { override def getCredentials: AWSCredentials = credentials override def refresh(): Unit = {} } } def load(location: Option[String], hadoopConfiguration: Configuration): AWSCredentialsProvider = { val uri = new URI(location.getOrElse("")) val uriScheme = uri.getScheme uriScheme match { case "s3" | "s3a" => // This matches what S3A does, with one exception: we don't // support anonymous credentials. First, try to parse from URI: Option(uri.getUserInfo).flatMap { userInfo => if (userInfo.contains(":")) { val Array(accessKey, secretKey) = userInfo.split(":") Some(staticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey))) } else { None } }.orElse { val accessKey = hadoopConfiguration.get(s"fs.s3a.access.key", null) val secretKey = hadoopConfiguration.get(s"fs.s3a.secret.key", null) val sessionToken = hadoopConfiguration.get(s"fs.s3a.session.token", null) if (accessKey != null && secretKey != null) { if (sessionToken != null) { Some(staticCredentialsProvider(new BasicSessionCredentials(accessKey, secretKey, sessionToken))) } else { Some(staticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey))) } } else { None } }.getOrElse { // Finally, fall back on the instance profile provider new DefaultAWSCredentialsProviderChain() } case other => throw new IllegalArgumentException(s"Unrecognized scheme $other; expected s3, or s3a") } } }
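A small sketch of how Credentials.load might be driven from configuration; because the object is private[spark], the sketch sits in the same package, and the keys and bucket name are placeholders.

package io.minio.spark.select

import org.apache.hadoop.conf.Configuration

object CredentialsExample {
  def main(args: Array[String]): Unit = {
    val hadoopConf = new Configuration()
    hadoopConf.set("fs.s3a.access.key", "MY_ACCESS_KEY") // placeholder
    hadoopConf.set("fs.s3a.secret.key", "MY_SECRET_KEY") // placeholder
    // Resolve a provider for a hypothetical bucket; falls back to the default chain if no keys are found.
    val provider = Credentials.load(Some("s3a://my-bucket/data"), hadoopConf)
    println(provider.getCredentials.getAWSAccessKeyId)
  }
}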
Example 36
Source File: HiveTezSuite.scala From connectors with Apache License 2.0 | 5 votes |
package io.delta.hive import java.io.{Closeable, File} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapreduce.MRJobConfig import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.tez.dag.api.TezConfiguration import org.apache.tez.runtime.library.api.TezRuntimeConfiguration import org.apache.tez.test.MiniTezCluster class HiveTezSuite extends HiveConnectorTest { override val engine: String = "tez" private var tezConf: Configuration = _ // scalastyle:off // scalastyle:on override def setupConfiguration(conf: Configuration): Unit = { tezConf.asScala.foreach { e => conf.set(e.getKey, e.getValue) } // Overrides values from the hive/tez-site. conf.setInt("hive.tez.container.size", 256) conf.setInt(TezConfiguration.TEZ_AM_RESOURCE_MEMORY_MB, 256) conf.setInt(TezConfiguration.TEZ_TASK_RESOURCE_MEMORY_MB, 256) conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 24) conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_UNORDERED_OUTPUT_BUFFER_SIZE_MB, 10) conf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 0.4f) conf.setBoolean(TezConfiguration.TEZ_IGNORE_LIB_URIS, true) } }
Example 37
Source File: HiveMRSuite.scala From connectors with Apache License 2.0 | 5 votes |
package io.delta.hive import java.io.{Closeable, File} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapred.{JobConf, MiniMRCluster} import org.apache.hadoop.mapreduce.MRJobConfig import org.apache.hadoop.yarn.conf.YarnConfiguration class HiveMRSuite extends HiveConnectorTest { override val engine: String = "mr" override def createCluster(namenode: String, conf: Configuration, tempPath: File): Closeable = { val jConf = new JobConf(conf); jConf.set("yarn.scheduler.capacity.root.queues", "default"); jConf.set("yarn.scheduler.capacity.root.default.capacity", "100"); jConf.setInt(MRJobConfig.MAP_MEMORY_MB, 512); jConf.setInt(MRJobConfig.REDUCE_MEMORY_MB, 512); jConf.setInt(MRJobConfig.MR_AM_VMEM_MB, 128); jConf.setInt(YarnConfiguration.YARN_MINICLUSTER_NM_PMEM_MB, 512); jConf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 128); jConf.setInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB, 512); val mr = new MiniMRCluster(2, namenode, 1, null, null, jConf) new Closeable { override def close(): Unit = { mr.shutdown() } } } }
Example 38
Source File: ModelLoader.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.spark.ml.Transformer

trait ModelLoader[T <: Transformer] {

  def load(source: ModelSource): T

  final def load(path: String): T = {
    val source = if (path.startsWith("hdfs://")) {
      val uri = new URI(path)
      val p = uri.getPath
      ModelSource.hadoop(p, new Configuration())
    } else {
      ModelSource.local(path)
    }
    load(source)
  }
}
Example 39
Source File: ModelSource.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common import java.io.{InputStreamReader, BufferedReader} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path, FileSystem} case class ModelSource( root: String, fs: FileSystem ) { def readFile(path: String): String = { val fsPath = filePath(path) val reader = new BufferedReader(new InputStreamReader(fs.open(fsPath))) val builder = new StringBuilder() var line: String = null while ({ line = reader.readLine(); line != null }) { builder.append(line + "\n") } builder.mkString } def findFile(dir: String, recursive: Boolean, f: String => Boolean): Option[Path] = { val dirPath = filePath(dir) if (fs.exists(dirPath) & fs.isDirectory(dirPath)) { val iter = fs.listFiles(dirPath, recursive) while (iter.hasNext) { val st = iter.next() if (st.isFile && f(st.getPath.getName)) return Some(st.getPath) } None } else { None } } def filePath(path: String): Path = { new Path(s"$root/$path") } } object ModelSource { def local(path: String): ModelSource = { ModelSource(path, FileSystem.getLocal(new Configuration())) } def hadoop(path: String, conf: Configuration): ModelSource = { val fs = FileSystem.get(conf) ModelSource(path, fs) } }
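A brief, hedged sketch of ModelSource in use; the model directory and the metadata file name are hypothetical, and only the local/findFile/readFile methods defined above are called.

import io.hydrosphere.spark_ml_serving.common.ModelSource

object ModelSourceExample {
  def main(args: Array[String]): Unit = {
    // Open a model directory on the local file system (hypothetical path).
    val source = ModelSource.local("/tmp/my-model")
    // Find the first parquet part file under "data", if any, and print its name.
    source.findFile("data", recursive = true, _.endsWith(".parquet")).foreach(p => println(p.getName))
    // Read a small text file from the model directory (hypothetical layout).
    println(source.readFile("metadata/part-00000"))
  }
}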
Example 40
Source File: ModelDataReader.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common import io.hydrosphere.spark_ml_serving.common.reader._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import parquet.format.converter.ParquetMetadataConverter.NO_FILTER import parquet.hadoop.{ParquetFileReader, ParquetReader} import parquet.schema.MessageType import scala.collection.immutable.HashMap import scala.collection.mutable object ModelDataReader { def parse(source: ModelSource, path: String): LocalData = { source.findFile(path, recursive = true, _.endsWith(".parquet")) match { case Some(p) => readData(p) case None => LocalData.empty } } private def readData(p: Path): LocalData = { val conf: Configuration = new Configuration() val metaData = ParquetFileReader.readFooter(conf, p, NO_FILTER) val schema: MessageType = metaData.getFileMetaData.getSchema val reader = ParquetReader.builder[SimpleRecord](new SimpleReadSupport(), p.getParent).build() var result = LocalData.empty try { var value = reader.read() while (value != null) { val valMap = value.struct(HashMap.empty[String, Any], schema) result = mergeMaps(result, valMap) value = reader.read() } result } finally { if (reader != null) { reader.close() } } } private def mergeMaps(acc: LocalData, map: HashMap[String, Any]) = { var result = acc map.foreach { case (k, v) => result = result.appendToColumn(k, List(v)) } result } }
Example 41
Source File: SimpleReadSupport.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common.reader import java.util import org.apache.hadoop.conf.Configuration import parquet.hadoop.api.ReadSupport.ReadContext import parquet.hadoop.api.{InitContext, ReadSupport} import parquet.io.api.RecordMaterializer import parquet.schema.MessageType class SimpleReadSupport extends ReadSupport[SimpleRecord] { override def prepareForRead( configuration: Configuration, map: util.Map[String, String], messageType: MessageType, readContext: ReadContext ): RecordMaterializer[SimpleRecord] = { new SimpleRecordMaterializer(messageType) } override def init(context: InitContext): ReadContext = { new ReadContext(context.getFileSchema) } }
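A sketch that wires SimpleReadSupport into a ParquetReader in the same way ModelDataReader does above; the parquet directory is hypothetical.

import io.hydrosphere.spark_ml_serving.common.reader.{SimpleReadSupport, SimpleRecord}
import org.apache.hadoop.fs.Path
import parquet.hadoop.ParquetReader

object SimpleReadSupportExample {
  def main(args: Array[String]): Unit = {
    // Build a reader over a hypothetical parquet directory using the read support above.
    val reader = ParquetReader.builder[SimpleRecord](new SimpleReadSupport(), new Path("/tmp/model/data")).build()
    try {
      var record = reader.read()
      while (record != null) {
        println(record)
        record = reader.read()
      }
    } finally reader.close()
  }
}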
Example 42
Source File: HBaseGlobalValues.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.server.hbase

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory}

object HBaseGlobalValues {
  var appEventTableName = "app-event"
  var numberOfSalts = 10000
  var connection: Connection = null

  def init(conf: Configuration, numberOfSalts: Int, appEventTableName: String): Unit = {
    connection = ConnectionFactory.createConnection(conf)
    this.numberOfSalts = numberOfSalts
    this.appEventTableName = appEventTableName
  }
}
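A minimal sketch of the intended initialise-once pattern, assuming a reachable HBase cluster; the salt count and table name simply reuse the defaults above.

import com.hadooparchitecturebook.taxi360.server.hbase.HBaseGlobalValues
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}

object HBaseGlobalValuesExample {
  def main(args: Array[String]): Unit = {
    // Initialise the shared connection once at application startup.
    HBaseGlobalValues.init(HBaseConfiguration.create(), numberOfSalts = 10000, appEventTableName = "app-event")
    // Later code can reuse the singleton connection.
    val table = HBaseGlobalValues.connection.getTable(TableName.valueOf(HBaseGlobalValues.appEventTableName))
    println(table.getName)
    table.close()
    HBaseGlobalValues.connection.close()
  }
}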
Example 43
Source File: Util.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util import java.io._ import java.util.concurrent.atomic.AtomicInteger import java.util.zip.{DeflaterOutputStream, InflaterInputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hbase.HBaseConfiguration object Util { val iteration = new AtomicInteger(0) def getTempFilePath(conf: Configuration, prefix: String): String = { val fileSystem = FileSystem.get(conf) val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}") if (fileSystem.exists(path)) { fileSystem.delete(path, true) } path.getName } def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { val bos = new ByteArrayOutputStream val deflaterOutputStream = new DeflaterOutputStream(bos) val dos = new DataOutputStream(deflaterOutputStream) configuration.write(dos) dos.close() bos.toByteArray } def deserializeHBaseConfiguration(arr: Array[Byte]) = { val conf = HBaseConfiguration.create conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) conf } }
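A short round-trip sketch for the serialize/deserialize helpers above; the quorum value is a placeholder.

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.sql.hbase.util.Util

object UtilExample {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "zk-host") // placeholder
    // Round-trip the configuration through the compressed byte form used above.
    val bytes = Util.serializeHBaseConfiguration(conf)
    val restored = Util.deserializeHBaseConfiguration(bytes)
    println(restored.get("hbase.zookeeper.quorum"))
  }
}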
Example 44
Source File: RecommenderSystem.scala From recommendersystem with Apache License 2.0 | 5 votes |
package com.infosupport.recommendedcontent.core import java.io.Serializable import akka.actor.{Props, Actor, ActorLogging} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.mllib.recommendation.MatrixFactorizationModel private def generateRecommendations(userId: Int, count: Int) = { log.info(s"Generating ${count} recommendations for user with ID ${userId}") // Generate recommendations based on the machine learning model. // When there's no trained model return an empty list instead. val results = model match { case Some(m) => m.recommendProducts(userId,count) .map(rating => Recommendation(rating.product,rating.rating)) .toList case None => Nil } sender ! Recommendations(results) } }
Example 45
Source File: AvroSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.io.File import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} case class AvroSource(path: Path) (implicit conf: Configuration, fs: FileSystem) extends Source with Using { override lazy val schema: StructType = { using(AvroReaderFns.createAvroReader(path)) { reader => val record = reader.next() AvroSchemaFns.fromAvroSchema(record.getSchema) } } override def parts(): Seq[Publisher[Seq[Row]]] = Seq(AvroSourcePublisher(path)) } case class AvroSourcePublisher(path: Path) (implicit conf: Configuration, fs: FileSystem) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { val deserializer = new AvroDeserializer() try { using(AvroReaderFns.createAvroReader(path)) { reader => val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) AvroRecordIterator(reader) .takeWhile(_ => running.get) .map(deserializer.toRow) .grouped(DataStream.DefaultBatchSize) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } } object AvroSource { def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSource = AvroSource(new Path(file.getAbsoluteFile.toString)) def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSource = apply(path.toFile) }
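A hedged sketch reading a hypothetical avro file with the source above; it relies only on toDataStream and collect, which the tests later on this page also use.

import java.io.File

import io.eels.component.avro.AvroSource
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

object AvroSourceExample {
  def main(args: Array[String]): Unit = {
    implicit val conf = new Configuration()
    implicit val fs = FileSystem.get(conf)
    // Read a hypothetical avro file into a DataStream and print every row.
    val source = AvroSource(new File("people.avro"))
    println(source.schema)
    source.toDataStream().collect.foreach(println)
  }
}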
Example 46
Source File: AvroSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.io.File import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} case class AvroSink(path: Path, overwrite: Boolean = false, permission: Option[FsPermission] = None, inheritPermissions: Option[Boolean] = None) (implicit conf: Configuration, fs: FileSystem) extends Sink { def withOverwrite(overwrite: Boolean): AvroSink = copy(overwrite = overwrite) def withPermission(permission: FsPermission): AvroSink = copy(permission = Option(permission)) def withInheritPermission(inheritPermissions: Boolean): AvroSink = copy(inheritPermissions = Option(inheritPermissions)) override def open(schema: StructType): SinkWriter = new SinkWriter { private val writer = new AvroWriter(schema, fs.create(path, overwrite)) override def write(row: Row): Unit = writer.write(row) override def close(): Unit = { writer.close() permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } } object AvroSink { def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSink = AvroSink(new Path(file.getAbsoluteFile.toString)) def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSink = apply(path.toFile) }
Example 47
Source File: SequenceSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import java.io.StringWriter import com.univocity.parsers.csv.{CsvWriter, CsvWriterSettings} import io.eels.{Row, Sink, SinkWriter} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} case class SequenceSink(path: Path)(implicit conf: Configuration) extends Sink { override def open(schema: StructType): SinkWriter = new SequenceSinkWriter(schema, path) class SequenceSinkWriter(schema: StructType, path: Path) extends SinkWriter { val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(path), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[BytesWritable]) ) val key = new IntWritable(0) val headers = valuesToCsv(schema.fieldNames()) writer.append(key, new BytesWritable(headers.getBytes)) override def close(): Unit = writer.close() override def write(row: Row): Unit = { this.synchronized { val csv = valuesToCsv(row.values) writer.append(key, new BytesWritable(csv.getBytes())) key.set(key.get() + 1) } } private def valuesToCsv(values: Seq[Any]): String = { val swriter = new StringWriter() val csv = new CsvWriter(swriter, new CsvWriterSettings()) csv.writeRow(values.map { case null => null case other => other.toString }: _*) csv.close() swriter.toString().trim() } } }
Example 48
Source File: SequenceSupport.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import java.io.StringReader import java.nio.charset.Charset import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels.component.csv.{CsvFormat, CsvSupport} import io.eels.schema.{Field, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} object SequenceSupport extends Logging with Using { def createReader(path: Path)(implicit conf: Configuration): SequenceFile.Reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path)) def toValues(v: BytesWritable): Array[String] = toValues(new String(v.copyBytes(), Charset.forName("UTF8"))) def toValues(str: String): Array[String] = { val parser = CsvSupport.createParser(CsvFormat(), false, false, false, null, null) parser.beginParsing(new StringReader(str)) val record = parser.parseNext() parser.stopParsing() record } def schema(path: Path)(implicit conf: Configuration): StructType = { logger.debug(s"Fetching sequence schema for $path") using(createReader(path)) { it => val k = new IntWritable() val v = new BytesWritable() val fields: Array[Field] = { it.next(k, v) toValues(v).map { it => new Field(it) } } StructType(fields.toList) } } }
Example 49
Source File: SequenceSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} case class SequenceSource(path: Path)(implicit conf: Configuration) extends Source with Logging { logger.debug(s"Creating sequence source from $path") override def schema: StructType = SequenceSupport.schema(path) override def parts(): Seq[Publisher[Seq[Row]]] = List(new SequencePublisher(path)) } object SequenceReaderIterator { def apply(schema: StructType, reader: SequenceFile.Reader): Iterator[Row] = new Iterator[Row] { private val k = new IntWritable() private val v = new BytesWritable() // throw away the header reader.next(k, v) override def next(): Row = Row(schema, SequenceSupport.toValues(v).toVector) override def hasNext(): Boolean = reader.next(k, v) } } class SequencePublisher(val path: Path)(implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { using(SequenceSupport.createReader(path)) { reader => val schema = SequenceSupport.schema(path) val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) SequenceReaderIterator(schema, reader) .takeWhile(_ => running.get) .grouped(DataStream.DefaultBatchSize) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } }
Example 50
Source File: RowParquetReaderFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.api.ReadSupport import org.apache.parquet.hadoop.{ParquetFileReader, ParquetReader} import org.apache.parquet.schema.Type def apply(path: Path, predicate: Option[Predicate], readSchema: Option[Type], dictionaryFiltering: Boolean)(implicit conf: Configuration): ParquetReader[Row] = { logger.debug(s"Opening parquet reader for $path") // The parquet reader can use a projection by setting a projected schema onto the supplied conf object def configuration(): Configuration = { val newconf = new Configuration(conf) readSchema.foreach { it => newconf.set(ReadSupport.PARQUET_READ_SCHEMA, it.toString) } //newconf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, dictionaryFiltering.toString) newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString) newconf } // a filter is set when we have a predicate for the read def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build) .map(FilterCompat.get) .getOrElse(FilterCompat.NOOP) ParquetReader.builder(new RowReadSupport, path) .withConf(configuration()) .withFilter(filter()) .build() } }
Example 51
Source File: ParquetPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.parquet.util.ParquetIterator import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.schema.MessageType class ParquetPublisher(path: Path, predicate: Option[Predicate], projection: Seq[String], caseSensitive: Boolean, dictionaryFiltering: Boolean) (implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using { def readSchema: Option[MessageType] = { if (projection.isEmpty) None else { val fileSchema = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER).getFileMetaData.getSchema val structType = ParquetSchemaFns.fromParquetMessageType(fileSchema) if (caseSensitive) { assert( structType.fieldNames.toSet.size == structType.fieldNames.map(_.toLowerCase).toSet.size, "Cannot use case sensitive = true when this would result in a clash of field names" ) } val projectionSchema = StructType(projection.map { field => structType.field(field, caseSensitive).getOrError(s"Requested field $field does not exist in the parquet schema") }) ParquetSchemaFns.toParquetMessageType(projectionSchema).some } } override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { using(RowParquetReaderFn(path, predicate, readSchema, dictionaryFiltering)) { reader => val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) ParquetIterator(reader) .grouped(DataStream.DefaultBatchSize) .takeWhile(_ => running.get) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } }
Example 52
Source File: AvroParquetSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.avro.{AvroSchemaFns, AvroSchemaMerge} import io.eels.component.parquet._ import io.eels.datastream.Publisher import io.eels.schema.StructType import io.eels.{FilePattern, Predicate, _} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{Footer, ParquetFileReader} import scala.collection.JavaConverters._ object AvroParquetSource { def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(new Path(uri.toString))) def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(path)) } case class AvroParquetSource(pattern: FilePattern, predicate: Option[Predicate] = None) (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using { private lazy val paths = pattern.toPaths() def withPredicate(pred: Predicate): AvroParquetSource = copy(predicate = pred.some) // the schema returned by the parquet source should be a merged version of the // schemas contained in all the files. override def schema: StructType = { val schemas = paths.map { path => using(AvroParquetReaderFn.apply(path, predicate, None)) { reader => val record = Option(reader.read()).getOrElse { sys.error(s"Cannot read $path for schema; file contains no records") } record.getSchema } } val avroSchema = AvroSchemaMerge("record", "namspace", schemas) AvroSchemaFns.fromAvroSchema(avroSchema) } // returns the count of all records in this source, predicate is ignored def countNoPredicate(): Long = statistics().count // returns stats, predicate is ignored def statistics(): Statistics = { if (paths.isEmpty) Statistics.Empty else { paths.foldLeft(Statistics.Empty) { (stats, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.foldLeft(stats) { (stats, block) => stats.copy( count = stats.count + block.getRowCount, compressedSize = stats.compressedSize + block.getCompressedSize, uncompressedSize = stats.uncompressedSize + block.getTotalByteSize ) } } } } override def parts(): Seq[Publisher[Seq[Row]]] = { logger.debug(s"AvroParquetSource source has ${paths.size} files: $paths") paths.map { it => new AvroParquetPublisher(it, predicate) } } def footers(): List[Footer] = { logger.debug(s"AvroParquetSource source will read footers from $paths") paths.flatMap { it => val status = fs.getFileStatus(it) logger.debug(s"status=$status; path=$it") ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala } } }
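A small sketch of AvroParquetSource against a hypothetical parquet file; schema and statistics are the methods defined above, and the file must already exist for the schema read to succeed.

import io.eels.component.parquet.avro.AvroParquetSource
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object AvroParquetSourceExample {
  def main(args: Array[String]): Unit = {
    implicit val conf = new Configuration()
    implicit val fs = FileSystem.get(conf)
    // Point the source at a hypothetical parquet file written with the Avro object model.
    val source = AvroParquetSource(new Path("file:///tmp/people.parquet"))
    println(source.schema)       // merged schema across all matched files
    println(source.statistics()) // row counts and byte sizes taken from the footers
  }
}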
Example 53
Source File: AvroParquetReaderFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import io.eels.Predicate import io.eels.component.parquet.{ParquetPredicateBuilder, ParquetReaderConfig} import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.avro.{AvroParquetReader, AvroReadSupport} import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.hadoop.ParquetReader def apply(path: Path, predicate: Option[Predicate], projectionSchema: Option[Schema])(implicit conf: Configuration): ParquetReader[GenericRecord] = { // The parquet reader can use a projection by setting a projected schema onto a conf object def configuration(): Configuration = { val newconf = new Configuration(conf) projectionSchema.foreach { it => AvroReadSupport.setAvroReadSchema(newconf, it) AvroReadSupport.setRequestedProjection(newconf, it) } //conf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, "true") newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString) newconf } // a filter is set when we have a predicate for the read def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build) .map(FilterCompat.get) .getOrElse(FilterCompat.NOOP) AvroParquetReader.builder[GenericRecord](path) .withCompatibility(false) .withConf(configuration()) .withFilter(filter()) .build() .asInstanceOf[ParquetReader[GenericRecord]] } }
Example 54
Source File: AvroParquetPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels.component.avro.AvroDeserializer import io.eels.component.parquet.util.ParquetIterator import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path class AvroParquetPublisher(path: Path, predicate: Option[Predicate])(implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { val deser = new AvroDeserializer() val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) using(AvroParquetReaderFn(path, predicate, None)) { reader => ParquetIterator(reader) .map(deser.toRow) .grouped(DataStream.DefaultBatchSize) .takeWhile(_ => running.get) .foreach(subscriber.next) } subscriber.completed() } catch { case t: Throwable => subscriber.error(t) } } }
Example 55
Source File: RowWriteSupport.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import io.eels.Row import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext import org.apache.parquet.io.api.RecordConsumer import org.apache.parquet.schema.MessageType import scala.collection.JavaConverters._ import scala.math.BigDecimal.RoundingMode.RoundingMode // implementation of WriteSupport for Row's used by the native ParquetWriter class RowWriteSupport(schema: MessageType, roundingMode: RoundingMode, metadata: Map[String, String]) extends WriteSupport[Row] with Logging { logger.trace(s"Created parquet row write support for schema message type $schema") private var writer: RowWriter = _ override def finalizeWrite(): FinalizedWriteContext = new FinalizedWriteContext(metadata.asJava) def init(configuration: Configuration): WriteSupport.WriteContext = { new WriteSupport.WriteContext(schema, new java.util.HashMap()) } def prepareForWrite(record: RecordConsumer) { writer = new RowWriter(record, roundingMode) } def write(row: Row) { writer.write(row) } } class RowWriter(record: RecordConsumer, roundingMode: RoundingMode) { def write(row: Row): Unit = { record.startMessage() val writer = new StructRecordWriter(row.schema, roundingMode, false) writer.write(record, row.values) record.endMessage() } }
Example 56
Source File: RowParquetWriterFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter} import org.apache.parquet.schema.MessageType import scala.math.BigDecimal.RoundingMode.RoundingMode object RowParquetWriterFn { class RowParquetWriterBuilder(path: Path, schema: MessageType, roundingMode: RoundingMode, metadata: Map[String, String]) extends ParquetWriter.Builder[Row, RowParquetWriterBuilder](path) { override def getWriteSupport(conf: Configuration): WriteSupport[Row] = new RowWriteSupport(schema, roundingMode, metadata) override def self(): RowParquetWriterBuilder = this } def apply(path: Path, schema: StructType, metadata: Map[String, String], dictionary: Boolean, roundingMode: RoundingMode, fsConfig: Configuration): ParquetWriter[Row] = { val config = ParquetWriterConfig() val messageType = ParquetSchemaFns.toParquetMessageType(schema) new RowParquetWriterBuilder(path, messageType, roundingMode, metadata) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(dictionary) .withPageSize(config.pageSize) .withRowGroupSize(config.blockSize) .withValidation(config.validating) .withWriteMode(ParquetFileWriter.Mode.CREATE) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withConf(fsConfig) .build() } }
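A hedged sketch that writes two rows through the builder above; the output path is hypothetical and the schema/Row construction mirrors the tests further down this page.

import io.eels.Row
import io.eels.component.parquet.RowParquetWriterFn
import io.eels.schema.{Field, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import scala.math.BigDecimal.RoundingMode

object RowParquetWriterFnExample {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Field("name", StringType), Field("location", StringType))
    val writer = RowParquetWriterFn(
      new Path("file:///tmp/people.parquet"),
      schema,
      metadata = Map("written.by" -> "example"),
      dictionary = true,
      roundingMode = RoundingMode.HALF_UP,
      fsConfig = new Configuration()
    )
    writer.write(Row(schema, Vector("clint eastwood", "carmel")))
    writer.write(Row(schema, Vector("elton john", "pinner")))
    writer.close()
  }
}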
Example 57
Source File: ParquetSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.datastream.Publisher import io.eels.{Predicate, _} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{Footer, ParquetFileReader} import scala.collection.JavaConverters._ object ParquetSource { def apply(string: String)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(string)) def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(new Path(uri.toString))) def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(path)) } case class ParquetSource(pattern: FilePattern, predicate: Option[Predicate] = None, projection: Seq[String] = Nil, dictionaryFiltering: Boolean = true, caseSensitive: Boolean = true) (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using { logger.debug(s"Created parquet source with pattern=$pattern") lazy val paths: List[Path] = pattern.toPaths() def withDictionaryFiltering(dictionary: Boolean): ParquetSource = copy(dictionaryFiltering = dictionary) def withCaseSensitivity(caseSensitive: Boolean): ParquetSource = copy(caseSensitive = caseSensitive) def withPredicate(pred: => Predicate): ParquetSource = copy(predicate = pred.some) def withProjection(first: String, rest: String*): ParquetSource = withProjection(first +: rest) def withProjection(fields: Seq[String]): ParquetSource = { require(fields.nonEmpty) copy(projection = fields.toList) } // returns the metadata in the parquet file, or an empty map if none def metadata(): Map[String, String] = { paths.foldLeft(Map.empty[String, String]) { (metadata, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) metadata ++ footer.getFileMetaData.getKeyValueMetaData.asScala } } // todo should take the merged schema from all files lazy val schema: StructType = RowParquetReaderFn.schema(paths.headOption.getOrError("No paths found for source")) // returns the count of all records in this source, predicate is ignored def countNoPredicate(): Long = statistics().count // returns stats, predicate is ignored def statistics(): Statistics = { if (paths.isEmpty) Statistics.Empty else { paths.foldLeft(Statistics.Empty) { (stats, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.foldLeft(stats) { (stats, block) => stats.copy( count = stats.count + block.getRowCount, compressedSize = stats.compressedSize + block.getCompressedSize, uncompressedSize = stats.uncompressedSize + block.getTotalByteSize ) } } } } override def parts(): Seq[Publisher[Seq[Row]]] = { logger.debug(s"Parquet source has ${paths.size} files: ${paths.mkString(", ")}") paths.map { it => new ParquetPublisher(it, predicate, projection, caseSensitive, dictionaryFiltering) } } def footers(): List[Footer] = { logger.debug(s"Parquet source will read footers from $paths") paths.flatMap { it => val status = fs.getFileStatus(it) logger.debug(s"status=$status; path=$it") ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala } } }
Example 58
Source File: HdfsWatcher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hdfs import java.util.concurrent.Executors import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import io.eels.util.HdfsIterator import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.client.HdfsAdmin import org.apache.hadoop.hdfs.inotify.Event import scala.concurrent.duration._ import scala.util.control.NonFatal class HdfsWatcher(path: Path, callback: FileCallback) (implicit fs: FileSystem, conf: Configuration) extends Logging { private val files = HdfsIterator.remote(fs.listFiles(path, false)).map(_.getPath).toBuffer files.foreach(callback.onStart) private val executor = Executors.newSingleThreadExecutor() private val running = new AtomicBoolean(true) private val interval = 5.seconds private val admin = new HdfsAdmin(path.toUri, conf) private val eventStream = admin.getInotifyEventStream executor.submit(new Runnable { override def run(): Unit = { while (running.get) { try { Thread.sleep(interval.toMillis) val events = eventStream.take for (event <- events.getEvents) { event match { case create: Event.CreateEvent => callback.onCreate(create) case append: Event.AppendEvent => callback.onAppend(append) case rename: Event.RenameEvent => callback.onRename(rename) case close: Event.CloseEvent => callback.onClose(close) case _ => } } } catch { case NonFatal(e) => logger.error("Error while polling fs", e) } } } }) def stop(): Unit = { running.set(false) executor.shutdownNow() } } trait FileCallback { def onStart(path: Path): Unit def onClose(close: Event.CloseEvent): Unit def onRename(rename: Event.RenameEvent): Unit def onAppend(append: Event.AppendEvent): Unit def onCreate(path: Event.CreateEvent): Unit }
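A sketch of a FileCallback implementation plugged into the watcher above; it assumes a real HDFS namenode (inotify is not available on the local file system) and a hypothetical directory to watch.

import io.eels.component.hdfs.{FileCallback, HdfsWatcher}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hdfs.inotify.Event

object HdfsWatcherExample {

  class LoggingCallback extends FileCallback {
    def onStart(path: Path): Unit = println(s"existing file: $path")
    def onCreate(create: Event.CreateEvent): Unit = println(s"created: ${create.getPath}")
    def onAppend(append: Event.AppendEvent): Unit = println(s"appended: ${append.getPath}")
    def onRename(rename: Event.RenameEvent): Unit = println(s"renamed: ${rename.getSrcPath} -> ${rename.getDstPath}")
    def onClose(close: Event.CloseEvent): Unit = println(s"closed: ${close.getPath}")
  }

  def main(args: Array[String]): Unit = {
    implicit val conf = new Configuration()
    implicit val fs = FileSystem.get(conf)
    // Watch a hypothetical ingest directory for a minute, then stop polling.
    val watcher = new HdfsWatcher(new Path("hdfs:///data/incoming"), new LoggingCallback)
    Thread.sleep(60000)
    watcher.stop()
  }
}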
Example 59
Source File: CsvSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.csv import com.univocity.parsers.csv.CsvWriter import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} case class CsvSink(path: Path, overwrite: Boolean = false, headers: Header = Header.FirstRow, format: CsvFormat = CsvFormat(), ignoreLeadingWhitespaces: Boolean = false, ignoreTrailingWhitespaces: Boolean = false) (implicit conf: Configuration, fs: FileSystem) extends Sink { override def open(schema: StructType): SinkWriter = new CsvSinkWriter(schema, path, headers, format, ignoreLeadingWhitespaces, ignoreTrailingWhitespaces) def withOverwrite(overwrite: Boolean): CsvSink = copy(overwrite = overwrite) def withHeaders(headers: Header): CsvSink = copy(headers = headers) def withIgnoreLeadingWhitespaces(ignoreLeadingWhitespaces: Boolean): CsvSink = copy(ignoreLeadingWhitespaces = ignoreLeadingWhitespaces) def withIgnoreTrailingWhitespaces(ignoreTrailingWhitespaces: Boolean): CsvSink = copy(ignoreTrailingWhitespaces = ignoreTrailingWhitespaces) def withFormat(format: CsvFormat): CsvSink = copy(format = format) class CsvSinkWriter(schema: StructType, path: Path, headers: Header, format: CsvFormat, ignoreLeadingWhitespaces: Boolean = false, ignoreTrailingWhitespaces: Boolean = false) extends SinkWriter { private val lock = new AnyRef {} if (overwrite && fs.exists(path)) fs.delete(path, false) import scala.collection.JavaConverters._ private lazy val writer: CsvWriter = { val output = fs.create(path) val writer = CsvSupport.createWriter(output, format, ignoreLeadingWhitespaces, ignoreTrailingWhitespaces) headers match { case Header.FirstComment => writer.commentRow(schema.fieldNames().mkString(format.delimiter.toString())) case Header.FirstRow => writer.writeHeaders(schema.fieldNames().asJava) case _ => } writer } override def close(): Unit = writer.close() override def write(row: Row): Unit = { lock.synchronized { // nulls should be written as empty strings val array = row.values.map { case null => "" case other => other.toString } writer.writeRow(array: _*) } } } } object CsvSink { def apply(path: java.nio.file.Path) (implicit conf: Configuration, fs: FileSystem): CsvSink = CsvSink(new Path(path.toString)) }
Example 60
Source File: ReadParquetEEL.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.sql.Timestamp import io.eels.component.parquet.{ParquetSink, ParquetSource} import io.eels.datastream.DataStream import io.eels.schema.{ArrayType, DecimalType, Field, IntType, Precision, Scale, StringType, StructType, TimestampMillisType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} object ReadParquetEEL extends App { def readParquet(path: Path): Unit = { implicit val hadoopConfiguration = new Configuration() implicit val hadoopFileSystem = FileSystem.get(hadoopConfiguration) val rows = ParquetSource(parquetFilePath).toDataStream().collect rows.foreach(row => println(row)) } val parquetFilePath = new Path("file:///home/sam/development/person2.parquet") implicit val hadoopConfiguration = new Configuration() implicit val hadoopFileSystem = FileSystem.get(hadoopConfiguration) val friendStruct = Field.createStructField("FRIEND", Seq( Field("NAME", StringType), Field("AGE", IntType.Signed) ) ) val personDetailsStruct = Field.createStructField("PERSON_DETAILS", Seq( Field("NAME", StringType), Field("AGE", IntType.Signed), Field("SALARY", DecimalType(Precision(38), Scale(5))), Field("CREATION_TIME", TimestampMillisType) ) ) val friendType = StructType(friendStruct) val schema = StructType(personDetailsStruct, Field("FRIENDS", ArrayType(friendType), nullable = false)) val friends = Vector( Vector(Vector("John", 25)), Vector(Vector("Adam", 26)), Vector(Vector("Steven", 27)) ) val rows = Vector( Vector(Vector("Fred", 50, BigDecimal("50000.99000"), new Timestamp(System.currentTimeMillis())), friends) ) try { DataStream.fromValues(schema, rows).to(ParquetSink(parquetFilePath).withOverwrite(true)) } catch { case e: Exception => e.printStackTrace() } try { readParquet(parquetFilePath) } catch { case e: Exception => e.printStackTrace() } }
Example 61
Source File: FilePatternTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.nio.file.Files import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class FilePatternTest extends WordSpec with Matchers { implicit val fs = FileSystem.get(new Configuration()) "FilePattern" should { "detect single hdfs path without name server" ignore { FilePattern("hdfs:///mypath").toPaths() shouldBe List(new Path("hdfs:///mypath")) } "detect single hdfs path with name server" ignore { FilePattern("hdfs://nameserver/mypath").toPaths() shouldBe List(new Path("hdfs://nameserver/mypath")) } "detect absolute local file" in { FilePattern("file:///absolute/file").toPaths() shouldBe List(new Path("file:///absolute/file")) } "detect relative local file" in { FilePattern("file:///local/file").toPaths() shouldBe List(new Path("file:///local/file")) } "detect relative local file expansion" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } val hdfsPaths = files.map { it => new Path(it.toUri) } files.foreach(file => Files.createFile(file)) FilePattern(dir.toUri.toString() + "/*").toPaths().toSet shouldBe hdfsPaths.toSet files.foreach(Files.deleteIfExists) Files.deleteIfExists(dir) } //not working on windows "detect relative local file expansion with schema" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } val hdfsPaths = files.map { it => new Path(it.toUri) } files.foreach(file => Files.createFile(file)) FilePattern(dir.toUri.toString() + "/*").toPaths().toSet shouldBe hdfsPaths.toSet files.foreach(Files.deleteIfExists) Files.deleteIfExists(dir) } "use filter if supplied" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } files.foreach { it => Files.createFile(it) } val a = FilePattern(dir.toAbsolutePath().toString() + "/*") .withFilter(_.toString().endsWith("a")) .toPaths.toSet a shouldBe Set(new Path("file:///" + dir.resolve("a"))) files.foreach { it => Files.deleteIfExists(it) } Files.deleteIfExists(dir) } } }
Example 62
Source File: ListenerTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.util.concurrent.{CountDownLatch, TimeUnit} import io.eels.component.csv.{CsvSink, CsvSource} import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} import scala.util.Random class ListenerTest extends WordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.get(conf) val schema = StructType("a", "b", "c", "d", "e") val rows = List.fill(1000)(Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(10))) val ds = DataStream.fromRows(schema, rows) val path = new Path("listener_test.csv") "DataStream" should { "support user's listeners" in { val latch = new CountDownLatch(1000) fs.delete(path, false) ds.listener(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).to(CsvSink(path)) latch.await(20, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } "propagate errors in listeners" in { class TestSink extends Sink { override def open(schema: StructType): SinkWriter = new SinkWriter { override def close(): Unit = () override def write(row: Row): Unit = () } } try { ds.listener(new Listener { override def onNext(value: Row): Unit = sys.error("boom") override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).to(new TestSink) assert(false) } catch { case _: Throwable => } } } "Source.toDataStream" should { "call on next for each row" in { val latch = new CountDownLatch(1000) fs.delete(path, false) ds.to(CsvSink(path)) CsvSource(path).toDataStream(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).collect latch.await(5, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } "call on complete once finished" in { val latch = new CountDownLatch(1001) fs.delete(path, false) ds.to(CsvSink(path)) CsvSource(path).toDataStream(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = latch.countDown() }).collect latch.await(5, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } } }
Example 63
Source File: AvroSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.nio.file.Paths import com.typesafe.config.ConfigFactory import io.eels.schema.{Field, StructType} import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.scalatest.{Matchers, WordSpec} class AvroSourceTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) "AvroSource" should { "read schema" in { val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath) people.schema shouldBe StructType(Field("name", nullable = false), Field("job", nullable = false), Field("location", nullable = false)) } "read strings as java.lang.String when eel.avro.java.string is true" in { System.setProperty("eel.avro.java.string", "true") ConfigFactory.invalidateCaches() val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath).toDataStream().toSet people.map(_.values) shouldBe Set( List("clint eastwood", "actor", "carmel"), List("elton john", "musician", "pinner"), List("issac newton", "scientist", "heaven") ) System.setProperty("eel.avro.java.string", "false") ConfigFactory.invalidateCaches() } "read strings as utf8 when eel.avro.java.string is false" in { System.setProperty("eel.avro.java.string", "false") ConfigFactory.invalidateCaches() val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath).toDataStream().toSet people.map(_.values) shouldBe Set( List(new Utf8("clint eastwood"), new Utf8("actor"), new Utf8("carmel")), List(new Utf8("elton john"), new Utf8("musician"), new Utf8("pinner")), List(new Utf8("issac newton"), new Utf8("scientist"), new Utf8("heaven")) ) System.setProperty("eel.avro.java.string", "true") ConfigFactory.invalidateCaches() } } }
Example 64
Source File: AvroSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import io.eels.Row import io.eels.datastream.DataStream import io.eels.schema.{ArrayType, Field, MapType, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class AvroSinkTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val ds = DataStream.fromValues( StructType("name", "job", "location"), Seq( List("clint eastwood", "actor", "carmel"), List("elton john", "musician", "pinner"), List("issac newton", "scientist", "heaven") ) ) "AvroSink" should { "write to avro" in { val path = new Path("avro.test") fs.delete(path, false) ds.to(AvroSink(path)) fs.delete(path, false) } "support overwrite option" in { val path = new Path("overwrite_test", ".avro") fs.delete(path, false) ds.to(AvroSink(path)) ds.to(AvroSink(path).withOverwrite(true)) fs.delete(path, false) } "write lists and maps" in { val ds = DataStream.fromValues( StructType( Field("name"), Field("movies", ArrayType(StringType)), Field("characters", MapType(StringType, StringType)) ), Seq( List( "clint eastwood", List("fistful of dollars", "high plains drifters"), Map("preacher" -> "high plains", "no name" -> "good bad ugly") ) ) ) val path = new Path("array_map_avro", ".avro") fs.delete(path, false) ds.to(AvroSink(path)) AvroSource(path).toDataStream().collect shouldBe Seq( Row( ds.schema, Seq( "clint eastwood", List("fistful of dollars", "high plains drifters"), Map("preacher" -> "high plains", "no name" -> "good bad ugly") ) ) ) fs.delete(path, true) } } }
Example 65
Source File: JsonSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.json import io.eels.datastream.DataStream import io.eels.schema.{Field, StructType} import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class JsonSinkTest extends WordSpec with Matchers { val path = new Path("test.json") implicit val fs: FileSystem = FileSystem.get(new Configuration()) "JsonSink" should { "write multiple json docs to a file" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("location")) val ds = DataStream.fromValues( schema, Seq( Vector("sam", "aylesbury"), Vector("jam", "aylesbury"), Vector("ham", "buckingham") ) ) ds.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input should include("""{"name":"sam","location":"aylesbury"}""") input should include("""{"name":"jam","location":"aylesbury"}""") input should include("""{"name":"ham","location":"buckingham"}""") fs.delete(path, false) } "support arrays" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("skills")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Array("karate", "kung fu"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","skills":["karate","kung fu"]}""" fs.delete(path, false) } "support maps" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("locations")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Map("home" -> "boro", "work" -> "london"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","locations":{"home":"boro","work":"london"}}""" fs.delete(path, false) } "support structs" in { case class Foo(home: String, work: String) if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("locations")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Foo("boro", "london"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","locations":{"home":"boro","work":"london"}}""" fs.delete(path, false) } } }
Example 66
Source File: SequenceSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import io.eels.Row import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, WordSpec} class SequenceSourceTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private val schema = StructType(Field("name"), Field("location")) private val ds = DataStream.fromValues( schema, Seq( Vector("name", "location"), Vector("sam", "aylesbury"), Vector("jam", "aylesbury"), Vector("ham", "buckingham") ) ) "SequenceSource" should { "read sequence files" in { val schema = StructType( Field("a", StringType), Field("b", StringType), Field("c", StringType), Field("d", StringType) ) val path = new Path(getClass.getResource("/test.seq").getFile) val rows = SequenceSource(path).toDataStream().toSet rows shouldBe Set( Row(schema, "1", "2", "3", "4"), Row(schema, "5", "6", "7", "8") ) } "read header as schema" in { val path = new Path(getClass.getResource("/test.seq").getFile) SequenceSource(path).schema shouldBe StructType( Field("a", StringType), Field("b", StringType), Field("c", StringType), Field("d", StringType) ) } } }
Example 67
Source File: SequenceSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} import org.scalatest.{Matchers, WordSpec} class SequenceSinkTest extends WordSpec with Matchers { private val ds = DataStream.fromValues( StructType("a", "b", "c", "d"), Seq( List("1", "2", "3", "4"), List("5", "6", "7", "8") ) ) "SequenceSink" should { "write sequence files" in { implicit val conf = new Configuration implicit val fs = FileSystem.get(conf) val path = new Path("seqsink.seq") if (fs.exists(path)) fs.delete(path, true) ds.to(SequenceSink(path)) val reader = new SequenceFile.Reader(new Configuration, SequenceFile.Reader.file(path)) val k = new IntWritable val v = new BytesWritable val set = for (_ <- 1 to 3) yield { reader.next(k, v) new String(v.copyBytes) } set.toSet shouldBe Set( "a,b,c,d", "1,2,3,4", "5,6,7,8" ) reader.close() fs.delete(path, true) } } }
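Examples 66 and 67 show the read and write halves of the sequence file component separately. The sketch below combines them into one round trip, assuming the SequenceSink/SequenceSource API exactly as used in those tests; the object name and the roundtrip.seq path are hypothetical.

package io.eels.component.sequence

import io.eels.datastream.DataStream
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object SequenceRoundTripSketch extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)

  val path = new Path("roundtrip.seq") // hypothetical path
  if (fs.exists(path)) fs.delete(path, true)

  val schema = StructType("a", "b")
  val ds = DataStream.fromValues(schema, Seq(List("1", "2"), List("3", "4")))

  // the sink writes the header row first, then one record per row (see example 67)
  ds.to(SequenceSink(path))

  // the source reads the header back as the schema and returns the data rows (see example 66)
  SequenceSource(path).toDataStream().collect.foreach(println)

  fs.delete(path, true)
}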
Example 68
Source File: ParquetProjectionTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.{File, FilenameFilter} import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{FlatSpec, Matchers} class ParquetProjectionTest extends FlatSpec with Matchers { cleanUpResidualParquetTestFiles private val schema = StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) private val ds = DataStream.fromValues( schema, Seq( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) ) private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val file = new File(s"test_${System.currentTimeMillis()}.pq") file.deleteOnExit() private val path = new Path(file.toURI) if (fs.exists(path)) fs.delete(path, false) ds.to(ParquetSink(path).withOverwrite(true)) "ParquetSource" should "support projections" in { val rows = ParquetSource(path).withProjection("name").toDataStream().collect rows.map(_.values) shouldBe Vector(Vector("clint eastwood"), Vector("elton john")) } it should "return all data when no projection is set" in { val rows = ParquetSource(path).toDataStream().collect rows.map(_.values) shouldBe Vector(Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner")) } private def cleanUpResidualParquetTestFiles = { new File(".").listFiles(new FilenameFilter { override def accept(dir: File, name: String): Boolean = { (name.startsWith("test_") && name.endsWith(".pq")) || (name.startsWith(".test_") && name.endsWith(".pq.crc")) } }).foreach(_.delete()) } }
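The test above demonstrates column projections. The following standalone sketch distils the same idea, writing a small dataset and reading back a single column; it reuses only calls shown in the example (ParquetSink.withOverwrite, ParquetSource.withProjection), while the object name and file name are hypothetical.

package io.eels.component.parquet

import io.eels.datastream.DataStream
import io.eels.schema.{Field, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object ParquetProjectionSketch extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(new Configuration())

  val schema = StructType(Field("name", StringType), Field("job", StringType))
  val ds = DataStream.fromValues(schema, Seq(Vector("clint eastwood", "actor")))

  val path = new Path("projection_sketch.pq") // hypothetical file name
  ds.to(ParquetSink(path).withOverwrite(true))

  // only the requested column is returned by the reader
  val names = ParquetSource(path).withProjection("name").toDataStream().collect.map(_.values)
  println(names) // Vector(Vector(clint eastwood))

  fs.delete(path, false)
}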
Example 69
Source File: AvroAndParquetCrossCompatibilityTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{FlatSpec, Matchers} // tests that avro source/sink and avro parquet source/sink can write/read each others files class AvroAndParquetCrossCompatibilityTest extends FlatSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) "AvroParquetSource and ParquetSource" should "be compatible" in { val path = new Path("cross.pq") if (fs.exists(path)) fs.delete(path, false) val structType = StructType( Field("name", StringType, nullable = false), Field("location", StringType, nullable = false) ) val ds = DataStream.fromValues( structType, Seq( Vector("clint eastwood", "carmel"), Vector("elton john", "pinner") ) ) ds.to(ParquetSink(path)) AvroParquetSource(path).toDataStream().collect shouldBe ds.collect fs.delete(path, false) ds.to(AvroParquetSink(path)) ParquetSource(path).toDataStream().collect shouldBe ds.collect fs.delete(path, false) } }
Example 70
Source File: ParquetSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object ParquetSpeedTest extends App with Timed { ParquetLogMute() val size = 2000000 val schema = StructType("a", "b", "c", "d", "e") val createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) val ds = DataStream.fromIterator(schema, Iterator.continually(createRow).take(size)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val path = new Path("parquet_speed.pq") fs.delete(path, false) new File(path.toString).deleteOnExit() timed("Insertion") { ds.to(AvroParquetSink(path).withOverwrite(true)) } while (true) { timed("Reading with ParquetSource") { val actual = ParquetSource(path).toDataStream().size assert(actual == size) } println("") println("---------") println("") Thread.sleep(2000) timed("Reading with AvroParquetSource") { val actual = AvroParquetSource(path).toDataStream().size assert(actual == size) } } }
Example 71
Source File: ParquetMultipleFileSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.StructType import io.eels.{FilePattern, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object ParquetMultipleFileSpeedTest extends App with Timed { ParquetLogMute() val size = 5000000 val count = 20 val schema = StructType("a", "b", "c", "d", "e") def createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val dir = new Path("parquet-speed-test") new File(dir.toString).mkdirs() new File(dir.toString).listFiles().foreach(_.delete) timed("Insertion") { val ds = DataStream.fromRowIterator(schema, Iterator.continually(createRow).take(size)) ds.to(ParquetSink(new Path("parquet-speed-test/parquet_speed.pq")), count) } for (_ <- 1 to 25) { assert(count == FilePattern("parquet-speed-test/*").toPaths().size) timed("Reading with ParquetSource") { val actual = ParquetSource("parquet-speed-test/*").toDataStream().map { row => row }.filter(_ => true).size assert(actual == size, s"Expected $size but was $actual") } println("") println("---------") println("") } }
Example 72
Source File: AvroParquetSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class AvroParquetSinkTest extends WordSpec with Matchers { ParquetLogMute() private val schema = StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) private val ds = DataStream.fromValues( schema, Seq( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) ) private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path("test.pq") "ParquetSink" should { "write schema" in { if (fs.exists(path)) fs.delete(path, false) ds.to(AvroParquetSink(path)) val people = ParquetSource(path) people.schema shouldBe StructType( Field("name", StringType, false), Field("job", StringType, false), Field("location", StringType, false) ) fs.delete(path, false) } "write data" in { if (fs.exists(path)) fs.delete(path, false) ds.to(AvroParquetSink(path)) AvroParquetSource(path).toDataStream().toSet.map(_.values) shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) fs.delete(path, false) } "support overwrite" in { val path = new Path("overwrite_test.pq") fs.delete(path, false) val schema = StructType(Field("a", StringType)) val ds = DataStream.fromRows(schema, Row(schema, Vector("x")), Row(schema, Vector("y")) ) ds.to(AvroParquetSink(path)) ds.to(AvroParquetSink(path).withOverwrite(true)) fs.delete(path, false) } } }
Example 73
Source File: AvroParquetReaderFnTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.util.UUID import io.eels.component.avro.AvroSchemaFns import io.eels.component.parquet.avro.AvroParquetReaderFn import io.eels.schema.{DoubleType, Field, LongType, StructType} import org.apache.avro.SchemaBuilder import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.avro.AvroParquetWriter import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpec} class AvroParquetReaderFnTest extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path(UUID.randomUUID().toString()) override def afterAll(): Unit = { val fs = FileSystem.get(new Configuration()) fs.delete(path, false) } private val avroSchema = SchemaBuilder.record("com.chuckle").fields() .requiredString("str").requiredLong("looong").requiredDouble("dooble").endRecord() private val writer = AvroParquetWriter.builder[GenericRecord](path) .withSchema(avroSchema) .build() private val record = new GenericData.Record(avroSchema) record.put("str", "wibble") record.put("looong", 999L) record.put("dooble", 12.34) writer.write(record) writer.close() val schema = StructType(Field("str"), Field("looong", LongType(true), true), Field("dooble", DoubleType, true)) "AvroParquetReaderFn" should { "support projections on doubles" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("looong")))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("dooble") shouldBe 12.34 } "support projections on longs" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("str")))) val record = reader.read() reader.close() record.get("looong") shouldBe 999L } "support full projections" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("looong") shouldBe 999L record.get("dooble") shouldBe 12.34 } "support non projections" in { val reader = AvroParquetReaderFn(path, None, None) val group = reader.read() reader.close() group.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" group.get("looong") shouldBe 999L group.get("dooble") shouldBe 12.34 } } }
Example 74
Source File: DecimalWriterTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.schema.{DecimalType, Field, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.FunSuite import scala.math.BigDecimal.RoundingMode class DecimalWriterTest extends FunSuite { test("negativeDecimalTest") { implicit val configuration = new Configuration val expectedBigDecimals = Seq(BigDecimal(-5025176.39), BigDecimal(-5), BigDecimal(-999.56434), BigDecimal(-10000.9890)) assertBigDecimals("bigd_negative.parquet", expectedBigDecimals) } test("positiveDecimalTest") { implicit val configuration = new Configuration val expectedBigDecimals = Seq(BigDecimal(5025176.39), BigDecimal(5), BigDecimal(999.56434), BigDecimal(-10000.9890)) assertBigDecimals("bigd_positive.parquet", expectedBigDecimals) } private def assertBigDecimals(filename: String, expectedBigDecimals: Seq[BigDecimal])(implicit configuration: Configuration): Unit = { val schema = StructType(Field(name = "bd", dataType = DecimalType(38, 10))) val path = new Path(filename) val fileSystem = path.getFileSystem(configuration) if (fileSystem.exists(path)) fileSystem.delete(path, false) // Write out the decimal values val parquetWriter = RowParquetWriterFn(path = path, schema = schema, metadata = Map.empty, dictionary = false, roundingMode = RoundingMode.UP, fileSystem.getConf) expectedBigDecimals.foreach { expectedBigDecimal => println(s"Writing row with value $expectedBigDecimal") parquetWriter.write(Row.fromMap(schema, Map("bd" -> expectedBigDecimal))) } parquetWriter.close() // Read back all the writes and assert their values val parquetProjectionSchema = ParquetSchemaFns.toParquetMessageType(schema) val parquetReader = RowParquetReaderFn(path, None, Option(parquetProjectionSchema), dictionaryFiltering = true) for (i <- 0 until expectedBigDecimals.length) { val readRow = parquetReader.read println(s"read row: $readRow") assert(readRow.values.head == expectedBigDecimals(i)) } parquetReader.close() } }
Example 75
Source File: CsvSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.csv

import java.nio.file.Paths

import io.eels.schema.{Field, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.scalatest.{Matchers, WordSpec}

class CsvSourceTest extends WordSpec with Matchers {

  implicit val conf = new Configuration()
  implicit val fs = FileSystem.getLocal(conf)

  "CsvSource" should {
    "read schema" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).schema shouldBe StructType(
        Field("a", StringType, true),
        Field("b", StringType, true),
        Field("c", StringType, true)
      )
    }
    "support null cell value option as null" in {
      val file = getClass.getResource("/io/eels/component/csv/csvwithempty.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withNullValue(null).toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", null, "3"))
    }
    "support null cell value replacement value" in {
      val file = getClass.getResource("/io/eels/component/csv/csvwithempty.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withNullValue("foo").toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", "foo", "3"))
    }
    "read from path" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.FirstRow).toDataStream().size shouldBe 3
      CsvSource(path).withHeader(Header.None).toDataStream().size shouldBe 4
    }
    "allow specifying manual schema" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      val schema = StructType(
        Field("test1", StringType, true),
        Field("test2", StringType, true),
        Field("test3", StringType, true)
      )
      CsvSource(path).withSchema(schema).toDataStream().schema shouldBe schema
    }
    "support reading header" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.FirstRow).toDataStream().collect.map(_.values).toSet shouldBe Set(Vector("e", "f", "g"), Vector("1", "2", "3"), Vector("4", "5", "6"))
    }
    "support skipping header" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.None).toDataStream().toSet.map(_.values) shouldBe Set(Vector("a", "b", "c"), Vector("e", "f", "g"), Vector("1", "2", "3"), Vector("4", "5", "6"))
    }
    "support delimiters" in {
      val file = getClass.getResource("/io/eels/component/csv/psv.psv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withDelimiter('|').toDataStream().collect.map(_.values).toSet shouldBe Set(Vector("e", "f", "g"))
      CsvSource(path).withDelimiter('|').withHeader(Header.None).toDataStream().toSet.map(_.values) shouldBe Set(Vector("a", "b", "c"), Vector("e", "f", "g"))
    }
    "support comments for headers" in {
      val file = getClass.getResource("/io/eels/component/csv/comments.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.FirstComment).schema shouldBe StructType(
        Field("a", StringType, true),
        Field("b", StringType, true),
        Field("c", StringType, true)
      )
      CsvSource(path).withHeader(Header.FirstComment).toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", "2", "3"), Vector("e", "f", "g"), Vector("4", "5", "6"))
    }
    "terminate if asking for first comment but no comments" in {
      val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.FirstComment).schema shouldBe StructType(
        Field("", StringType, true)
      )
    }
    "support skipping corrupt rows" ignore {
      val file = getClass.getResource("/io/eels/component/csv/corrupt.csv").toURI()
      val path = Paths.get(file)
      CsvSource(path).withHeader(Header.FirstRow).toDataStream().toVector.map(_.values) shouldBe Vector(Vector("1", "2", "3"))
    }
  }
}
Example 76
Source File: Main.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object Main extends App { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf // the first parameter determines the command to run, just like in git, eg git pull, or in hadoop, eg hadoop fs val command = args.head val params = args.tail command match { case "schema" => ShowSchemaMain(params) case "stream" => StreamMain(params) case "apply-spec" => ApplySpecMain(params) case "fetch-spec" => FetchSpecMain(params) case "analyze" => AnalyzeMain(params) case other => System.err.println(s"Unknown command $other") } } case class Options(from: String = "", to: String = "", workerThreads: Int = 1, sourceIOThreads: Int = 1)
Example 77
Source File: FetchSpecMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import io.eels.component.hive.{HiveSource, HiveSpec} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object FetchSpecMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel fetch-spec", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() source match { case hive: HiveSource => val spec = hive.spec val json = HiveSpec.writeAsJson(spec.copy(tables = spec.tables.filter(_.tableName == hive.tableName))) println(json) case _ => sys.error(s"Unsupported source $source") } case _ => } } case class Options(source: String = null) }
Example 78
Source File: ApplySpecMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import java.nio.file.{Path, Paths} import io.eels.{Constants, SourceParser} import io.eels.component.hive.{HiveOps, HiveSource, HiveSpec} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient object ApplySpecMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf implicit val client = new HiveMetaStoreClient(hiveConf) def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel apply-spec", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" opt[String]("spec") required() action { (schema, o) => o.copy(specPath = Paths.get(schema)) } text "specify path to eel spec" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() source match { case hive: HiveSource => HiveOps.applySpec(HiveSpec(options.specPath), false) case _ => sys.error(s"Unsupported source $source") } case _ => } } case class Options(source: String = null, specPath: Path = null) }
Example 79
Source File: ShowSchemaMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import io.eels.component.avro.AvroSchemaFn import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object ShowSchemaMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel schema", Constants.EelVersion) opt[String]("source") required() action { (source, o) => o.copy(source = source) } text "specify source, eg hive:database:table or parquet:/path/to/file" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() val schema = source.schema val avroSchema = AvroSchemaFn.toAvro(schema) out.println(avroSchema) case _ => } } case class Options(source: String = "") }
Example 80
Source File: AnalyzeMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object AnalyzeMain { import scala.concurrent.ExecutionContext.Implicits.global implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel analyze", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" opt[Boolean]("reverse") optional() action { (reverse, o) => o.copy(reverse = reverse) } text "specify reverse ordering of columns, eg most distinct first" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val result = builder().counts.toSeq.sortBy(_._2.size) val orderedResults = if (options.reverse) result.reverse else result for ((columnName, columnCounts) <- orderedResults) { println(columnName) for ((value, counts) <- columnCounts) { println(s"\t$value ($counts)") } } case _ => } } case class Options(source: String = null, reverse: Boolean = false) }
Example 81
Source File: StreamMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import io.eels.{Constants, Sink, SinkParser, SourceParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object StreamMain { import scala.concurrent.ExecutionContext.Implicits.global implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String]): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel", Constants.EelVersion) opt[String]("source") required() action { (source, o) => o.copy(from = source) } text "specify source, eg hive:database:table" opt[String]("sink") required() action { (sink, o) => o.copy(to = sink) } text "specify sink, eg hive:database:table" opt[Int]("sourceThreads") optional() action { (threads, options) => options.copy(sourceIOThreads = threads) } text "number of source io threads, defaults to 1" opt[Int]("workerThreads") optional() action { (threads, options) => options.copy(workerThreads = threads) } text "number of worker threads, defaults to 1" } parser.parse(args, Options()) match { case Some(options) => val sourceBuilder = SourceParser(options.from).orNull val source = sourceBuilder() val sinkBuilder = SinkParser(options.to).orNull val sink = sinkBuilder() val result = source.toFrame(options.sourceIOThreads).to(sink) println(s"Completed with $result rows") case _ => } } }
Example 82
Source File: HbaseTests.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hbase import java.nio.file.Paths import java.util.UUID import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase._ import org.apache.hadoop.hdfs.MiniDFSCluster trait HbaseTests { val MINI_CLUSTER_ROOT = "miniclusters" def startHBaseCluster(clusterName: String): MiniHBaseCluster = { // Setup the underlying HDFS mini cluster for HBASE mini cluster System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA) val clusterFolder = s"${clusterName}_${UUID.randomUUID().toString}" val clusterPath = Paths.get(MINI_CLUSTER_ROOT, clusterFolder) val conf = new Configuration() conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, clusterPath.toAbsolutePath.toString) val miniDFSCluster = new MiniDFSCluster.Builder(conf).build() // Now setup and start the HBASE mini cluster val hBaseTestingUtility = new HBaseTestingUtility hBaseTestingUtility.setDFSCluster(miniDFSCluster) hBaseTestingUtility.startMiniCluster(1, 1) val cluster = hBaseTestingUtility.getHBaseCluster cluster.waitForActiveAndReadyMaster() cluster } }
Example 83
Source File: HiveFilePublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.io.Using import io.eels.datastream.{Subscription, Publisher, Subscriber} import io.eels.schema.{Partition, StructType} import io.eels.{Predicate, _} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus} class HiveFilePublisher(dialect: HiveDialect, file: LocatedFileStatus, metastoreSchema: StructType, projectionSchema: StructType, predicate: Option[Predicate], partition: Partition) (implicit fs: FileSystem, conf: Configuration) extends Publisher[Seq[Row]] with Using { require(projectionSchema.fieldNames.forall { it => it == it.toLowerCase() }, s"Use only lower case field names with hive") override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { val partitionMap: Map[String, Any] = partition.entries.map { it => (it.key, it.value) }.toMap // the schema we send to the dialect must have any partition fields removed, because those // fields won't exist in the data files. This is because partitions are not always written // and instead inferred from the partition itself. val projectionFields = projectionSchema.fields.filterNot(field => partition.containsKey(field.name)) val projectionWithoutPartitions = StructType(projectionFields) // since we removed the partition fields from the target schema, we must repopulate them after the read // we also need to throw away the dummy field if we had an empty schema val publisher = dialect.input(file.getPath, metastoreSchema, projectionWithoutPartitions, predicate) publisher.subscribe(new Subscriber[Seq[Row]] { override def subscribed(s: Subscription): Unit = subscriber.subscribed(s) override def next(chunk: Seq[Row]): Unit = { val aligned = chunk.map { row => if (projectionFields.isEmpty) { val values = projectionSchema.fieldNames().map(partitionMap.apply) Row(projectionSchema, values.toVector) } else { RowUtils.rowAlign(row, projectionSchema, partitionMap) } } subscriber.next(aligned) } override def completed(): Unit = subscriber.completed() override def error(t: Throwable): Unit = subscriber.error(t) }) } }
Example 84
Source File: HiveStats.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import io.eels.schema.PartitionConstraint import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.ParquetFileReader import scala.collection.JavaConverters._ trait HiveStats { // total number of records def count: Long = count(Nil) // total number of records in the partitions that match the constraints def count(constraints: Seq[PartitionConstraint]): Long // returns the minimum value of this field def min(field: String): Any = min(field, Nil) // returns the maximum value of this field def max(field: String): Any = max(field, Nil) // returns the minimum value of this field for the partitions that match the constraints def min(field: String, constraints: Seq[PartitionConstraint]): Any // returns the maximum value of this field for the partitions that match the constraints def max(field: String, constraints: Seq[PartitionConstraint]): Any } class ParquetHiveStats(dbName: String, tableName: String, table: HiveTable) (implicit fs: FileSystem, conf: Configuration, client: IMetaStoreClient) extends HiveStats with Logging { private val ops = new HiveOps(client) private def count(path: Path) = { val blocks = ParquetFileReader.readFooter(fs.getConf, path, ParquetMetadataConverter.NO_FILTER).getBlocks.asScala blocks.map(_.getRowCount).sum } override def count(constraints: Seq[PartitionConstraint]): Long = { val counts = HiveTableFilesFn(dbName, tableName, table.location, constraints) .flatMap(_._2) .map(_.getPath).map(count) if (counts.isEmpty) 0 else counts.sum } private def minmax(field: String, constraints: Seq[PartitionConstraint]): (Any, Any) = { def stats[T]: (Any, Any) = { def min(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) <= 0) a else b } def max(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) >= 0) a else b } val location = new Path(ops.location(dbName, tableName)) val (mins, maxes) = HiveTableFilesFn(dbName, tableName, location, constraints).toSeq.flatMap { case (_, files) => logger.debug(s"Calculating min,max in file $files") files.flatMap { file => val footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.map { block => val column = block.getColumns.asScala.find(_.getPath.toDotString == field).getOrError(s"Unknown column $field") val min = column.getStatistics.genericGetMin.asInstanceOf[Comparable[T]] val max = column.getStatistics.genericGetMax.asInstanceOf[Comparable[T]] (min, max) } } }.unzip (min(mins), max(maxes)) } stats[Any] } override def min(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._1 override def max(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._2 }
Example 85
Source File: ParquetHiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.dialect import java.util.concurrent.atomic.AtomicInteger import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.hive.{HiveDialect, HiveOps, HiveOutputStream} import io.eels.component.parquet._ import io.eels.component.parquet.util.{ParquetIterator, ParquetLogMute} import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe import org.apache.hadoop.hive.ql.io.parquet.{MapredParquetInputFormat, MapredParquetOutputFormat} import scala.math.BigDecimal.RoundingMode.RoundingMode case class ParquetHiveDialect(options: ParquetWriteOptions = ParquetWriteOptions()) extends HiveDialect with Logging with Using { override val serde: String = classOf[ParquetHiveSerDe].getCanonicalName override val inputFormat: String = classOf[MapredParquetInputFormat].getCanonicalName override val outputFormat: String = classOf[MapredParquetOutputFormat].getCanonicalName override def input(path: Path, ignore: StructType, projectionSchema: StructType, predicate: Option[Predicate]) (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] { val client = new HiveMetaStoreClient(new HiveConf) val ops = new HiveOps(client) override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { // convert the eel projection schema into a parquet schema which will be used by the native parquet reader try { val parquetProjectionSchema = ParquetSchemaFns.toParquetMessageType(projectionSchema) using(RowParquetReaderFn(path, predicate, parquetProjectionSchema.some, true)) { reader => val subscription = new Subscription { override def cancel(): Unit = reader.close() } subscriber.subscribed(subscription) ParquetIterator(reader).grouped(DataStream.DefaultBatchSize).foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } } override def output(schema: StructType, path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String]) (implicit fs: FileSystem, conf: Configuration): HiveOutputStream = { val path_x = path new HiveOutputStream { ParquetLogMute() private val _records = new AtomicInteger(0) logger.debug(s"Creating parquet writer at $path") private val writer = RowParquetWriterFn(path, schema, metadata, true, roundingMode, fs.getConf) override def write(row: Row) { require(row.values.nonEmpty, "Attempting to write an empty row") writer.write(row) _records.incrementAndGet() } override def close(): Unit = { logger.debug(s"Closing hive parquet writer $path") writer.close() // after the files are closed, we should set permissions if we've been asked to, this allows // all the files we create to stay consistent permission.foreach(fs.setPermission(path, _)) } override def records: Int = _records.get() override def path: Path = path_x } } }
Example 86
Source File: OrcHiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.dialect import com.sksamuel.exts.Logging import io.eels.component.hive.{HiveDialect, HiveOutputStream} import io.eels.component.orc.{OrcPublisher, OrcWriteOptions, OrcWriter} import io.eels.datastream.{Publisher, Subscriber} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.io.orc.{OrcInputFormat, OrcOutputFormat, OrcSerde} import scala.math.BigDecimal.RoundingMode.RoundingMode case class OrcHiveDialect(options: OrcWriteOptions = OrcWriteOptions()) extends HiveDialect with Logging { override val serde: String = classOf[OrcSerde].getCanonicalName override val inputFormat: String = classOf[OrcInputFormat].getCanonicalName override val outputFormat: String = classOf[OrcOutputFormat].getCanonicalName override def input(path: Path, metastoreSchema: StructType, projectionSchema: StructType, predicate: Option[Predicate]) (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { new OrcPublisher(path, projectionSchema.fieldNames(), predicate).subscribe(subscriber) } } override def output(schema: StructType, path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String])(implicit fs: FileSystem, conf: Configuration): HiveOutputStream = { val path_x = path val writer = new OrcWriter(path, schema, options) new HiveOutputStream { override def write(row: Row): Unit = { require(row.values.nonEmpty, "Attempting to write an empty row") writer.write(row) } override def close(): Unit = { writer.close() permission.foreach(fs.setPermission(path, _)) } override def records: Int = writer.records override def path: Path = path_x } } }
Example 87
Source File: HiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import io.eels.component.hive.dialect.{OrcHiveDialect, ParquetHiveDialect} import io.eels.datastream.Publisher import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.api.Table import scala.math.BigDecimal.RoundingMode.RoundingMode trait HiveDialect extends Logging { def serde: String def inputFormat: String def outputFormat: String def output(schema: StructType, // schema without partition information path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String]) (implicit fs: FileSystem, conf: Configuration): HiveOutputStream def stats(getPath: Path)(implicit fs: FileSystem): Long = throw new UnsupportedOperationException } object HiveDialect extends Logging { def apply(format: String): HiveDialect = format match { case input if input.contains("ParquetInputFormat") => ParquetHiveDialect() case input if input.contains("OrcInputFormat") => OrcHiveDialect() //case input if input.contains("AvroHiveDialect") || input.contains("AvroContainerInputFormat") => AvroHiveDialect // "org.apache.hadoop.mapred.TextInputFormat" -> TextHiveDialect case _ => throw new UnsupportedOperationException(s"Unknown hive input format $format") } def apply(table: Table): HiveDialect = { val format = table.getSd.getInputFormat logger.debug(s"Table format is $format") val dialect = HiveDialect(format) logger.debug(s"HiveDialect is $dialect") dialect } }
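HiveDialect.apply selects a dialect by matching on the table's input format class name, as the pattern match above shows. The sketch below illustrates that selection directly; the fully qualified class names are taken from the imports in the two dialect files shown earlier, and the object name is hypothetical.

package io.eels.component.hive

import io.eels.component.hive.dialect.{OrcHiveDialect, ParquetHiveDialect}

object HiveDialectSketch extends App {
  // the factory matches on substrings of the input format class name
  val parquet = HiveDialect("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
  val orc = HiveDialect("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")

  assert(parquet.isInstanceOf[ParquetHiveDialect])
  assert(orc.isInstanceOf[OrcHiveDialect])

  // any other format string falls through to the UnsupportedOperationException branch above
}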
Example 88
Source File: ParquetVsOrcSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.io.File import java.math.MathContext import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.orc.{OrcSink, OrcSource} import io.eels.component.parquet.{ParquetSink, ParquetSource} import io.eels.datastream.DataStream import io.eels.schema._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.math.BigDecimal.RoundingMode import scala.util.Random object ParquetVsOrcSpeedTest extends App with Timed { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val size = 5000000 val structType = StructType( Field("name", StringType), Field("age", IntType.Signed), Field("height", DoubleType), Field("amazing", BooleanType), Field("fans", LongType.Signed), Field("rating", DecimalType(4, 2)) ) def iter: Iterator[Vector[Any]] = Iterator.continually(Vector( Random.nextString(10), Random.nextInt(), Random.nextDouble(), Random.nextBoolean(), Random.nextLong(), BigDecimal(Random.nextDouble(), new MathContext(4)).setScale(2, RoundingMode.UP) )) def ds: DataStream = DataStream.fromIterator(structType, iter.take(size).map(Row(structType, _))) val ppath = new Path("parquet_speed.pq") fs.delete(ppath, false) val opath = new Path("orc_speed.orc") fs.delete(opath, false) new File(ppath.toString).deleteOnExit() new File(opath.toString).deleteOnExit() timed("Orc Insertion") { ds.to(OrcSink(opath)) } timed("Parquet Insertion") { ds.to(ParquetSink(ppath)) } while (true) { timed("Reading with OrcSource") { val actual = OrcSource(opath).toDataStream().size assert(actual == size, s"$actual != $size") } timed("Reading with ParquetSource") { val actual = ParquetSource(ppath).toDataStream().size assert(actual == size, s"$actual != $size") } } }
Example 89
Source File: HiveTableFilesFnTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.nio.file.Paths import com.sksamuel.exts.Logging import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hdfs.MiniDFSCluster import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.Table import org.scalatest.mockito.MockitoSugar import org.scalatest.{FlatSpec, Matchers} class HiveTableFilesFnTest extends FlatSpec with Matchers with Logging with MockitoSugar { System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA) val clusterPath = Paths.get("miniclusters", "cluster") val conf = new Configuration() conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, clusterPath.toAbsolutePath.toString) val cluster = new MiniDFSCluster.Builder(conf).build() implicit val fs = cluster.getFileSystem "HiveTableFilesFn" should "detect all files in root when no partitions" in { implicit val client = mock[IMetaStoreClient] org.mockito.Mockito.when(client.getTable("default", "mytable")).thenReturn(new Table) val root = new Path("tab1") fs.mkdirs(root) // table scanner will skip 0 length files val a = fs.create(new Path(root, "a")) a.write(1) a.close() val b = fs.create(new Path(root, "b")) b.write(1) b.close() HiveTableFilesFn("default", "mytable", fs.resolvePath(root), Nil).values.flatten.map(_.getPath.getName).toSet shouldBe Set("a", "b") } it should "ignore hidden files in root when no partitions" in { implicit val client = mock[IMetaStoreClient] org.mockito.Mockito.when(client.getTable("default", "mytable")).thenReturn(new Table) val root = new Path("tab2") fs.mkdirs(root) // table scanner will skip 0 length files val a = fs.create(new Path(root, "a")) a.write(1) a.close() val b = fs.create(new Path(root, "_b")) b.write(1) b.close() HiveTableFilesFn("default", "mytable", fs.resolvePath(root), Nil).values.flatten.map(_.getPath.getName).toSet shouldBe Set("a") } }
Example 90
Source File: HiveBenchmarkApp.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.util.UUID import com.sksamuel.exts.metrics.Timed import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import scala.util.Random object HiveBenchmarkApp extends App with Timed { val states = List( "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming").map(_.replace(' ', '_').toLowerCase) import HiveConfig._ val schema = StructType("id", "state") val rows = List.fill(1000000)(List(UUID.randomUUID.toString, states(Random.nextInt(50)))) logger.info(s"Generated ${rows.size} rows") new HiveOps(client).createTable( "sam", "people", schema, List("state"), overwrite = true ) logger.info("Table created") val sink = HiveSink("sam", "people") DataStream.fromValues(schema, rows).to(sink) logger.info("Write complete") while (true) { timed("datastream took") { val result = HiveSource("sam", "people").toDataStream().collect println(result.size) } } }
Example 91
Source File: OrcWriter.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.util.concurrent.atomic.AtomicInteger import java.util.function.IntUnaryOperator import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.Row import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.exec.vector.ColumnVector import org.apache.orc.{OrcConf, OrcFile, TypeDescription} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer // performs the actual write out of orc data, to be used by an orc sink class OrcWriter(path: Path, structType: StructType, options: OrcWriteOptions)(implicit conf: Configuration) extends Logging { private val schema: TypeDescription = OrcSchemaFns.toOrcSchema(structType) logger.trace(s"Creating orc writer for schema $schema") private val batchSize = { val size = ConfigFactory.load().getInt("eel.orc.sink.batchSize") Math.max(Math.min(1024, size), 1) } logger.debug(s"Orc writer will use batchsize=$batchSize") private val buffer = new ArrayBuffer[Row](batchSize) private val serializers = schema.getChildren.asScala.map(OrcSerializer.forType).toArray private val batch = schema.createRowBatch(batchSize) OrcConf.COMPRESSION_STRATEGY.setString(conf, options.compressionStrategy.name) OrcConf.COMPRESS.setString(conf, options.compressionKind.name) options.encodingStrategy.map(_.name).foreach(OrcConf.ENCODING_STRATEGY.setString(conf, _)) options.compressionBufferSize.foreach(OrcConf.BUFFER_SIZE.setLong(conf, _)) private val woptions = OrcFile.writerOptions(conf).setSchema(schema) options.rowIndexStride.foreach { size => woptions.rowIndexStride(size) logger.debug(s"Using stride size = $size") } if (options.bloomFilterColumns.nonEmpty) { woptions.bloomFilterColumns(options.bloomFilterColumns.mkString(",")) logger.debug(s"Using bloomFilterColumns = $options.bloomFilterColumns") } private lazy val writer = OrcFile.createWriter(path, woptions) private val counter = new AtomicInteger(0) def write(row: Row): Unit = { buffer.append(row) if (buffer.size == batchSize) flush() } def records: Int = counter.get() def flush(): Unit = { def writecol[T <: ColumnVector](rowIndex: Int, colIndex: Int, row: Row): Unit = { val value = row.values(colIndex) val vector = batch.cols(colIndex).asInstanceOf[T] val serializer = serializers(colIndex).asInstanceOf[OrcSerializer[T]] serializer.writeToVector(rowIndex, vector, value) } // don't use foreach here, using old school for loops for perf for (rowIndex <- buffer.indices) { val row = buffer(rowIndex) for (colIndex <- batch.cols.indices) { writecol(rowIndex, colIndex, row) } } batch.size = buffer.size writer.addRowBatch(batch) counter.updateAndGet(new IntUnaryOperator { override def applyAsInt(operand: Int): Int = operand + batch.size }) buffer.clear() batch.reset() } def close(): Long = { if (buffer.nonEmpty) flush() writer.close() val count = writer.getNumberOfRows logger.info(s"Orc writer wrote $count rows") count } }
Example 92
Source File: OrcBatchIterator.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import com.sksamuel.exts.Logging import io.eels.{Predicate, Row} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector import org.apache.orc.Reader object OrcBatchIterator extends Logging { def apply(reader: Reader, fileSchema: StructType, projection: Seq[String], predicate: Option[Predicate]) (implicit conf: Configuration): Iterator[Seq[Row]] = new Iterator[Seq[Row]] { val options = new Reader.Options() // if we have a projection then we need to return a schema that matches // the projection and not the full file schema val schema = if (projection.isEmpty) fileSchema else { val fields = projection.flatMap(name => fileSchema.field(name)) StructType(fields) } logger.trace(s"Orc read will use projection=$schema") // a projection is column index based, so the given projection columns must be // resolved against the file schema to work out which column indices are required if (projection.nonEmpty) { // we have to include a true for the containing struct itself val includes = true +: fileSchema.fieldNames.map(projection.contains) logger.debug(s"Setting included columns=${includes.mkString(",")}") options.include(includes.toArray) } val searchArg = predicate.foreach { predicate => val searchArg = OrcPredicateBuilder.build(predicate) options.searchArgument(searchArg, predicate.fields.toArray) logger.info(s"Setting predicate=$searchArg") } // if true then the predicate is applied to rows as well as being pushed down into the stripes, // this is because orc will either skip a stripe or return the whole stripe. // it is useful to disable for unit testing val rowLevelFilter = conf.get("eel.orc.predicate.row.filter", "true") != "false" logger.debug(s"Row level filtering = $rowLevelFilter") val batch = reader.getSchema().createRowBatch() val rows = reader.rows(options) val vector = new StructColumnVector(batch.numCols, batch.cols: _*) val projectionIndices = schema.fields.map(fileSchema.indexOf) val deserializer = new StructDeserializer(schema.fields, projectionIndices) override def hasNext(): Boolean = rows.nextBatch(batch) && !batch.endOfFile && batch.size > 0 override def next(): Seq[Row] = { val rows = Vector.newBuilder[Row] for (rowIndex <- 0 until batch.size) { val values = deserializer.readFromVector(rowIndex, vector) val row = Row(schema, values) if (rowLevelFilter && predicate.isDefined) { if (predicate.get.eval(row)) { rows += row } } else { rows += row } } batch.reset() rows.result() } } }
Example 93
Source File: OrcSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.orc.OrcFile.ReaderOptions import org.apache.orc._ import scala.collection.JavaConverters._ object OrcSource { def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): OrcSource = apply(FilePattern(path)) def apply(str: String)(implicit fs: FileSystem, conf: Configuration): OrcSource = apply(FilePattern(str)) } case class OrcSource(pattern: FilePattern, projection: Seq[String] = Nil, predicate: Option[Predicate] = None) (implicit fs: FileSystem, conf: Configuration) extends Source with Using { override def parts(): Seq[Publisher[Seq[Row]]] = pattern.toPaths().map(new OrcPublisher(_, projection, predicate)) def withPredicate(predicate: Predicate): OrcSource = copy(predicate = predicate.some) def withProjection(first: String, rest: String*): OrcSource = withProjection(first +: rest) def withProjection(fields: Seq[String]): OrcSource = { require(fields.nonEmpty) copy(projection = fields.toList) } override def schema: StructType = { val reader = OrcFile.createReader(pattern.toPaths().head, new ReaderOptions(conf)) val schema = reader.getSchema OrcSchemaFns.fromOrcType(schema).asInstanceOf[StructType] } private def reader() = { val options = new ReaderOptions(conf) OrcFile.createReader(pattern.toPaths().head, options) } def count(): Long = reader().getNumberOfRows def statistics(): Seq[ColumnStatistics] = reader().getStatistics.toVector def stripes(): Seq[StripeInformation] = reader().getStripes.asScala def stripeStatistics(): Seq[StripeStatistics] = reader().getStripeStatistics.asScala } class OrcPublisher(path: Path, projection: Seq[String], predicate: Option[Predicate])(implicit conf: Configuration) extends Publisher[Seq[Row]] { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { val reader = OrcFile.createReader(path, new ReaderOptions(conf)) val fileSchema = OrcSchemaFns.fromOrcType(reader.getSchema).asInstanceOf[StructType] val iterator: Iterator[Row] = OrcBatchIterator(reader, fileSchema, projection, predicate).flatten val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) iterator.grouped(DataStream.DefaultBatchSize).takeWhile(_ => running.get).foreach(subscriber.next) subscriber.completed() } catch { case t: Throwable => subscriber.error(t) } } }
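A minimal standalone sketch of reading ORC with a projection and a pushed-down predicate, built only from the OrcSource API above and the Predicate helpers used in the OrcPredicateTest example below; the people.orc path and the object name are hypothetical.

package io.eels.component.orc

import io.eels.Predicate
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object OrcSourceSketch extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(new Configuration())

  val path = new Path("people.orc") // hypothetical ORC file written earlier

  // read two columns; the predicate is converted to an ORC SearchArgument by OrcBatchIterator above
  val rows = OrcSource(path)
    .withProjection("name", "age")
    .withPredicate(Predicate.gt("age", 30L))
    .toDataStream()
    .collect

  rows.foreach(println)

  // file-level metadata is also available directly from the source
  println(OrcSource(path).count())
}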
Example 94
Source File: OrcSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.config.ConfigSupport import com.typesafe.config.ConfigFactory import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.orc.OrcFile.{CompressionStrategy, EncodingStrategy} import org.apache.orc.OrcProto.CompressionKind case class OrcWriteOptions(overwrite: Boolean = false, compressionKind: CompressionKind, compressionStrategy: CompressionStrategy, compressionBufferSize: Option[Int], encodingStrategy: Option[EncodingStrategy], bloomFilterColumns: Seq[String] = Nil, permission: Option[FsPermission] = None, inheritPermissions: Option[Boolean] = None, rowIndexStride: Option[Int] = None) { def withCompressionKind(kind: CompressionKind): OrcWriteOptions = copy(compressionKind = kind) def withCompressionStrategy(strategy: CompressionStrategy): OrcWriteOptions = copy(compressionStrategy = strategy) def withCompressionBufferSize(size: Int): OrcWriteOptions = copy(compressionBufferSize = size.some) def withEncodingStrategy(strategy: EncodingStrategy): OrcWriteOptions = copy(encodingStrategy = strategy.some) def withBloomFilterColumns(bloomFilterColumns: Seq[String]): OrcWriteOptions = copy(bloomFilterColumns = bloomFilterColumns) def withRowIndexStride(stride: Int): OrcWriteOptions = copy(rowIndexStride = stride.some) def withOverwrite(overwrite: Boolean): OrcWriteOptions = copy(overwrite = overwrite) def withPermission(permission: FsPermission): OrcWriteOptions = copy(permission = permission.some) def withInheritPermission(inheritPermissions: Boolean): OrcWriteOptions = copy(inheritPermissions = inheritPermissions.some) } object OrcWriteOptions extends ConfigSupport { // creates a config from the typesafe reference.confs def apply(): OrcWriteOptions = { val config = ConfigFactory.load() OrcWriteOptions( false, CompressionKind valueOf config.getString("eel.orc.writer.compression-kind"), CompressionStrategy valueOf config.getString("eel.orc.writer.compression-strategy"), config.getIntOpt("eel.orc.writer.compression-buffer-size"), config.getStringOpt("eel.orc.writer.encoding-strategy").map(EncodingStrategy.valueOf) ) } } case class OrcSink(path: Path, options: OrcWriteOptions = OrcWriteOptions()) (implicit fs: FileSystem, conf: Configuration) extends Sink with Logging { // -- convenience options -- def withCompressionKind(kind: CompressionKind): OrcSink = copy(options = options.copy(compressionKind = kind)) def withCompressionStrategy(strategy: CompressionStrategy): OrcSink = copy(options = options.copy(compressionStrategy = strategy)) def withCompressionBufferSize(size: Int): OrcSink = copy(options = options.copy(compressionBufferSize = size.some)) def withEncodingStrategy(strategy: EncodingStrategy): OrcSink = copy(options = options.copy(encodingStrategy = strategy.some)) def withBloomFilterColumns(bloomFilterColumns: Seq[String]): OrcSink = copy(options = options.copy(bloomFilterColumns = bloomFilterColumns)) def withRowIndexStride(stride: Int): OrcSink = copy(options = options.copy(rowIndexStride = stride.some)) def withOverwrite(overwrite: Boolean): OrcSink = copy(options = options.copy(overwrite = overwrite)) def withPermission(permission: FsPermission): OrcSink = copy(options = options.copy(permission = permission.some)) def withInheritPermission(inheritPermissions: Boolean): OrcSink = 
copy(options = options.copy(inheritPermissions = inheritPermissions.some)) override def open(schema: StructType, n: Int): Seq[SinkWriter] = { if (n == 1) Seq(create(schema, path)) else List.tabulate(n) { k => create(schema, new Path(path.getParent, path.getName + "_" + k)) } } override def open(schema: StructType): SinkWriter = create(schema, path) private def create(schema: StructType, path: Path): SinkWriter = new SinkWriter { if (options.overwrite && fs.exists(path)) fs.delete(path, false) val writer = new OrcWriter(path, schema, options) override def write(row: Row): Unit = writer.write(row) override def close(): Unit = { writer.close() options.permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (options.inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } }
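A minimal usage sketch for the sink above (the output path and compression choice are illustrative, and ds stands for an io.eels DataStream built elsewhere):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.orc.OrcProto.CompressionKind

implicit val conf = new Configuration()
implicit val fs = FileSystem.getLocal(conf)

// Each withXxx call returns a copy of the sink with the option applied
val sink = OrcSink(new Path("people.orc"))
  .withCompressionKind(CompressionKind.ZLIB)
  .withOverwrite(true)

// ds is assumed to exist; to(sink) writes the stream through OrcWriter as shown above
ds.to(sink)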
Example 95
Source File: OrcMultipleFileSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.datastream.DataStream import io.eels.schema.StructType import io.eels.{FilePattern, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object OrcMultipleFileSpeedTest extends App with Timed { val size = 5000000 val count = 20 val schema = StructType("a", "b", "c", "d", "e") def createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val dir = new Path("orc-speed-test") new File(dir.toString).mkdirs() timed("Insertion") { val ds = DataStream.fromIterator(schema, Iterator.continually(createRow).take(size)) new File(dir.toString).listFiles().foreach(_.delete) ds.to(OrcSink(new Path("orc-speed-test/orc_speed.pq")).withOverwrite(true), count) } for (_ <- 1 to 25) { assert(count == FilePattern("orc-speed-test/*").toPaths().size) timed("Reading with OrcSource") { val actual = OrcSource("orc-speed-test/*").toDataStream().map { row => row }.filter(_ => true).size assert(actual == size, s"Expected $size but was $actual") } println("") println("---------") println("") } }
Example 96
Source File: OrcPredicateTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.io.{File, FilenameFilter} import io.eels.Predicate import io.eels.datastream.DataStream import io.eels.schema.{Field, LongType, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} class OrcPredicateTest extends FlatSpec with Matchers with BeforeAndAfterAll { cleanUpResidualOrcTestFiles val schema = StructType( Field("name", StringType, nullable = true), Field("city", StringType, nullable = true), Field("age", LongType.Signed, nullable = true) ) val values = Vector.fill(1000) { Vector("sam", "middlesbrough", 37) } ++ Vector.fill(1000) { Vector("laura", "iowa city", 24) } val ds = DataStream.fromValues(schema, values) implicit val conf = new Configuration() implicit val fs = FileSystem.get(new Configuration()) val path = new Path("test.orc") if (fs.exists(path)) fs.delete(path, false) new File(path.toString).deleteOnExit() ds.to(OrcSink(path).withRowIndexStride(1000)) override protected def afterAll(): Unit = fs.delete(path, false) "OrcSource" should "support string equals predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.equals("name", "sam")).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("sam", "middlesbrough", 37L)) } it should "support gt predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.gt("age", 30L)).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("sam", "middlesbrough", 37L)) } it should "support lt predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.lt("age", 30)).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("laura", "iowa city", 24L)) } it should "enable row level filtering with predicates by default" in { conf.set("eel.orc.predicate.row.filter", "true") val rows = OrcSource(path).withPredicate(Predicate.equals("name", "sam")).toDataStream().collect rows.head.schema shouldBe schema rows.head.values shouldBe Vector("sam", "middlesbrough", 37L) } private def cleanUpResidualOrcTestFiles = { new File(".").listFiles(new FilenameFilter { override def accept(dir: File, name: String): Boolean = { (name.startsWith("test_") && name.endsWith(".orc")) || (name.startsWith(".test_") && name.endsWith(".orc.crc")) } }).foreach(_.delete()) } }
Example 97
Source File: InputFormatConf.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.hadoop.io.{ LongWritable, Text, Writable } import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader } import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat } import scala.collection.immutable trait InputFormatConf[K, V] extends Serializable { type IF <: InputFormat[K, V] type Split <: InputSplit with Writable type KExtract <: Extract[K] type VExtract <: Extract[V] def kExtract: KExtract def vExtract: VExtract def makeInputFormat(): IF // I'm unsure if we should WriSer them for them def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]] // TODO do we want to require typing of the RecordReader as well? final def createRecordReader(hadoopConf: Configuration, split: Split, inputFormat: IF = makeInputFormat()): RecordReader[K, V] = { val tac = ConfOnlyTAC(hadoopConf) val recordReader = inputFormat.createRecordReader(split, tac) recordReader.initialize(split, tac) recordReader } } case class TextInputFormatConf(file: String, partitions: Int) extends InputFormatConf[LongWritable, Text] { type IF = TextInputFormat type Split = FileSplit // TODO now that we figured out what's up, see if we can't eliminate the need for this... val internalK = Extract.unit[LongWritable] val internalV = Extract.text type KExtract = internalK.type type VExtract = internalV.type override val kExtract: KExtract = internalK override val vExtract: VExtract = internalV def makeInputFormat() = new TextInputFormat() def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = { val job = Job.getInstance(hadoopConf) FileInputFormat.setInputPaths(job, file) val path = new Path(file) val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen val size_per = math.round(len / partitions.toDouble) ((0 until partitions - 1).map { p => new FileSplit(path, size_per * p, size_per, null) } :+ { val fin = size_per * (partitions - 1) new FileSplit(path, fin, len - fin, null) }).map(WriSer(_)) } } // TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf object CSVInputFormatConf { def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] { type IF = ifc.IF type Split = ifc.Split type KExtract = ifc.KExtract type VExtract = ifc.VExtract } = new InputFormatConf[LongWritable, V] { type IF = ifc.IF type Split = ifc.Split type KExtract = ifc.KExtract type VExtract = ifc.VExtract override val kExtract: KExtract = ifc.kExtract override val vExtract: VExtract = ifc.vExtract override def makeInputFormat() = ifc.makeInputFormat() override def makeSplits(hadoopConf: Configuration) = { val splits = ifc.makeSplits(hadoopConf) splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) { case WriSer(head) => val rr = createRecordReader(hadoopConf, head) require(rr.nextKeyValue, "csv has no header, first line was empty") val afterHeader = rr.getCurrentKey.get require(rr.nextKeyValue, "first split is empty") WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +: splits.tail } } } }
Example 98
Source File: Hadoop.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkContext import com.twosigma.flint.rdd.Range import grizzled.slf4j.Logger object Hadoop { val logger = Logger(Hadoop.getClass) def fileSplits[K1, V1, K: Ordering]( sc: SparkContext, file: String, ifConf: InputFormatConf[K1, V1] // TODO consider just straight up making this (K, K) as we CAN get it, it's just a pain. )(parseKey: (ifConf.KExtract#Extracted, ifConf.VExtract#Extracted) => K): Map[Int, (Range[K], WriSer[ifConf.Split])] = { val splits = ifConf.makeSplits(new Configuration()) logger.info(s"Total number of splits: ${splits.size}") splits.foreach { s => logger.debug(s.get.toString) } // TODO implement the version which does the more rigorous thing, at least for splits that // support it val m = getSplitTimes(sc, ifConf)(parseKey, splits) .sortBy(_._1) .zip(splits) .map { case ((index, time), split) => (index, (time, split)) } .toMap m.map { case (k, (b, w)) => (k, (Range(b, m.get(k + 1).map(_._1)), w)) } } def getSplitTimes[K1, V1, K]( sc: SparkContext, ifConf: InputFormatConf[K1, V1] )( parseKey: (ifConf.KExtract#Extracted, ifConf.VExtract#Extracted) => K, splits: Seq[WriSer[ifConf.Split]] ): Vector[(Int, K)] = sc.parallelize(splits.zipWithIndex).map { case (serSplit, num) => val (a, b) = readRecords(ifConf)(serSplit).next val time = parseKey(a, b) Vector((num, time)) }.reduce(_ ++ _) def readRecords[K, V](ifConf: InputFormatConf[K, V])( serSplit: WriSer[ifConf.Split] ): Iterator[(ifConf.KExtract#Extracted, ifConf.VExtract#Extracted)] = { val inputFormat = ifConf.makeInputFormat() val split = serSplit.get val tac = ConfOnlyTAC(new Configuration()) val recordReader = inputFormat.createRecordReader(split, tac) recordReader.initialize(split, tac) logger.info(s"Beginning to read lines from split: $split") new Iterator[(ifConf.KExtract#Extracted, ifConf.VExtract#Extracted)] { var stillMore = false lazy val init = stillMore = recordReader.nextKeyValue() override def hasNext = { init stillMore } override def next = { init if (!stillMore) sys.error("hit end of iterator") val toReturn = ( ifConf.kExtract(recordReader.getCurrentKey), ifConf.vExtract(recordReader.getCurrentValue) ) stillMore = recordReader.nextKeyValue toReturn } } } }
Example 99
Source File: ConfOnlyTAC.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.{ Counter, TaskAttemptID, Job, TaskAttemptContext } // This exists just because of a quirk of the record reader api. case class ConfOnlyTAC(_conf: Configuration) extends Job with TaskAttemptContext { // JobContextImpl and JobContext override def getConfiguration: Configuration = _conf // TaskAttemptContext override def getTaskAttemptID: TaskAttemptID = sys.error("not implemented") override def setStatus(msg: String): Unit = sys.error("not implemented") override def getStatus = sys.error("not implemented") override def getProgress: Float = sys.error("not implemented") override def getCounter(counterName: Enum[_]): Counter = sys.error("not implemented") override def getCounter(groupName: String, counterName: String): Counter = sys.error("not implemented") // Progressable override def progress(): Unit = sys.error("not implemented") }
Example 100
Source File: RMCallbackHandler.scala From DataXServer with Apache License 2.0 | 5 votes |
package org.tianlangstudio.data.hamal.yarn import java.io.File import java.util.{Collections, List} import org.tianlangstudio.data.hamal.core.{Constants, HamalConf} import org.tianlangstudio.data.hamal.core.HamalConf //import java.util.Collections import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path, FileContext} import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.client.api.{AMRMClient, NMClient} import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import scala.jdk.CollectionConverters._ //import scala.collection.JavaConverters._ /** * Created by zhuhq on 2016/4/29. */ class RMCallbackHandler(nmClient:NMClient,containerCmd:Container => String,hamalConf: HamalConf,yarnConfiguration: Configuration) extends AMRMClientAsync.CallbackHandler { private val logging = org.slf4j.LoggerFactory.getLogger(classOf[RMCallbackHandler]) override def onContainersCompleted(statuses: List[ContainerStatus]): Unit = { for(containerStatus <- statuses.asScala) { logging.info(s"containerId:${containerStatus} exitStatus:${containerStatus}") } } override def onError(e: Throwable): Unit = { logging.error("on error",e) } override def getProgress: Float = { 0 } override def onShutdownRequest(): Unit = { logging.info("on shutdown request") } override def onNodesUpdated(updatedNodes: List[NodeReport]): Unit = { logging.info("on nodes updated") for(nodeReport <- updatedNodes.asScala) { logging.info(s"node id:${nodeReport} node labels:${nodeReport}"); } } override def onContainersAllocated(containers: List[Container]): Unit = { logging.info("on containers allocated"); for (container:Container <- containers.asScala) { try { // Launch container by create ContainerLaunchContext val ctx = Records.newRecord(classOf[ContainerLaunchContext]); //ctx.setCommands(Collections.singletonList(""" echo "begin";sleep 900;echo "end"; """)) ctx.setCommands(Collections.singletonList(containerCmd(container))) val packagePath = hamalConf.getString(Constants.DATAX_EXECUTOR_FILE,"executor.zip"); val archiveStat = FileSystem.get(yarnConfiguration).getFileStatus(new Path(packagePath)) val packageUrl = ConverterUtils.getYarnUrlFromPath( FileContext.getFileContext.makeQualified(new Path(packagePath))); val packageResource = Records.newRecord[LocalResource](classOf[LocalResource]) packageResource.setResource(packageUrl); packageResource.setSize(archiveStat.getLen); packageResource.setTimestamp(archiveStat.getModificationTime); packageResource.setType(LocalResourceType.ARCHIVE); packageResource.setVisibility(LocalResourceVisibility.APPLICATION) ctx.setLocalResources(Collections.singletonMap(Constants.DATAX_EXECUTOR_ARCHIVE_FILE_NAME,packageResource)) logging.info("[AM] Launching container " + container.getId()); nmClient.startContainer(container, ctx); } catch { case ex:Exception => logging.info("[AM] Error launching container " + container.getId() + " " + ex); } } } }
Example 101
Source File: HbRddConfig.scala From hbrdd with Apache License 2.0 | 5 votes |
package top.spoofer.hbrdd.config

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration

class HbRddConfig(config: Configuration) extends Serializable {
  def getHbaseConfig = HBaseConfiguration.create(config)
}

object HbRddConfig {
  type configOption = (String, String)

  private[HbRddConfig] case class HbaseOption(name: String, value: String)

  def apply(config: Configuration): HbRddConfig = new HbRddConfig(config)

  def apply(configs: configOption*): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()

    for {
      option <- configs
      hbOption = HbaseOption(option._1, option._2) // the extra case class is only there to make the intent clearer
    } hbConfig.set(hbOption.name, hbOption.value)

    this.apply(hbConfig)
  }

  def apply(configs: { def rootDir: String; def quorum: String }): HbRddConfig = {
    apply(
      "hbase.rootdir" -> configs.rootDir,
      "hbase.zookeeper.quorum" -> configs.quorum
    )
  }

  def apply(configs: Map[String, String]): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()
    configs.keys foreach { name => hbConfig.set(name, configs(name)) }
    this.apply(hbConfig)
  }

  def apply(configs: TraversableOnce[configOption]): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()
    configs foreach { option =>
      val hbOption = HbaseOption(option._1, option._2)
      hbConfig.set(hbOption.name, hbOption.value)
    }
    this.apply(hbConfig)
  }
}
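A short sketch of how the overloaded apply methods above might be called (the quorum and root-dir values are placeholders):

import org.apache.hadoop.hbase.HBaseConfiguration
import top.spoofer.hbrdd.config.HbRddConfig

// From an existing HBase/Hadoop Configuration
val fromConf = HbRddConfig(HBaseConfiguration.create())

// From individual key/value pairs
val fromPairs = HbRddConfig(
  "hbase.rootdir" -> "hdfs://namenode:8020/hbase",   // placeholder
  "hbase.zookeeper.quorum" -> "zk1,zk2,zk3"          // placeholder
)

// From a Map of settings
val fromMap = HbRddConfig(Map("hbase.zookeeper.quorum" -> "zk1,zk2,zk3"))

val hbaseConf = fromPairs.getHbaseConfig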
Example 102
Source File: KerberosLoginProvider.scala From rokku with Apache License 2.0 | 5 votes |
package com.ing.wbaa.rokku.proxy.provider import java.io.File import com.ing.wbaa.rokku.proxy.config.KerberosSettings import com.typesafe.scalalogging.LazyLogging import org.apache.commons.lang.StringUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.UserGroupInformation import scala.util.{ Failure, Success, Try } trait KerberosLoginProvider extends LazyLogging { protected[this] def kerberosSettings: KerberosSettings loginUserFromKeytab(kerberosSettings.keytab, kerberosSettings.principal) private def loginUserFromKeytab(keytab: String, principal: String): Unit = { if (StringUtils.isNotBlank(keytab) && StringUtils.isNotBlank(principal)) { if (!new File(keytab).exists()) { logger.info("keytab file does not exist {}", keytab) } else { Try { UserGroupInformation.setConfiguration(new Configuration()) UserGroupInformation.loginUserFromKeytab(principal, keytab) } match { case Success(_) => logger.info("kerberos credentials provided {}", UserGroupInformation.getLoginUser) case Failure(exception) => logger.error("kerberos login error {}", exception) } } } else { logger.info("kerberos credentials are not provided") } } }
Example 103
Source File: ImageLoaderUtils.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders import java.awt.image.BufferedImage import java.io.{InputStream, ByteArrayInputStream} import java.net.URI import java.util.zip.GZIPInputStream import javax.imageio.ImageIO import keystoneml.loaders.VOCLoader._ import org.apache.commons.compress.archivers.ArchiveStreamFactory import org.apache.commons.compress.archivers.tar.TarArchiveInputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import keystoneml.pipelines.Logging import keystoneml.utils._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag object ImageLoaderUtils extends Logging { def loadFiles[L, I <: AbstractLabeledImage[L] : ClassTag]( filePathsRDD: RDD[URI], labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, // TODO(etrain): We can probably do this with implicits. namePrefix: Option[String] = None): RDD[I] = { filePathsRDD.flatMap(fileUri => loadFile(fileUri, labelsMap, imageBuilder, namePrefix)) } private def loadFile[L, I <: AbstractLabeledImage[L]]( fileUri: URI, labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, namePrefix: Option[String]): Iterator[I] = { val filePath = new Path(fileUri) val conf = new Configuration(true) val fs = FileSystem.get(filePath.toUri(), conf) val fStream = fs.open(filePath) val tarStream = new ArchiveStreamFactory().createArchiveInputStream( "tar", fStream).asInstanceOf[TarArchiveInputStream] var entry = tarStream.getNextTarEntry() val imgs = new ArrayBuffer[I] while (entry != null) { if (!entry.isDirectory && (namePrefix.isEmpty || entry.getName.startsWith(namePrefix.get))) { var offset = 0 var ret = 0 val content = new Array[Byte](entry.getSize().toInt) while (ret >= 0 && offset != entry.getSize()) { ret = tarStream.read(content, offset, content.length - offset) if (ret >= 0) { offset += ret } } val bais = new ByteArrayInputStream(content) val image = ImageUtils.loadImage(bais).map { img => imageBuilder(img, labelsMap(entry.getName), Some(entry.getName)) } imgs ++= image } entry = tarStream.getNextTarEntry() } imgs.iterator } }
Example 104
Source File: OrcFileOperator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import java.io.IOException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[hive] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None, ignoreCorruptFiles: Boolean = false) : Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => val reader = try { Some(OrcFile.createReader(fs, path)) } catch { case e: IOException => if (ignoreCorruptFiles) { logWarning(s"Skipped the footer in the corrupted file: $path", e) None } else { throw new SparkException(s"Could not read footer for file: $path", e) } } path -> reader }.collectFirst { case (path, Some(reader)) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration], ignoreCorruptFiles: Boolean) : Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.toIterator.map(getFileReader(_, conf, ignoreCorruptFiles)).collectFirst { case Some(reader) => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
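As a rough illustration, the schema helper above could be invoked like this from code inside the org.apache.spark.sql.hive package (the object is private[hive]); the ORC path is hypothetical:

import org.apache.hadoop.conf.Configuration

val maybeSchema = OrcFileOperator.readSchema(
  Seq("/warehouse/db/tbl/part-00000.orc"),   // hypothetical path
  Some(new Configuration()),
  ignoreCorruptFiles = false)

maybeSchema.foreach(schema => println(schema.treeString))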
Example 105
Source File: HiveExternalCatalogSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.types.StructType class HiveExternalCatalogSuite extends ExternalCatalogSuite { private val externalCatalog: HiveExternalCatalog = { val catalog = new HiveExternalCatalog(new SparkConf, new Configuration) catalog.client.reset() catalog } protected override val utils: CatalogTestUtils = new CatalogTestUtils { override val tableInputFormat: String = "org.apache.hadoop.mapred.SequenceFileInputFormat" override val tableOutputFormat: String = "org.apache.hadoop.mapred.SequenceFileOutputFormat" override def newEmptyCatalog(): ExternalCatalog = externalCatalog override val defaultProvider: String = "hive" } protected override def resetState(): Unit = { externalCatalog.client.reset() } import utils._ test("SPARK-18647: do not put provider in table properties for Hive serde table") { val catalog = newBasicCatalog() val hiveTable = CatalogTable( identifier = TableIdentifier("hive_tbl", Some("db1")), tableType = CatalogTableType.MANAGED, storage = storageFormat, schema = new StructType().add("col1", "int").add("col2", "string"), provider = Some("hive")) catalog.createTable(hiveTable, ignoreIfExists = false) val rawTable = externalCatalog.client.getTable("db1", "hive_tbl") assert(!rawTable.properties.contains(HiveExternalCatalog.DATASOURCE_PROVIDER)) assert(DDLUtils.isHiveTable(externalCatalog.getTable("db1", "hive_tbl"))) } Seq("parquet", "hive").foreach { format => test(s"Partition columns should be put at the end of table schema for the format $format") { val catalog = newBasicCatalog() val newSchema = new StructType() .add("col1", "int") .add("col2", "string") .add("partCol1", "int") .add("partCol2", "string") val table = CatalogTable( identifier = TableIdentifier("tbl", Some("db1")), tableType = CatalogTableType.MANAGED, storage = CatalogStorageFormat.empty, schema = new StructType() .add("col1", "int") .add("partCol1", "int") .add("partCol2", "string") .add("col2", "string"), provider = Some(format), partitionColumnNames = Seq("partCol1", "partCol2")) catalog.createTable(table, ignoreIfExists = false) val restoredTable = externalCatalog.getTable("db1", "tbl") assert(restoredTable.schema == newSchema) } } test("SPARK-22306: alter table schema should not erase the bucketing metadata at hive side") { val catalog = newBasicCatalog() externalCatalog.client.runSqlHive( """ |CREATE TABLE db1.t(a string, b string) |CLUSTERED BY (a, b) SORTED BY (a, b) INTO 10 BUCKETS |STORED AS PARQUET """.stripMargin) val newSchema = new StructType().add("a", "string").add("b", "string").add("c", "string") catalog.alterTableDataSchema("db1", "t", newSchema) assert(catalog.getTable("db1", "t").schema == newSchema) val bucketString = externalCatalog.client.runSqlHive("DESC FORMATTED db1.t") .filter(_.contains("Num Buckets")).head assert(bucketString.contains("10")) } test("SPARK-23001: NullPointerException when running desc database") { val catalog = newBasicCatalog() catalog.createDatabase(newDb("dbWithNullDesc").copy(description = null), ignoreIfExists = false) assert(catalog.getDatabase("dbWithNullDesc").description == "") } }
Example 106
Source File: HiveClientBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.util.VersionInfo import org.apache.spark.SparkConf import org.apache.spark.util.Utils private[client] object HiveClientBuilder { // In order to speed up test execution during development or in Jenkins, you can specify the path // of an existing Ivy cache: private val ivyPath: Option[String] = { sys.env.get("SPARK_VERSIONS_SUITE_IVY_PATH").orElse( Some(new File(sys.props("java.io.tmpdir"), "hive-ivy-cache").getAbsolutePath)) } private def buildConf(extraConf: Map[String, String]) = { lazy val warehousePath = Utils.createTempDir() lazy val metastorePath = Utils.createTempDir() metastorePath.delete() extraConf ++ Map( "javax.jdo.option.ConnectionURL" -> s"jdbc:derby:;databaseName=$metastorePath;create=true", "hive.metastore.warehouse.dir" -> warehousePath.toString) } // for testing only def buildClient( version: String, hadoopConf: Configuration, extraConf: Map[String, String] = Map.empty, sharesHadoopClasses: Boolean = true): HiveClient = { IsolatedClientLoader.forVersion( hiveMetastoreVersion = version, hadoopVersion = VersionInfo.getVersion, sparkConf = new SparkConf(), hadoopConf = hadoopConf, config = buildConf(extraConf), ivyPath = ivyPath, sharesHadoopClasses = sharesHadoopClasses).createClient() } }
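A sketch of how this test helper is typically driven (the metastore version string and the extra property are assumptions; the object is private[client], so callers live in the same package):

import org.apache.hadoop.conf.Configuration

val client = HiveClientBuilder.buildClient(
  version = "2.3",                                                       // assumed metastore version
  hadoopConf = new Configuration(),
  extraConf = Map("hive.metastore.schema.verification" -> "false"))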
Example 107
Source File: HiveVersionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import org.apache.hadoop.conf.Configuration import org.scalactic.source.Position import org.scalatest.Tag import org.apache.spark.SparkFunSuite import org.apache.spark.sql.hive.HiveUtils private[client] abstract class HiveVersionSuite(version: String) extends SparkFunSuite { override protected val enableAutoThreadAudit = false protected var client: HiveClient = null protected def buildClient( hadoopConf: Configuration, sharesHadoopClasses: Boolean = true): HiveClient = { // Hive changed the default of datanucleus.schema.autoCreateAll from true to false and // hive.metastore.schema.verification from false to true since 2.0 // For details, see the JIRA HIVE-6113 and HIVE-12463 if (version == "2.0" || version == "2.1" || version == "2.2" || version == "2.3") { hadoopConf.set("datanucleus.schema.autoCreateAll", "true") hadoopConf.set("hive.metastore.schema.verification", "false") } HiveClientBuilder.buildClient( version, hadoopConf, HiveUtils.formatTimeVarsForHiveClient(hadoopConf), sharesHadoopClasses = sharesHadoopClasses) } override def suiteName: String = s"${super.suiteName}($version)" override protected def test(testName: String, testTags: Tag*)(testFun: => Any) (implicit pos: Position): Unit = { super.test(s"$version: $testName", testTags: _*)(testFun) } }
Example 108
Source File: DataSourceManagerFactory.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.util.Utils object DataSourceManagerFactory { def create( datasourceType: String, conf: SparkConf, hadoopConf: Configuration): DataSourceManager = { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[DataSourceManager], loader) var cls: Class[_] = null // As we use ServiceLoader to support creating any user provided DataSourceManager here, // META-INF/services/org.apache.spark.sql.sources.DataSourceRegister must be packaged properly // in user's jar, and the implementation of DataSourceManager must have a public parameterless // constructor. For scala language, def this() = this(null...) just work. try { cls = serviceLoader.asScala .filter(_.shortName().equals(datasourceType)) .toList match { case head :: Nil => head.getClass case _ => throw new SparkException(s"error when instantiate datasource ${datasourceType}") } } catch { case _: Exception => throw new SparkException( s"""Can't find corresponding DataSourceManager for ${datasourceType} type, |please check |1. META-INF/services/org.apache.spark.sql.sources.DataSourceRegister is packaged |2. your implementation of DataSourceManager's shortname is ${datasourceType} |3. your implementation of DataSourceManager must have a public parameterless | constructor. For scala language, def this() = this(null, null, ...) just work. """.stripMargin) } try { val constructor = cls.getConstructor(classOf[SparkConf], classOf[Configuration]) val newHadoopConf = new Configuration(hadoopConf) constructor.newInstance(conf, newHadoopConf).asInstanceOf[DataSourceManager] } catch { case _: NoSuchMethodException => try { cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[DataSourceManager] } catch { case _: NoSuchMethodException => cls.getConstructor().newInstance().asInstanceOf[DataSourceManager] } } } }
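A hedged sketch of calling the factory above; "myds" is a hypothetical short name that would have to match a DataSourceManager registered under META-INF/services:

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkConf

val manager = DataSourceManagerFactory.create(
  datasourceType = "myds",          // hypothetical shortName of a registered implementation
  conf = new SparkConf(),
  hadoopConf = new Configuration())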
Example 109
Source File: CompressionCodecs.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util import java.util.Locale import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress._ import org.apache.spark.util.Utils object CompressionCodecs { private val shortCompressionCodecNames = Map( "none" -> null, "uncompressed" -> null, "bzip2" -> classOf[BZip2Codec].getName, "deflate" -> classOf[DeflateCodec].getName, "gzip" -> classOf[GzipCodec].getName, "lz4" -> classOf[Lz4Codec].getName, "snappy" -> classOf[SnappyCodec].getName) def setCodecConfiguration(conf: Configuration, codec: String): Unit = { if (codec != null) { conf.set("mapreduce.output.fileoutputformat.compress", "true") conf.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.toString) conf.set("mapreduce.output.fileoutputformat.compress.codec", codec) conf.set("mapreduce.map.output.compress", "true") conf.set("mapreduce.map.output.compress.codec", codec) } else { // This infers the option `compression` is set to `uncompressed` or `none`. conf.set("mapreduce.output.fileoutputformat.compress", "false") conf.set("mapreduce.map.output.compress", "false") } } }
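For instance, the helper above can switch gzip output compression on (or compression off) for a Hadoop Configuration, reusing a codec class that ships with Hadoop; a minimal sketch:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.GzipCodec

val conf = new Configuration()
// Enables both final-output and map-output compression with the gzip codec
CompressionCodecs.setCodecConfiguration(conf, classOf[GzipCodec].getName)

// Passing null (the mapping for "none"/"uncompressed") disables compression instead
CompressionCodecs.setCodecConfiguration(conf, null)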
Example 110
Source File: CodecStreams.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.{InputStream, OutputStream, OutputStreamWriter} import java.nio.charset.{Charset, StandardCharsets} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.compress._ import org.apache.hadoop.mapreduce.JobContext import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.util.ReflectionUtils import org.apache.spark.TaskContext object CodecStreams { private def getDecompressionCodec(config: Configuration, file: Path): Option[CompressionCodec] = { val compressionCodecs = new CompressionCodecFactory(config) Option(compressionCodecs.getCodec(file)) } def createInputStream(config: Configuration, file: Path): InputStream = { val fs = file.getFileSystem(config) val inputStream: InputStream = fs.open(file) getDecompressionCodec(config, file) .map(codec => codec.createInputStream(inputStream)) .getOrElse(inputStream) } def getCompressionExtension(context: JobContext): String = { getCompressionCodec(context) .map(_.getDefaultExtension) .getOrElse("") } }
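A small sketch of reading a possibly compressed file through the helper above; the path is a placeholder, and the decompression codec is inferred from the file extension by CompressionCodecFactory:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val conf = new Configuration()
val in = CodecStreams.createInputStream(conf, new Path("/data/events.json.gz"))   // placeholder path
try {
  println(in.read())   // read the first byte just to show the stream is usable
} finally {
  in.close()
}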
Example 111
Source File: HadoopFileLinesReader.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.Closeable import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader} import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl class HadoopFileLinesReader( file: PartitionedFile, lineSeparator: Option[Array[Byte]], conf: Configuration) extends Iterator[Text] with Closeable { def this(file: PartitionedFile, conf: Configuration) = this(file, None, conf) private val iterator = { val fileSplit = new FileSplit( new Path(new URI(file.filePath)), file.start, file.length, // TODO: Implement Locality Array.empty) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) val reader = lineSeparator match { case Some(sep) => new LineRecordReader(sep) // If the line separator is `None`, it covers `\r`, `\r\n` and `\n`. case _ => new LineRecordReader() } reader.initialize(fileSplit, hadoopAttemptContext) new RecordReaderIterator(reader) } override def hasNext: Boolean = iterator.hasNext override def next(): Text = iterator.next() override def close(): Unit = iterator.close() }
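Roughly how the reader above is consumed (a sketch; file stands for a PartitionedFile that Spark's file-scan planning is assumed to supply):

import org.apache.hadoop.conf.Configuration

val reader = new HadoopFileLinesReader(file, new Configuration())   // file: PartitionedFile assumed
try {
  reader.foreach(line => println(line.toString))   // each element is a Hadoop Text holding one line
} finally {
  reader.close()
}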
Example 112
Source File: CatalogFileIndex.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.StructType private class PrunedInMemoryFileIndex( sparkSession: SparkSession, tableBasePath: Path, fileStatusCache: FileStatusCache, override val partitionSpec: PartitionSpec, override val metadataOpsTimeNs: Option[Long]) extends InMemoryFileIndex( sparkSession, partitionSpec.partitions.map(_.path), Map.empty, Some(partitionSpec.partitionColumns), fileStatusCache)
Example 113
Source File: BasicWriteStatsTracker.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.FileNotFoundException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.{SparkContext, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.util.SerializableConfiguration class BasicWriteJobStatsTracker( serializableHadoopConf: SerializableConfiguration, @transient val metrics: Map[String, SQLMetric]) extends WriteJobStatsTracker { override def newTaskInstance(): WriteTaskStatsTracker = { new BasicWriteTaskStatsTracker(serializableHadoopConf.value) } override def processStats(stats: Seq[WriteTaskStats]): Unit = { val sparkContext = SparkContext.getActive.get var numPartitions: Long = 0L var numFiles: Long = 0L var totalNumBytes: Long = 0L var totalNumOutput: Long = 0L val basicStats = stats.map(_.asInstanceOf[BasicWriteTaskStats]) basicStats.foreach { summary => numPartitions += summary.numPartitions numFiles += summary.numFiles totalNumBytes += summary.numBytes totalNumOutput += summary.numRows } metrics(BasicWriteJobStatsTracker.NUM_FILES_KEY).add(numFiles) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_BYTES_KEY).add(totalNumBytes) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_ROWS_KEY).add(totalNumOutput) metrics(BasicWriteJobStatsTracker.NUM_PARTS_KEY).add(numPartitions) val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList) } } object BasicWriteJobStatsTracker { private val NUM_FILES_KEY = "numFiles" private val NUM_OUTPUT_BYTES_KEY = "numOutputBytes" private val NUM_OUTPUT_ROWS_KEY = "numOutputRows" private val NUM_PARTS_KEY = "numParts" def metrics: Map[String, SQLMetric] = { val sparkContext = SparkContext.getActive.get Map( NUM_FILES_KEY -> SQLMetrics.createMetric(sparkContext, "number of written files"), NUM_OUTPUT_BYTES_KEY -> SQLMetrics.createMetric(sparkContext, "bytes of written output"), NUM_OUTPUT_ROWS_KEY -> SQLMetrics.createMetric(sparkContext, "number of output rows"), NUM_PARTS_KEY -> SQLMetrics.createMetric(sparkContext, "number of dynamic part") ) } }
Example 114
Source File: HadoopFileWholeTextReader.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.Closeable import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.input.WholeTextFileRecordReader class HadoopFileWholeTextReader(file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable { private val iterator = { val fileSplit = new CombineFileSplit( Array(new Path(new URI(file.filePath))), Array(file.start), Array(file.length), // TODO: Implement Locality Array.empty[String]) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) val reader = new WholeTextFileRecordReader(fileSplit, hadoopAttemptContext, 0) reader.initialize(fileSplit, hadoopAttemptContext) new RecordReaderIterator(reader) } override def hasNext: Boolean = iterator.hasNext override def next(): Text = iterator.next() override def close(): Unit = iterator.close() }
Example 115
Source File: DataWritingCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker import org.apache.spark.sql.execution.datasources.FileFormatWriter import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.util.SerializableConfiguration def logicalPlanOutputWithNames( query: LogicalPlan, names: Seq[String]): Seq[Attribute] = { // Save the output attributes to a variable to avoid duplicated function calls. val outputAttributes = query.output assert(outputAttributes.length == names.length, "The length of provided names doesn't match the length of output attributes.") outputAttributes.zip(names).map { case (attr, outputName) => attr.withName(outputName) } } }
Example 116
Source File: FileStreamSink.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FileFormat, FileFormatWriter} import org.apache.spark.util.SerializableConfiguration object FileStreamSink extends Logging { // The name of the subdirectory that is used to store metadata about which files are valid. val metadataDir = "_spark_metadata" class FileStreamSink( sparkSession: SparkSession, path: String, fileFormat: FileFormat, partitionColumnNames: Seq[String], options: Map[String, String]) extends Sink with Logging { private val basePath = new Path(path) private val logPath = new Path(basePath, FileStreamSink.metadataDir) private val fileLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, logPath.toUri.toString) private val hadoopConf = sparkSession.sessionState.newHadoopConf() private def basicWriteJobStatsTracker: BasicWriteJobStatsTracker = { val serializableHadoopConf = new SerializableConfiguration(hadoopConf) new BasicWriteJobStatsTracker(serializableHadoopConf, BasicWriteJobStatsTracker.metrics) } override def addBatch(batchId: Long, data: DataFrame): Unit = { if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) { logInfo(s"Skipping already committed batch $batchId") } else { val committer = FileCommitProtocol.instantiate( className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass, jobId = batchId.toString, outputPath = path) committer match { case manifestCommitter: ManifestFileCommitProtocol => manifestCommitter.setupManifestOptions(fileLog, batchId) case _ => // Do nothing } // Get the actual partition columns as attributes after matching them by name with // the given columns names. val partitionColumns: Seq[Attribute] = partitionColumnNames.map { col => val nameEquality = data.sparkSession.sessionState.conf.resolver data.logicalPlan.output.find(f => nameEquality(f.name, col)).getOrElse { throw new RuntimeException(s"Partition column $col not found in schema ${data.schema}") } } val qe = data.queryExecution FileFormatWriter.write( sparkSession = sparkSession, plan = qe.executedPlan, fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec(path, Map.empty, qe.analyzed.output), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = None, statsTrackers = Seq(basicWriteJobStatsTracker), options = options) } } override def toString: String = s"FileSink[$path]" }
Example 117
Source File: StreamMetadata.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import java.util.ConcurrentModificationException import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileAlreadyExistsException, FSDataInputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.streaming.CheckpointFileManager.CancellableFSDataOutputStream import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: CancellableFSDataOutputStream = null try { val fileManager = CheckpointFileManager.create(metadataFile.getParent, hadoopConf) output = fileManager.createAtomic(metadataFile, overwriteIfPossible = false) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case e: FileAlreadyExistsException => if (output != null) { output.cancel() } throw new ConcurrentModificationException( s"Multiple streaming queries are concurrently using $metadataFile", e) case e: Throwable => if (output != null) { output.cancel() } logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } } }
Example 118
Source File: StreamMetadataSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.File import java.util.UUID import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.streaming.StreamTest class StreamMetadataSuite extends StreamTest { test("writing and reading") { withTempDir { dir => val id = UUID.randomUUID.toString val metadata = StreamMetadata(id) val file = new Path(new File(dir, "test").toString) StreamMetadata.write(metadata, file, hadoopConf) val readMetadata = StreamMetadata.read(file, hadoopConf) assert(readMetadata.nonEmpty) assert(readMetadata.get.id === id) } } test("read Spark 2.1.0 format") { // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0 assert( readForResource("query-metadata-logs-version-2.1.0.txt") === StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e")) } private def readForResource(fileName: String): StreamMetadata = { val input = getClass.getResource(s"/structured-streaming/$fileName") StreamMetadata.read(new Path(input.toString), hadoopConf).get } private val hadoopConf = new Configuration() }
Example 119
Source File: ExcelOutputWriter.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import java.math.BigDecimal import java.sql.Date import java.sql.Timestamp import java.text.DateFormat import java.text.SimpleDateFormat import java.util.Calendar import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.NullWritable import org.apache.hadoop.io.ArrayWritable import org.apache.hadoop.mapreduce.RecordWriter import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.fs.Path import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow } import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.types._ import org.zuinnote.hadoop.office.format.common.dao.SpreadSheetCellDAO import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration import org.zuinnote.hadoop.office.format.common.util.msexcel.MSExcelUtil import org.zuinnote.hadoop.office.format.mapreduce._ import org.apache.commons.logging.LogFactory import org.apache.commons.logging.Log import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration import java.util.Locale import java.text.DecimalFormat import org.zuinnote.hadoop.office.format.common.converter.ExcelConverterSimpleSpreadSheetCellDAO import java.text.NumberFormat // NOTE: This class is instantiated and used on executor side only, no need to be serializable. private[excel] class ExcelOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext, options: Map[String, String]) extends OutputWriter { def write(row: Row): Unit = { // check useHeader if (useHeader) { val headers = row.schema.fieldNames var i = 0 for (x <- headers) { val headerColumnSCD = new SpreadSheetCellDAO(x, "", "", MSExcelUtil.getCellAddressA1Format(currentRowNum, i), defaultSheetName) recordWriter.write(NullWritable.get(), headerColumnSCD) i += 1 } currentRowNum += 1 useHeader = false } // for each value in the row if (row.size>0) { var currentColumnNum = 0; val simpleObject = new Array[AnyRef](row.size) for (i <- 0 to row.size - 1) { // for each element of the row val obj = row.get(i) if ((obj.isInstanceOf[Seq[String]]) && (obj.asInstanceOf[Seq[String]].length==5)) { val formattedValue = obj.asInstanceOf[Seq[String]](0) val comment = obj.asInstanceOf[Seq[String]](1) val formula = obj.asInstanceOf[Seq[String]](2) val address = obj.asInstanceOf[Seq[String]](3) val sheetName = obj.asInstanceOf[Seq[String]](4) simpleObject(i) = new SpreadSheetCellDAO(formattedValue,comment,formula,address,sheetName) } else { simpleObject(i)=obj.asInstanceOf[AnyRef] } } // convert row to spreadsheetcellDAO val spreadSheetCellDAORow = simpleConverter.getSpreadSheetCellDAOfromSimpleDataType(simpleObject, defaultSheetName, currentRowNum) // write it for (x<- spreadSheetCellDAORow) { recordWriter.write(NullWritable.get(), x) } } currentRowNum += 1 } override def close(): Unit = { recordWriter.close(context) currentRowNum = 0; } }
Example 120
Source File: HadoopFileExcelReader.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import java.io.Closeable import java.net.URI import org.apache.spark.sql.execution.datasources._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.ArrayWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{ FileSplit, LineRecordReader } import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.sql.execution.datasources.RecordReaderIterator import org.zuinnote.hadoop.office.format.mapreduce.ExcelFileInputFormat import org.zuinnote.hadoop.office.format.mapreduce.ExcelRecordReader import org.apache.commons.logging.LogFactory import org.apache.commons.logging.Log class HadoopFileExcelReader( file: PartitionedFile, conf: Configuration) extends Iterator[ArrayWritable] with Closeable { val LOG = LogFactory.getLog(classOf[HadoopFileExcelReader]) private var reader: RecordReader[Text, ArrayWritable] = null private val iterator = { val fileSplit = new FileSplit( new Path(new URI(file.filePath)), file.start, file.length, Array.empty) // todo: implement locality (replace Array.empty with the locations) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) val inputFormat = new ExcelFileInputFormat() reader = inputFormat.createRecordReader(fileSplit, hadoopAttemptContext) reader.initialize(fileSplit, hadoopAttemptContext) new RecordReaderIterator(reader) } def getReader: RecordReader[Text, ArrayWritable] = reader override def hasNext: Boolean = iterator.hasNext override def next(): ArrayWritable = iterator.next() override def close(): Unit = { if (reader != null) { reader.close() } } }
Example 121
Source File: ExcelOutputWriterFactory.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.sql.execution.datasources.{ OutputWriter, OutputWriterFactory } import org.apache.spark.sql.types.StructType import org.zuinnote.hadoop.office.format.mapreduce.ExcelFileOutputFormat import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration private[excel] class ExcelOutputWriterFactory(options: Map[String, String]) extends OutputWriterFactory { def newInstance( path: String, bucketId: Option[Int], dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { new ExcelOutputWriter(path, dataSchema, context, options) } def newInstance( path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { new ExcelOutputWriter(path, dataSchema, context, options) } def getFileExtension(context: TaskAttemptContext): String = { val conf = context.getConfiguration(); val defaultConf = conf.get(HadoopOfficeWriteConfiguration.CONF_MIMETYPE, ExcelFileOutputFormat.DEFAULT_MIMETYPE); conf.set(HadoopOfficeWriteConfiguration.CONF_MIMETYPE, defaultConf); ExcelFileOutputFormat.getSuffix(conf.get(HadoopOfficeWriteConfiguration.CONF_MIMETYPE)) } }
Example 122
Source File: HadoopBundleFileSystem.scala From mleap with Apache License 2.0 | 5 votes |
package ml.bundle.hdfs import java.io.File import java.net.URI import java.nio.file.{Files, Paths} import com.typesafe.config.Config import ml.combust.bundle.fs.BundleFileSystem import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Try import scala.collection.JavaConverters._ object HadoopBundleFileSystem { lazy val defaultSchemes: Seq[String] = Seq("hdfs") def createHadoopConfiguration(config: Config): Configuration = { val options: Map[String, String] = if(config.hasPath("options")) { config.getConfig("options").entrySet().asScala.map { entry => (entry.getKey, entry.getValue.unwrapped().toString) }.toMap } else { Map() } val c = new Configuration() for ((key, value) <- options) { c.set(key, value) } c } def createSchemes(config: Config): Seq[String] = if (config.hasPath("schemes")) { config.getStringList("schemes").asScala } else { Seq("hdfs") } } class HadoopBundleFileSystem(fs: FileSystem, override val schemes: Seq[String] = HadoopBundleFileSystem.defaultSchemes) extends BundleFileSystem { def this(config: Config) = { this(FileSystem.get(HadoopBundleFileSystem.createHadoopConfiguration(config)), HadoopBundleFileSystem.createSchemes(config)) } override def load(uri: URI): Try[File] = Try { val tmpDir = Files.createTempDirectory("hdfs-bundle") val tmpFile = Paths.get(tmpDir.toString, "bundle.zip") fs.copyToLocalFile(new Path(uri.toString), new Path(tmpFile.toString)) tmpFile.toFile } override def save(uri: URI, localFile: File): Unit = { fs.copyFromLocalFile(new Path(localFile.toString), new Path(uri.toString)) } }
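A usage sketch based on the class above (the HDFS URI and local paths are placeholders); the spec that follows exercises the same save/load round trip against a local FileSystem:

import java.io.File
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

val fs = FileSystem.get(new Configuration())
val bundleFs = new HadoopBundleFileSystem(fs)

// Upload a local bundle, then pull it back down into a temporary file
bundleFs.save(new URI("hdfs:///models/bundle.zip"), new File("/tmp/bundle.zip"))   // placeholder paths
val localCopy: File = bundleFs.load(new URI("hdfs:///models/bundle.zip")).get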
Example 123
Source File: HadoopBundleFileSystemSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.bundle.hdfs import java.net.URI import java.nio.file.{Files, Paths} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.FunSpec class HadoopBundleFileSystemSpec extends FunSpec { private val fs = FileSystem.get(new Configuration()) private val bundleFs = new HadoopBundleFileSystem(fs) describe("scheme") { it("returns hdfs") { assert(bundleFs.schemes == Seq("hdfs")) } } describe("load") { it("loads a file from hadoop and saves to a local file") { val testFile = Files.createTempFile("HadoopBundleFileSystemSpec", ".txt") Files.write(testFile.toAbsolutePath, "HELLO".getBytes()) val loadedFile = bundleFs.load(testFile.toUri).get val contents = new String(Files.readAllBytes(loadedFile.toPath)) assert(contents == "HELLO") } } describe("save") { it("saves local file to HDFS") { val testFile = Files.createTempFile("HadoopBundleFileSystemSpec", ".txt") Files.write(testFile.toAbsolutePath, "HELLO".getBytes()) val tmpDir = Files.createTempDirectory("HadoopBundleFileSystemSpec") val tmpFile = new URI(s"file://$tmpDir/test.txt") bundleFs.save(tmpFile, testFile.toFile) val contents = new String(Files.readAllBytes(Paths.get(tmpFile))) assert(contents == "HELLO") } } }
Example 124
Source File: OptionsParsing.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path object OptionsParsing { def parse(args: Array[String], hadoopConfiguration: Configuration): Config = { val parser = new scopt.OptionParser[Config]("") { opt[Unit]("i") .action((_, c) => c.copyOptions(_.copy(ignoreErrors = true))) .text("Ignore failures") opt[String]("log") .action((log, c) => c.copyOptions(_.copy(log = Some(new URI(log))))) .text("Write logs to a URI") opt[Unit]("dryrun") .action((_, c) => c.copyOptions(_.copy(dryRun = true))) .text("Perform a trial run with no changes made") opt[Unit]("verbose") .action((_, c) => c.copyOptions(_.copy(verbose = true))) .text("Run in verbose mode") opt[Unit]("overwrite") .action((_, c) => c.copyOptions(_.copy(overwrite = true))) .text("Overwrite destination") opt[Unit]("update") .action((_, c) => c.copyOptions(_.copy(update = true))) .text("Overwrite if source and destination differ in size, or checksum") opt[String]("filters") .action((f, c) => c.copyOptions(_.withFiltersFromFile(new URI(f), hadoopConfiguration))) .text("The path to a file containing a list of pattern strings, one string per line, such that paths matching the pattern will be excluded from the copy.") opt[Unit]("delete") .action((_, c) => c.copyOptions(_.copy(delete = true))) .text("Delete the files existing in the dst but not in src") opt[Int]("numListstatusThreads") .action((i, c) => c.copyOptions(_.copy(numListstatusThreads = i))) .text("Number of threads to use for building file listing") opt[Unit]("consistentPathBehaviour") .action((_, c) => c.copyOptions(_.copy(consistentPathBehaviour = true))) .text("Revert the path behaviour when using overwrite or update to the path behaviour of non-overwrite/non-update") opt[Int]("maxFilesPerTask") .action((i, c) => c.copyOptions(_.copy(maxFilesPerTask = i))) .text("Maximum number of files to copy in a single Spark task") opt[Long]("maxBytesPerTask") .action((i, c) => c.copyOptions(_.copy(maxBytesPerTask = i))) .text("Maximum number of bytes to copy in a single Spark task") help("help").text("prints this usage text") arg[String]("[source_path...] <target_path>") .unbounded() .minOccurs(2) .action((u, c) => c.copy(URIs = c.URIs :+ new URI(u))) } parser.parse(args, Config()) match { case Some(config) => config.options.validateOptions() config case _ => throw new RuntimeException("Failed to parse arguments") } } } case class Config(options: SparkDistCPOptions = SparkDistCPOptions(), URIs: Seq[URI] = Seq.empty) { def copyOptions(f: SparkDistCPOptions => SparkDistCPOptions): Config = { this.copy(options = f(options)) } def sourceAndDestPaths: (Seq[Path], Path) = { URIs.reverse match { case d :: s :: ts => ((s :: ts).reverse.map(u => new Path(u)), new Path(d)) case _ => throw new RuntimeException("Incorrect number of URIs") } } }
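A sketch of driving the parser above, e.g. from a main method (the flag values and URIs are illustrative):

import org.apache.hadoop.conf.Configuration

val args = Array(
  "--overwrite",
  "--numListstatusThreads", "10",
  "hdfs:///data/src", "hdfs:///data/dest")   // illustrative arguments

val config = OptionsParsing.parse(args, new Configuration())
val (sourcePaths, targetPath) = config.sourceAndDestPaths
println(s"Copying ${sourcePaths.mkString(",")} to $targetPath with ${config.options}")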
Example 125
Source File: TestSpec.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata import java.io.ByteArrayInputStream import java.nio.file.Files import com.coxautodata.objects.SerializableFileStatus import com.coxautodata.utils.FileListing import org.apache.commons.io.{FileUtils, IOUtils} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.scalatest.{BeforeAndAfterEach, FunSpec, Matchers} trait TestSpec extends FunSpec with Matchers with BeforeAndAfterEach { var testingBaseDir: java.nio.file.Path = _ var testingBaseDirName: String = _ var testingBaseDirPath: Path = _ var localFileSystem: LocalFileSystem = _ override def beforeEach(): Unit = { super.beforeEach() testingBaseDir = Files.createTempDirectory("test_output") testingBaseDirName = testingBaseDir.toString localFileSystem = FileSystem.getLocal(new Configuration()) testingBaseDirPath = localFileSystem.makeQualified(new Path(testingBaseDirName)) } override def afterEach(): Unit = { super.afterEach() FileUtils.deleteDirectory(testingBaseDir.toFile) } def createFile(relativePath: Path, content: Array[Byte]): SerializableFileStatus = { val path = new Path(testingBaseDirPath, relativePath) localFileSystem.mkdirs(path.getParent) val in = new ByteArrayInputStream(content) val out = localFileSystem.create(path) IOUtils.copy(in, out) in.close() out.close() SerializableFileStatus(localFileSystem.getFileStatus(path)) } def fileStatusToResult(f: SerializableFileStatus): FileListing = { FileListing(f.getPath.toString, if (f.isFile) Some(f.getLen) else None) } }
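A hedged sketch of a suite that mixes in the TestSpec trait above; the suite name and file path are illustrative only, and the assertions rely solely on helpers defined by the trait.

package com.coxautodata

import com.coxautodata.utils.FileListing
import org.apache.hadoop.fs.Path

// Hypothetical suite exercising the createFile/fileStatusToResult helpers.
class TestSpecUsageSpec extends TestSpec {
  it("creates a file under the temporary base directory and describes it") {
    val status = createFile(new Path("data/part-00000"), "HELLO".getBytes)
    assert(status.getLen == 5L)
    assert(fileStatusToResult(status) == FileListing(status.getPath.toString, Some(5L)))
  }
}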
Example 126
Source File: L6-18Cassandra.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.Text import java.nio.ByteBuffer import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat import org.apache.cassandra.hadoop.ConfigHelper import org.apache.cassandra.thrift.ColumnOrSuperColumn import org.apache.cassandra.thrift.Column import org.apache.cassandra.utils.ByteBufferUtil import org.apache.cassandra.thrift.Mutation import java.util.Arrays object CassandraSinkApp { def main(args: Array[String]) { if (args.length != 6) { System.err.println( "Usage: CassandraSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, cassandraHost, cassandraPort, keyspace, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .foreachRDD(rdd => { val jobConf = new Configuration() ConfigHelper.setOutputRpcPort(jobConf, cassandraPort) ConfigHelper.setOutputInitialAddress(jobConf, cassandraHost) ConfigHelper.setOutputColumnFamily(jobConf, keyspace, columnFamilyName) ConfigHelper.setOutputPartitioner(jobConf, "Murmur3Partitioner") rdd.map(rec => { val c = new Column() c.setName(ByteBufferUtil.bytes(columnName)) c.setValue(ByteBufferUtil.bytes(rec._2 / (windowSize / batchInterval))) c.setTimestamp(System.currentTimeMillis) val m = new Mutation() m.setColumn_or_supercolumn(new ColumnOrSuperColumn()) m.column_or_supercolumn.setColumn(c) (ByteBufferUtil.bytes(rec._1), Arrays.asList(m)) }).saveAsNewAPIHadoopFile(keyspace, classOf[ByteBuffer], classOf[List[Mutation]], classOf[ColumnFamilyOutputFormat], jobConf) }) ssc.start() ssc.awaitTermination() } }
Example 127
Source File: L6-14HBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.mapreduce.TableOutputFormat import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.io.Text import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object HBaseSinkApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: HBaseSinkApp <appname> <hbaseMaster> <tableName> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .foreachRDD(rdd => { val hbaseConf = HBaseConfiguration.create() hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName) hbaseConf.set("hbase.master", hbaseMaster) val jobConf = new Configuration(hbaseConf) jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName) rdd.map(rec => { val put = new Put(rec._1.getBytes) put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) (rec._1, put) }).saveAsNewAPIHadoopDataset(jobConf) }) ssc.start() ssc.awaitTermination() } }
Example 128
Source File: DistCpTransformation.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.dsl.transformations import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.tools.{DistCp, DistCpOptions} import org.schedoscope.dsl.View import org.schedoscope.scheduler.driver.{DriverRunState, MapreduceDriver} import scala.collection.JavaConverters._ object DistCpTransformation { def copyToView(sourceView: View, targetView: View): DistCpTransformation = { val target = targetView.fullPath.split("/").dropRight(1).mkString("/") DistCpTransformation(targetView, List(sourceView.fullPath), target) } def copyToDirToView(sourcePath: String, targetView: View): DistCpTransformation = { val target = targetView.fullPath.split("/").drop(1).mkString("/") DistCpTransformation(targetView, List(sourcePath), target) } def copyToFileToView(sourceFile: String, targetView: View): DistCpTransformation = { DistCpTransformation(targetView, List(sourceFile), targetView.fullPath) } } case class DistCpTransformation(v: View, var sources: List[String], var target: String, deleteViewPath: Boolean = false, config: Configuration = new Configuration()) extends MapreduceBaseTransformation { var directoriesToDelete = if (deleteViewPath) List(v.fullPath) else List() override def stringsToChecksum: List[String] = target :: sources override def fileResourcesToChecksum = List() override val cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation] = (_, __, completionRunState) => completionRunState lazy val job: Job = { val distCp = new DistCp(config, distCpOptions) val createJob = distCp.getClass.getDeclaredMethod("createJob") createJob.setAccessible(true) val job = createJob.invoke(distCp).asInstanceOf[Job] val prepareFileListing = distCp.getClass.getDeclaredMethod("prepareFileListing", job.getClass) prepareFileListing.setAccessible(true) prepareFileListing.invoke(distCp, job) job } def distCpOptions: DistCpOptions = if (configuration.nonEmpty) { DistCpConfiguration .fromConfig(configuration.toMap) .toDistCpOptions(sources.map(new Path(_)), new Path(target)) } else { val s = sources.map(new Path(_)).asJava new DistCpOptions(s, new Path(target)) } }
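A hedged sketch of how the companion helpers above might be called from a view definition; the helper object and both views are hypothetical placeholders.

package org.schedoscope.dsl.transformations

import org.schedoscope.dsl.View

// Hypothetical helper showing how a view definition might request the copy.
object DistCpTransformationUsage {
  // 'stagingView' and 'reportView' are placeholder Schedoscope views defined elsewhere.
  def copyStagingToReport(stagingView: View, reportView: View): DistCpTransformation =
    // Copies the HDFS directory of the staging view next to the report view's own path.
    DistCpTransformation.copyToView(stagingView, reportView)
}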
Example 129
Source File: WholeFileReader.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import java.io.InputStream import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor} import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext} class WholeFileReader extends RecordReader[NullWritable, Text] { private val key = NullWritable.get() private val value = new Text() private var split: FileSplit = _ private var conf: Configuration = _ private var path: Path = _ private var done: Boolean = false override def getProgress: Float = ??? override def nextKeyValue(): Boolean = { if (done){ false } else { val fs = path.getFileSystem(conf) var is: FSDataInputStream = null var in: InputStream = null var decompressor: Decompressor = null try { is = fs.open(split.getPath) val codec = new CompressionCodecFactory(conf).getCodec(path) if (codec != null) { decompressor = CodecPool.getDecompressor(codec) in = codec.createInputStream(is, decompressor) } else { in = is } val result = IOUtils.toByteArray(in) value.clear() value.set(result) done = true true } finally { if (in != null) { IOUtils.closeQuietly(in) } if (decompressor != null) { CodecPool.returnDecompressor(decompressor) } } } } override def getCurrentValue: Text = value override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext): Unit = { this.split = inputSplit.asInstanceOf[FileSplit] this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext) this.path = this.split.getPath } override def getCurrentKey: NullWritable = key override def close() {} }
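The reader above only becomes usable once it is returned from an InputFormat. A minimal sketch follows, assuming a hypothetical WholeFileInputFormat (the project may ship its own) that disables splitting so each file is handed to the reader as a single record.

package magellan.mapreduce

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

// Hypothetical InputFormat wiring WholeFileReader into the new MapReduce API.
class WholeFileInputFormat extends FileInputFormat[NullWritable, Text] {

  // Each file is read as a single record, so splitting must be disabled.
  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def createRecordReader(
      split: InputSplit,
      context: TaskAttemptContext): RecordReader[NullWritable, Text] = new WholeFileReader()
}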
Example 130
Source File: ShxReaderSuite.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import magellan.TestSparkContext import magellan.io.PolygonReader import org.apache.commons.io.EndianUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{ArrayWritable, LongWritable, Text} import org.scalatest.FunSuite class ShxReaderSuite extends FunSuite with TestSparkContext { test("Read shx file") { val path = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shx").getPath val conf = new Configuration() conf.set("mapreduce.input.fileinputformat.split.maxsize", "10000") val data = sc.newAPIHadoopFile( path, classOf[ShxInputFormat], classOf[Text], classOf[ArrayWritable], conf ).map { case (txt: Text, splits: ArrayWritable) => val fileName = txt.toString val s = splits.get() val size = s.length var i = 0 val v = Array.fill(size)(0L) while (i < size) { v.update(i, s(i).asInstanceOf[LongWritable].get()) i += 1 } (fileName, v) } assert(data.count() === 1) val (fileName, splits) = data.first() assert(fileName === "tl_2016_us_state") // the offsets should be correct val firstOffset = splits(0) val secondOffset = splits(1) // skipping to the first offset in the Shapefile should allow me to read the first polygon val shpFilePath = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shp").getPath val fs = FileSystem.get(sc.hadoopConfiguration) var dis = fs.open(new Path(shpFilePath)) // skip firstOffset # of bytes dis.seek(firstOffset) // skip record number assert(dis.readInt() === 1) // read content length var contentLength = 16 * (dis.readInt() + 4) // extract the shape type var shapeType = EndianUtils.swapInteger(dis.readInt()) // expect a Polygon assert(shapeType === 5) // the first polygon's content should follow from here val polygonReader = new PolygonReader() val polygon = polygonReader.readFields(dis) assert(polygon != null) // seek to the second offset dis.seek(secondOffset) assert(dis.readInt() === 2) } }
Example 131
Source File: TikaParquetParser.scala From project-matt with MIT License | 5 votes |
package org.datafy.aws.app.matt.extras import java.io.{File, FileOutputStream, IOException, InputStream} import java.util import scala.collection.JavaConverters._ import org.xml.sax.{ContentHandler, SAXException} import org.apache.tika.metadata.Metadata import org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE import org.apache.tika.mime.MediaType import org.apache.tika.parser.{AbstractParser, ParseContext} import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.hadoop.ParquetReader import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.util.HadoopInputFile import org.apache.parquet.tools.json.JsonRecordFormatter import org.apache.parquet.tools.read.{SimpleReadSupport, SimpleRecord} import org.apache.tika.exception.TikaException import org.apache.tika.sax.XHTMLContentHandler import scala.util.Random class TikaParquetParser extends AbstractParser { // make some stuff here final val PARQUET_RAW = MediaType.application("x-parquet") private val SUPPORTED_TYPES: Set[MediaType] = Set(PARQUET_RAW) def getSupportedTypes(context: ParseContext): util.Set[MediaType] = { SUPPORTED_TYPES.asJava } @throws(classOf[IOException]) @throws(classOf[SAXException]) @throws(classOf[TikaException]) def parse(stream: InputStream, handler: ContentHandler, metadata: Metadata, context: ParseContext): Unit = { // create temp file from stream val fileNamePrefix = Random.alphanumeric.take(5).mkString val tempFile = File.createTempFile(s"parquet-${fileNamePrefix}", ".parquet") IOUtils.copy(stream, new FileOutputStream(tempFile)) val conf = new Configuration() val path = new Path(tempFile.getAbsolutePath) val parquetMetadata = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) var defaultReader: ParquetReader[SimpleRecord] = null val columns = parquetMetadata.getFileMetaData.getSchema.getFields metadata.set(CONTENT_TYPE, PARQUET_RAW.toString) metadata.set("Total Number of Columns", columns.size.toString) metadata.set("Parquet Column Names", columns.toString) val xhtml = new XHTMLContentHandler(handler, metadata) xhtml.startDocument() xhtml.startElement("p") // ::TODO:: ensure parquet reader reads all files not only file row try { defaultReader = ParquetReader.builder(new SimpleReadSupport(), new Path(tempFile.getAbsolutePath)).build() if(defaultReader.read() != null) { val values: SimpleRecord = defaultReader.read() val jsonFormatter = JsonRecordFormatter.fromSchema(parquetMetadata.getFileMetaData.getSchema) val textContent: String = jsonFormatter.formatRecord(values) xhtml.characters(textContent) xhtml.endElement("p") xhtml.endDocument() } } catch { case e: Throwable => e.printStackTrace() if (defaultReader != null) { try { defaultReader.close() } catch{ case _: Throwable => } } } finally { if (tempFile != null) tempFile.delete() } } }
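A hedged sketch of driving the parser above directly with Tika's SAX handlers; the driver object and the Parquet file path are placeholders.

package org.datafy.aws.app.matt.extras

import java.io.FileInputStream

import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext
import org.apache.tika.sax.BodyContentHandler

// Hypothetical driver feeding a local Parquet file through the custom parser.
object TikaParquetParserUsage {
  def main(args: Array[String]): Unit = {
    val parser = new TikaParquetParser()
    val metadata = new Metadata()
    val handler = new BodyContentHandler(-1) // -1 removes the default write limit
    val stream = new FileInputStream("/tmp/sample.parquet") // placeholder path
    try {
      parser.parse(stream, handler, metadata, new ParseContext())
      println(metadata.get("Total Number of Columns"))
      println(handler.toString.take(200))
    } finally {
      stream.close()
    }
  }
}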
Example 132
Source File: TikaHadoopOrcParser.scala From project-matt with MIT License | 5 votes |
package org.datafy.aws.app.matt.extras import java.io.{File, FileOutputStream, IOException, InputStream} import java.util import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.serde2.objectinspector.StructField import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.orc.OrcFile import org.apache.orc.OrcFile.ReaderOptions import org.apache.orc.Reader import org.apache.orc.RecordReader import org.apache.tika.exception.TikaException import org.apache.tika.metadata.Metadata import org.apache.tika.mime.MediaType import org.apache.tika.parser.{AbstractParser, ParseContext} import org.xml.sax.{ContentHandler, SAXException} import scala.util.Random class TikaHadoopOrcParser extends AbstractParser { final val ORC_RAW = MediaType.application("x-orc") private val SUPPORTED_TYPES: Set[MediaType] = Set(ORC_RAW) def getSupportedTypes(context: ParseContext): util.Set[MediaType] = { SUPPORTED_TYPES.asJava } @throws(classOf[IOException]) @throws(classOf[SAXException]) @throws(classOf[TikaException]) def parse(stream: InputStream, handler: ContentHandler, metadata: Metadata, context: ParseContext): Unit = { // create temp file from stream try { val fileNamePrefix = Random.alphanumeric.take(5).mkString val tempFile = File.createTempFile(s"orc-${fileNamePrefix}", ".orc") IOUtils.copy(stream, new FileOutputStream(tempFile)) val path = new Path(tempFile.getAbsolutePath) val conf = new Configuration() val orcReader = OrcFile.createReader(path, new ReaderOptions(conf)) val records: RecordReader = orcReader.rows() val storeRecord = null val firstBlockKey = null } catch { case e: Throwable => e.printStackTrace() } // val fields = } }
Example 133
Source File: WorkbookReader.scala From spark-excel with Apache License 2.0 | 5 votes |
package com.crealytics.spark.excel import java.io.InputStream import com.crealytics.spark.excel.Utils.MapIncluding import com.github.pjfanning.xlsx.StreamingReader import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.poi.ss.usermodel.{Workbook, WorkbookFactory} trait WorkbookReader { protected def openWorkbook(): Workbook def withWorkbook[T](f: Workbook => T): T = { val workbook = openWorkbook() val res = f(workbook) workbook.close() res } def sheetNames: Seq[String] = { withWorkbook( workbook => for (sheetIx <- (0 until workbook.getNumberOfSheets())) yield { workbook.getSheetAt(sheetIx).getSheetName() } ) } } object WorkbookReader { val WithLocationMaxRowsInMemoryAndPassword = MapIncluding(Seq("path"), optionally = Seq("maxRowsInMemory", "workbookPassword")) def apply(parameters: Map[String, String], hadoopConfiguration: Configuration): WorkbookReader = { def readFromHadoop(location: String) = { val path = new Path(location) FileSystem.get(path.toUri, hadoopConfiguration).open(path) } parameters match { case WithLocationMaxRowsInMemoryAndPassword(Seq(location), Seq(Some(maxRowsInMemory), passwordOption)) => new StreamingWorkbookReader(readFromHadoop(location), passwordOption, maxRowsInMemory.toInt) case WithLocationMaxRowsInMemoryAndPassword(Seq(location), Seq(None, passwordOption)) => new DefaultWorkbookReader(readFromHadoop(location), passwordOption) } } } class DefaultWorkbookReader(inputStreamProvider: => InputStream, workbookPassword: Option[String]) extends WorkbookReader { protected def openWorkbook(): Workbook = workbookPassword .fold(WorkbookFactory.create(inputStreamProvider))( password => WorkbookFactory.create(inputStreamProvider, password) ) } class StreamingWorkbookReader(inputStreamProvider: => InputStream, workbookPassword: Option[String], maxRowsInMem: Int) extends WorkbookReader { override protected def openWorkbook(): Workbook = { val builder = StreamingReader .builder() .rowCacheSize(maxRowsInMem) .bufferSize(4096) workbookPassword .fold(builder)(password => builder.password(password)) .open(inputStreamProvider) } }
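A hedged sketch of the factory above listing sheet names from a workbook on HDFS; the driver object and the workbook location are placeholders.

package com.crealytics.spark.excel

import org.apache.hadoop.conf.Configuration

// Hypothetical driver; supplying maxRowsInMemory selects the StreamingWorkbookReader above.
object WorkbookReaderUsage {
  def main(args: Array[String]): Unit = {
    val reader = WorkbookReader(
      Map("path" -> "hdfs:///data/report.xlsx", "maxRowsInMemory" -> "100"),
      new Configuration())
    reader.sheetNames.foreach(println)
  }
}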
Example 134
Source File: FilterHelper.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.utils import org.apache.hadoop.conf.Configuration import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate} import org.apache.parquet.hadoop.ParquetInputFormat import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.datasources.parquet.ParquetFiltersWrapper import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType object FilterHelper { def tryToPushFilters( sparkSession: SparkSession, requiredSchema: StructType, filters: Seq[Filter]): Option[FilterPredicate] = { tryToPushFilters(sparkSession.sessionState.conf, requiredSchema, filters) } def tryToPushFilters( conf: SQLConf, requiredSchema: StructType, filters: Seq[Filter]): Option[FilterPredicate] = { if (conf.parquetFilterPushDown) { filters // Collects all converted Parquet filter predicates. Notice that not all predicates can be // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` // is used here. .flatMap(ParquetFiltersWrapper.createFilter(conf, requiredSchema, _)) .reduceOption(FilterApi.and) } else { None } } def setFilterIfExist(configuration: Configuration, pushed: Option[FilterPredicate]): Unit = { pushed match { case Some(filters) => ParquetInputFormat.setFilterPredicate(configuration, filters) case _ => // do nothing } } }
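A hedged sketch chaining the two helpers above, pushing supported predicates into a Hadoop Configuration for Parquet; the schema and filters mirror those used in the test suite further below.

package org.apache.spark.sql.execution.datasources.oap.utils

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.{EqualTo, GreaterThan}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Hypothetical driver object; pushdown is controlled by spark.sql.parquet.filterPushdown.
object FilterHelperUsage {
  def main(args: Array[String]): Unit = {
    val schema = new StructType()
      .add(StructField("a", IntegerType))
      .add(StructField("b", StringType))
    val filters = Seq(GreaterThan("a", 1), EqualTo("b", "2"))

    val pushed = FilterHelper.tryToPushFilters(SQLConf.get, schema, filters)
    val hadoopConf = new Configuration()
    // No-op when nothing could be pushed down.
    FilterHelper.setFilterIfExist(hadoopConf, pushed)
  }
}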
Example 135
Source File: BitmapReaderV2.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.execution.datasources.oap.filecache.BitmapFiberId import org.apache.spark.sql.execution.datasources.oap.index.impl.IndexFileReaderImpl import org.apache.spark.sql.execution.datasources.oap.utils.{BitmapUtils, OapBitmapWrappedFiberCache} import org.apache.spark.sql.types.StructType private[oap] class BitmapReaderV2( fileReader: IndexFileReaderImpl, intervalArray: ArrayBuffer[RangeInterval], internalLimit: Int, keySchema: StructType, conf: Configuration) extends BitmapReader(fileReader, intervalArray, keySchema, conf) with Iterator[Int] { @transient private var bmRowIdIterator: Iterator[Int] = _ private var bmWfcSeq: Seq[OapBitmapWrappedFiberCache] = _ private var empty: Boolean = _ override def hasNext: Boolean = if (!empty && bmRowIdIterator.hasNext) { true } else { clearCache() false } override def next(): Int = bmRowIdIterator.next() override def toString: String = "BitmapReaderV2" override def clearCache(): Unit = { super.clearCache() if (bmWfcSeq != null) { bmWfcSeq.foreach(wfc => wfc.release) } } private def getDesiredWfcSeq(): Seq[OapBitmapWrappedFiberCache] = { val keySeq = readBmUniqueKeyList(bmUniqueKeyListCache) intervalArray.flatMap{ case range if !range.isNullPredicate => val (startIdx, endIdx) = getKeyIdx(keySeq, range) if (startIdx == -1 || endIdx == -1) { Seq.empty } else { (startIdx until (endIdx + 1)).map(idx => { val curIdxOffset = getIdxOffset(bmOffsetListCache, 0L, idx) val entrySize = getIdxOffset(bmOffsetListCache, 0L, idx + 1) - curIdxOffset val entryFiber = BitmapFiberId(() => fileReader.readFiberCache(curIdxOffset, entrySize), fileReader.getName, BitmapIndexSectionId.entryListSection, idx) new OapBitmapWrappedFiberCache(fiberCacheManager.get(entryFiber)) }) } case range if range.isNullPredicate => val nullListCache = new OapBitmapWrappedFiberCache(fiberCacheManager.get(bmNullListFiber)) if (nullListCache.size != 0) { Seq(nullListCache) } else { Seq.empty } } } def initRowIdIterator(): Unit = { initDesiredSegments() bmWfcSeq = getDesiredWfcSeq if (bmWfcSeq.nonEmpty) { val iterator = BitmapUtils.iterator(bmWfcSeq) bmRowIdIterator = if (internalLimit > 0) iterator.take(internalLimit) else iterator empty = false } else { empty = true } } }
Example 136
Source File: BPlusTreeScanner.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.execution.datasources.oap._ import org.apache.spark.sql.execution.datasources.oap.statistics.StatsAnalysisResult // we scan the index from the smallest to the largest, // this will scan the B+ Tree (index) leaf node. private[oap] class BPlusTreeScanner(idxMeta: IndexMeta) extends IndexScanner(idxMeta) { override def toString(): String = "BPlusTreeScanner" @transient var recordReader: BTreeIndexRecordReader = _ // Set by analyzeStatistics() private var _totalRows: Long = 0 override def totalRows(): Long = _totalRows def initialize(dataPath: Path, conf: Configuration): IndexScanner = { assert(keySchema ne null) val indexPath = IndexUtils.getIndexFilePath( conf, dataPath, meta.name, meta.time) logDebug("Loading Index File: " + indexPath) logDebug("\tFile Size: " + indexPath.getFileSystem(conf).getFileStatus(indexPath).getLen) recordReader = BTreeIndexRecordReader(conf, keySchema, indexPath) recordReader.initialize(indexPath, intervalArray) // For some case, analyzeStatistics will be skipped, so we have to get totalRows here as well. _totalRows = recordReader.totalRows() this } override protected def analyzeStatistics( indexPath: Path, conf: Configuration): StatsAnalysisResult = { var recordReader = BTreeIndexRecordReader(conf, keySchema, indexPath) try { val result = recordReader.analyzeStatistics(keySchema, intervalArray) _totalRows = recordReader.totalRows() result } finally { if (recordReader != null) { recordReader.close() recordReader = null } } } override def hasNext: Boolean = recordReader.hasNext override def next(): Int = recordReader.next() }
Example 137
Source File: BitMapScanner.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.execution.datasources.OapException import org.apache.spark.sql.execution.datasources.oap.IndexMeta import org.apache.spark.sql.execution.datasources.oap.index.OapIndexProperties.IndexVersion import org.apache.spark.sql.execution.datasources.oap.index.impl.IndexFileReaderImpl import org.apache.spark.sql.execution.datasources.oap.statistics.StatsAnalysisResult private[oap] case class BitMapScanner(idxMeta: IndexMeta) extends IndexScanner(idxMeta) { private var _totalRows: Long = 0 @transient private var bmRowIdIterator: Iterator[Int] = _ override def hasNext: Boolean = bmRowIdIterator.hasNext override def next(): Int = bmRowIdIterator.next override def totalRows(): Long = _totalRows // TODO: If the index file is not changed, bypass the repetitive initialization for queries. override def initialize(dataPath: Path, conf: Configuration): IndexScanner = { assert(keySchema ne null) // Currently OAP index type supports the column with one single field. assert(keySchema.fields.length == 1) val indexPath = IndexUtils.getIndexFilePath( conf, dataPath, meta.name, meta.time) val fileReader = IndexFileReaderImpl(conf, indexPath) val bitmapReader = IndexUtils.readVersion(fileReader) match { case Some(version) => IndexVersion(version) match { case IndexVersion.OAP_INDEX_V1 => val reader = new BitmapReaderV1( fileReader, intervalArray, internalLimit, keySchema, conf) reader.initRowIdIterator bmRowIdIterator = reader reader case IndexVersion.OAP_INDEX_V2 => val reader = new BitmapReaderV2( fileReader, intervalArray, internalLimit, keySchema, conf) reader.initRowIdIterator bmRowIdIterator = reader reader } case None => throw new OapException("not a valid index file") } _totalRows = bitmapReader.totalRows fileReader.close() this } override protected def analyzeStatistics( idxPath: Path, conf: Configuration): StatsAnalysisResult = { val fileReader = IndexFileReaderImpl(conf, idxPath) val reader = BitmapReader(fileReader, intervalArray, keySchema, conf) _totalRows = reader.totalRows try { reader.analyzeStatistics() } finally { fileReader.close() } } override def toString: String = "BitMapScanner" }
Example 138
Source File: IndexFileWriterImpl.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index.impl import java.io.OutputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.execution.datasources.oap.index.IndexFileWriter private[index] case class IndexFileWriterImpl( configuration: Configuration, indexPath: Path) extends IndexFileWriter { protected override val os: OutputStream = indexPath.getFileSystem(configuration).create(indexPath, true) // Give RecordWriter a chance which file it's writing to. override def getName: String = indexPath.toString override def tempRowIdWriter: IndexFileWriter = { val tempFileName = new Path(indexPath.getParent, indexPath.getName + ".id") IndexFileWriterImpl(configuration, tempFileName) } override def writeRowId(tempWriter: IndexFileWriter): Unit = { val path = new Path(tempWriter.getName) val is = path.getFileSystem(configuration).open(path) val length = path.getFileSystem(configuration).getFileStatus(path).getLen val bufSize = configuration.getInt("io.file.buffer.size", 4096) val bytes = new Array[Byte](bufSize) var remaining = length while (remaining > 0) { val readSize = math.min(bufSize, remaining).toInt is.readFully(bytes, 0, readSize) os.write(bytes, 0, readSize) remaining -= readSize } is.close() path.getFileSystem(configuration).delete(path, false) } }
package org.apache.spark.sql.execution.datasources.oap.index.impl import java.io.OutputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.execution.datasources.oap.index.IndexFileWriter private[index] case class IndexFileWriterImpl( configuration: Configuration, indexPath: Path) extends IndexFileWriter { protected override val os: OutputStream = indexPath.getFileSystem(configuration).create(indexPath, true) // Give RecordWriter a chance to know which file it's writing to. override def getName: String = indexPath.toString override def tempRowIdWriter: IndexFileWriter = { val tempFileName = new Path(indexPath.getParent, indexPath.getName + ".id") IndexFileWriterImpl(configuration, tempFileName) } override def writeRowId(tempWriter: IndexFileWriter): Unit = { val path = new Path(tempWriter.getName) val is = path.getFileSystem(configuration).open(path) val length = path.getFileSystem(configuration).getFileStatus(path).getLen val bufSize = configuration.getInt("io.file.buffer.size", 4096) val bytes = new Array[Byte](bufSize) var remaining = length while (remaining > 0) { val readSize = math.min(bufSize, remaining).toInt is.readFully(bytes, 0, readSize) os.write(bytes, 0, readSize) remaining -= readSize } is.close() path.getFileSystem(configuration).delete(path, false) } }
Example 139
Source File: StatisticsType.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.statistics import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.types.StructType private[oap] object StatisticsType { val TYPE_MIN_MAX: Int = 0 val TYPE_SAMPLE_BASE: Int = 1 val TYPE_PART_BY_VALUE: Int = 2 val TYPE_BLOOM_FILTER: Int = 3 def unapply(t: Int): Option[StructType => StatisticsReader] = t match { case TYPE_MIN_MAX => Some(new MinMaxStatisticsReader(_)) case TYPE_SAMPLE_BASE => Some(new SampleBasedStatisticsReader(_)) case TYPE_PART_BY_VALUE => Some(new PartByValueStatisticsReader(_)) case TYPE_BLOOM_FILTER => Some(new BloomFilterStatisticsReader(_)) case _ => None } def unapply(name: String): Option[(StructType, Configuration) => StatisticsWriter] = name match { case "MINMAX" => Some((schema: StructType, conf: Configuration) => new MinMaxStatisticsWriter(schema, conf)) case "SAMPLE" => Some((schema: StructType, conf: Configuration) => new SampleBasedStatisticsWriter(schema, conf)) case "PARTBYVALUE" => Some((schema: StructType, conf: Configuration) => new PartByValueStatisticsWriter(schema, conf)) case "BLOOM" => Some((schema: StructType, conf: Configuration) => new BloomFilterStatisticsWriter(schema, conf)) case _ => None } }
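A hedged sketch of how the two extractors above might resolve readers and writers from a type code or a configured name; the driver object and 'keySchema' are assumptions standing in for the OAP index key schema.

package org.apache.spark.sql.execution.datasources.oap.statistics

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.types.{IntegerType, StructType}

// Hypothetical sketch of the pattern-matching style used with StatisticsType.
object StatisticsTypeUsage {
  def main(args: Array[String]): Unit = {
    val keySchema = new StructType().add("key", IntegerType) // assumed index key schema
    val conf = new Configuration()

    // Resolve a reader from the numeric type code stored in an index file.
    val reader = StatisticsType.TYPE_MIN_MAX match {
      case StatisticsType(build) => Some(build(keySchema))
      case _ => None
    }

    // Resolve a writer from a statistics name found in configuration.
    val writer = "MINMAX" match {
      case StatisticsType(build) => Some(build(keySchema, conf))
      case _ => None
    }

    println(s"reader defined: ${reader.isDefined}, writer defined: ${writer.isDefined}")
  }
}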
Example 140
Source File: ParquetFiberDataLoader.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import java.io.IOException import java.time.ZoneId import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.ParquetFiberDataReader import org.apache.parquet.hadoop.api.InitContext import org.apache.parquet.hadoop.utils.Collections3 import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache import org.apache.spark.sql.execution.datasources.parquet.{ParquetReadSupportWrapper, VectorizedColumnReader} import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.oap.OapRuntime import org.apache.spark.sql.types._ private[oap] case class ParquetFiberDataLoader( configuration: Configuration, reader: ParquetFiberDataReader, blockId: Int) { @throws[IOException] def loadSingleColumn: FiberCache = { val footer = reader.getFooter val fileSchema = footer.getFileMetaData.getSchema val fileMetadata = footer.getFileMetaData.getKeyValueMetaData val readContext = new ParquetReadSupportWrapper() .init(new InitContext(configuration, Collections3.toSetMultiMap(fileMetadata), fileSchema)) val requestedSchema = readContext.getRequestedSchema val sparkRequestedSchemaString = configuration.get(ParquetReadSupportWrapper.SPARK_ROW_REQUESTED_SCHEMA) val sparkSchema = StructType.fromString(sparkRequestedSchemaString) assert(sparkSchema.length == 1, s"Only can get single column every time " + s"by loadSingleColumn, the columns = ${sparkSchema.mkString}") val dataType = sparkSchema.fields(0).dataType // Notes: rowIds is IntegerType in oap index. val rowCount = reader.getFooter.getBlocks.get(blockId).getRowCount.toInt val columnDescriptor = requestedSchema.getColumns.get(0) val originalType = requestedSchema.asGroupType.getFields.get(0).getOriginalType val blockMetaData = footer.getBlocks.get(blockId) val fiberData = reader.readFiberData(blockMetaData, columnDescriptor) val columnReader = new VectorizedColumnReader(columnDescriptor, originalType, fiberData.getPageReader(columnDescriptor), ZoneId.systemDefault, true) if (OapRuntime.getOrCreate.fiberCacheManager.dataCacheCompressEnable) { ParquetDataFiberCompressedWriter.dumpToCache( columnReader, rowCount, dataType) } else { val column = new OnHeapColumnVector(rowCount, dataType) columnReader.readBatch(rowCount, column) ParquetDataFiberWriter.dumpToCache( column.asInstanceOf[OnHeapColumnVector], rowCount) } } }
Example 141
Source File: CodecFactory.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream} import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.compress.{CodecPool, CompressionCodec} import org.apache.hadoop.util.ReflectionUtils import org.apache.parquet.format.{CompressionCodec => ParquetCodec} import org.apache.parquet.hadoop.metadata.CompressionCodecName // This is a simple version of parquet's CodeFactory. // TODO: [linhong] Need change this into Scala Code style private[oap] class CodecFactory(conf: Configuration) { private val compressors = new mutable.HashMap[ParquetCodec, BytesCompressor] private val decompressors = new mutable.HashMap[ParquetCodec, BytesDecompressor] private val codecByName = new mutable.HashMap[String, CompressionCodec] private def getCodec(codecString: String): Option[CompressionCodec] = { codecByName.get(codecString) match { case Some(codec) => Some(codec) case None => val codecName = CompressionCodecName.valueOf(codecString) val codecClass = codecName.getHadoopCompressionCodecClass if (codecClass == null) { None } else { val codec = ReflectionUtils.newInstance(codecClass, conf).asInstanceOf[CompressionCodec] codecByName.put(codecString, codec) Some(codec) } } } def getCompressor(codec: ParquetCodec): BytesCompressor = { compressors.getOrElseUpdate(codec, new BytesCompressor(getCodec(codec.name))) } def getDecompressor(codec: ParquetCodec): BytesDecompressor = { decompressors.getOrElseUpdate(codec, new BytesDecompressor(getCodec(codec.name))) } def release(): Unit = { compressors.values.foreach(_.release()) compressors.clear() decompressors.values.foreach(_.release()) decompressors.clear() } } private[oap] class BytesCompressor(compressionCodec: Option[CompressionCodec]) { private lazy val compressedOutBuffer = new ByteArrayOutputStream() private lazy val compressor = compressionCodec match { case Some(codec) => CodecPool.getCompressor(codec) case None => null } def compress(bytes: Array[Byte]): Array[Byte] = { compressionCodec match { case Some(codec) => compressedOutBuffer.reset() // null compressor for non-native gzip if (compressor != null) { compressor.reset() } val cos = codec.createOutputStream(compressedOutBuffer, compressor) cos.write(bytes) cos.finish() cos.close() compressedOutBuffer.toByteArray case None => bytes } } def release(): Unit = CodecPool.returnCompressor(compressor) } private[oap] class BytesDecompressor(compressionCodec: Option[CompressionCodec]) { private lazy val decompressor = compressionCodec match { case Some(codec) => CodecPool.getDecompressor(codec) case None => null } def decompress(bytes: Array[Byte], uncompressedSize: Int): Array[Byte] = { compressionCodec match { case Some(codec) => decompressor.reset() val cis = codec.createInputStream(new ByteArrayInputStream(bytes), decompressor) val decompressed = new Array[Byte](uncompressedSize) new DataInputStream(cis).readFully(decompressed) decompressed case None => bytes } } def release(): Unit = CodecPool.returnDecompressor(decompressor) }
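A hedged compress/decompress round-trip sketch using the factory above; it assumes the Snappy codec classes are on the classpath, as they normally are with a Spark/Hadoop distribution, and the driver object is a placeholder.

package org.apache.spark.sql.execution.datasources.oap.io

import java.nio.charset.StandardCharsets

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.format.CompressionCodec

// Hypothetical driver performing a round trip through compressor and decompressor.
object CodecFactoryUsage {
  def main(args: Array[String]): Unit = {
    val factory = new CodecFactory(new Configuration())
    val bytes = "hello oap".getBytes(StandardCharsets.UTF_8)

    val compressed = factory.getCompressor(CompressionCodec.SNAPPY).compress(bytes)
    val restored = factory.getDecompressor(CompressionCodec.SNAPPY)
      .decompress(compressed, bytes.length)

    assert(restored.sameElements(bytes))
    factory.release()
  }
}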
Example 142
Source File: OrcDataFileMeta.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FSDataInputStream import org.apache.hadoop.fs.Path import org.apache.orc.OrcFile import org.apache.orc.Reader import org.apache.orc.mapred.OrcInputFormat private[oap] class OrcDataFileMeta(val path: Path, val configuration: Configuration) extends DataFileMeta { val fs = path.getFileSystem(configuration) private val readerOptions = OrcFile.readerOptions(configuration).filesystem(fs) private val fileReader = OrcFile.createReader(path, readerOptions) val length = fs.getFileStatus(path).getLen // val options: Reader.Options = OrcInputFormat.buildOptions(configuration, fileReader, 0, length) // Record reader from ORC row batch. // val recordReader = fileReader.rows(options) def getOrcFileReader(): Reader = fileReader val listStripeInformation = fileReader.getStripes() def numberOfRows: Long = fileReader.getNumberOfRows() override def len: Long = fileReader.getContentLength() override def getGroupCount: Int = fileReader.getStripes().size() override def getFieldCount: Int = fileReader.getSchema().getFieldNames().size() // Not used by orc data file. override def fin: FSDataInputStream = null }
Example 143
Source File: ParquetDataFileMeta.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, Path} import org.apache.hadoop.util.StringUtils import org.apache.parquet.hadoop.OapParquetFileReader import org.apache.parquet.hadoop.metadata.ParquetFooter private[oap] class ParquetDataFileMeta(val footer: ParquetFooter) extends DataFileMeta { require(footer != null, "footer of ParquetDataFileMeta should not be null.") override def fin: FSDataInputStream = null override def len: Long = 0 override def getGroupCount: Int = footer.getBlocks.size() override def getFieldCount: Int = footer.getFileMetaData.getSchema.getColumns.size() } private[oap] object ParquetDataFileMeta { def apply(conf: Configuration, pathString: String): ParquetDataFileMeta = { val path = new Path(StringUtils.unEscapeString(pathString)) new ParquetDataFileMeta(OapParquetFileReader.readParquetFooter(conf, path)) } }
Example 144
Source File: ParquetReadSupportWrapper.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import java.util.{Map => JMap} import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.hadoop.api.ReadSupport.ReadContext import org.apache.parquet.io.api.RecordMaterializer import org.apache.parquet.schema._ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow override def prepareForRead( conf: Configuration, keyValueMetaData: JMap[String, String], fileSchema: MessageType, readContext: ReadContext): RecordMaterializer[InternalRow] = { readSupport.prepareForRead(conf, keyValueMetaData, fileSchema, readContext) } } object ParquetReadSupportWrapper { // Proxy ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA value. val SPARK_ROW_REQUESTED_SCHEMA: String = ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA }
Example 145
Source File: FilterHelperSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.utils import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.ParquetInputFormat import org.apache.spark.SparkFunSuite import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ class FilterHelperSuite extends SparkFunSuite { val conf = SQLConf.get test("Pushed And Set") { val requiredSchema = new StructType() .add(StructField("a", IntegerType)) .add(StructField("b", StringType)) val filters = Seq(GreaterThan("a", 1), EqualTo("b", "2")) val expected = s"""and(gt(a, 1), eq(b, Binary{"2"}))""" conf.setConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED, true) val pushed = FilterHelper.tryToPushFilters(conf, requiredSchema, filters) assert(pushed.isDefined) assert(pushed.get.toString.equals(expected)) val config = new Configuration() FilterHelper.setFilterIfExist(config, pushed) val humanReadable = config.get(ParquetInputFormat.FILTER_PREDICATE + ".human.readable") assert(humanReadable.nonEmpty) assert(humanReadable.equals(expected)) } test("Not Pushed") { val requiredSchema = new StructType() .add(StructField("a", IntegerType)) .add(StructField("b", StringType)) val filters = Seq(GreaterThan("a", 1), EqualTo("b", "2")) conf.setConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED, false) val pushed = FilterHelper.tryToPushFilters(conf, requiredSchema, filters) assert(pushed.isEmpty) val config = new Configuration() FilterHelper.setFilterIfExist(config, pushed) assert(config.get(ParquetInputFormat.FILTER_PREDICATE) == null) assert(config.get(ParquetInputFormat.FILTER_PREDICATE + ".human.readable") == null) } }
Example 146
Source File: OapBitmapWrappedFiberCacheSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.utils import java.io.{ByteArrayOutputStream, DataOutputStream, FileOutputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, Path} import org.roaringbitmap.RoaringBitmap import org.apache.spark.sql.QueryTest import org.apache.spark.sql.execution.datasources.OapException import org.apache.spark.sql.execution.datasources.oap.filecache.{BitmapFiberId, FiberCache} import org.apache.spark.sql.oap.OapRuntime import org.apache.spark.sql.test.oap.SharedOapContext import org.apache.spark.util.Utils // Below are used to test the functionality of OapBitmapWrappedFiberCache class. class OapBitmapWrappedFiberCacheSuite extends QueryTest with SharedOapContext { private def loadRbFile(fin: FSDataInputStream, offset: Long, size: Int): FiberCache = OapRuntime.getOrCreate.fiberCacheManager.toIndexFiberCache(fin, offset, size) test("test the functionality of OapBitmapWrappedFiberCache class") { val CHUNK_SIZE = 1 << 16 val dataForRunChunk = (1 to 9).toSeq val dataForArrayChunk = Seq(1, 3, 5, 7, 9) val dataForBitmapChunk = (1 to 10000).filter(_ % 2 == 1) val dataCombination = dataForBitmapChunk ++ dataForArrayChunk ++ dataForRunChunk val dataArray = Array(dataForRunChunk, dataForArrayChunk, dataForBitmapChunk, dataCombination) dataArray.foreach(dataIdx => { val dir = Utils.createTempDir() val rb = new RoaringBitmap() dataIdx.foreach(rb.add) val rbFile = dir.getAbsolutePath + "rb.bin" rb.runOptimize() val rbFos = new FileOutputStream(rbFile) val rbBos = new ByteArrayOutputStream() val rbDos = new DataOutputStream(rbBos) rb.serialize(rbDos) rbBos.writeTo(rbFos) rbBos.close() rbDos.close() rbFos.close() val rbPath = new Path(rbFile.toString) val conf = new Configuration() val fin = rbPath.getFileSystem(conf).open(rbPath) val rbFileSize = rbPath.getFileSystem(conf).getFileStatus(rbPath).getLen val rbFiber = BitmapFiberId( () => loadRbFile(fin, 0L, rbFileSize.toInt), rbPath.toString, 0, 0) val rbWfc = new OapBitmapWrappedFiberCache( OapRuntime.getOrCreate.fiberCacheManager.get(rbFiber)) rbWfc.init val chunkLength = rbWfc.getTotalChunkLength val length = dataIdx.size / CHUNK_SIZE assert(chunkLength == (length + 1)) val chunkKeys = rbWfc.getChunkKeys assert(chunkKeys(0).toInt == 0) rbWfc.setOffset(0) val chunk = rbWfc.getIteratorForChunk(0) chunk match { case RunChunkIterator(rbWfc) => assert(chunk == RunChunkIterator(rbWfc)) case ArrayChunkIterator(rbWfc, 0) => assert(chunk == ArrayChunkIterator(rbWfc, 0)) case BitmapChunkIterator(rbWfc) => assert(chunk == BitmapChunkIterator(rbWfc)) case _ => throw new OapException("unexpected chunk in OapBitmapWrappedFiberCache.") } rbWfc.release fin.close dir.delete }) } }
Example 147
Source File: CodecFactorySuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.conf.Configuration import org.apache.parquet.format.CompressionCodec import org.scalacheck.{Arbitrary, Gen, Properties} import org.scalacheck.Prop.forAllNoShrink import org.scalatest.prop.Checkers import org.apache.spark.SparkFunSuite import org.apache.spark.sql.execution.datasources.oap.adapter.PropertiesAdapter class CodecFactoryCheck extends Properties("CodecFactory") { private val codecFactory = new CodecFactory(new Configuration()) private val gen = Gen.sized { size => for { codec <- Arbitrary.arbitrary[CompressionCodec] times <- Gen.posNum[Int] bytes <- Gen.containerOfN[Array, Byte](size * 100, Arbitrary.arbitrary[Byte]) } yield (codec, times, bytes) } property("compress/decompress") = forAllNoShrink(gen) { // Array[Array[Byte]] means one group of fibers' data case (codec, times, bytes) => val compressor = codecFactory.getCompressor(codec) val decompressor = codecFactory.getDecompressor(codec) (0 until times).forall(_ => decompressor.decompress(compressor.compress(bytes), bytes.length) .sameElements(bytes)) } implicit lazy val arbCompressionCodec: Arbitrary[CompressionCodec] = { Arbitrary(genCompressionCodec) } private lazy val genCompressionCodec: Gen[CompressionCodec] = Gen.oneOf( CompressionCodec.UNCOMPRESSED, CompressionCodec.GZIP, CompressionCodec.SNAPPY, CompressionCodec.LZO) } class CodecFactorySuite extends SparkFunSuite with Checkers { test("Check CodecFactory Compress/Decompress") { check(PropertiesAdapter.getProp(new CodecFactoryCheck())) } }
Example 148
Source File: TestDataFile.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType private[oap] case class TestDataFile(path: String, schema: StructType, configuration: Configuration) extends DataFile { override def iterator( requiredIds: Array[Int], filters: Seq[Filter]): OapCompletionIterator[Any] = new OapCompletionIterator(Iterator.empty, {}) override def iteratorWithRowIds( requiredIds: Array[Int], rowIds: Array[Int], filters: Seq[Filter]): OapCompletionIterator[Any] = new OapCompletionIterator(Iterator.empty, {}) override def totalRows(): Long = 0 override def getDataFileMeta(): DataFileMeta = throw new UnsupportedOperationException override def cache(groupId: Int, fiberId: Int): FiberCache = throw new UnsupportedOperationException }
Example 149
Source File: DataFileSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.QueryTest import org.apache.spark.sql.execution.datasources.OapException import org.apache.spark.sql.execution.datasources.oap.OapFileFormat import org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase import org.apache.spark.sql.test.oap.SharedOapContext import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils class DataFileSuite extends QueryTest with SharedOapContext { override def beforeEach(): Unit = { val path = Utils.createTempDir().getAbsolutePath } // Override afterEach because OapDataFile will open a InputStream for OapDataFileMeta // but no method to manual close it and we can not to check open streams. override def afterEach(): Unit = {} test("apply and cache") { val data = (0 to 10).map(i => (i, (i + 'a').toChar.toString)) val schema = new StructType() val config = new Configuration() withTempPath { dir => val df = spark.createDataFrame(data) df.repartition(1).write.format("oap").save(dir.getAbsolutePath) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) val datafile = DataFile(file, schema, OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME, config) assert(datafile.path == file) assert(datafile.schema == schema) assert(datafile.configuration == config) } withTempPath { dir => val df = spark.createDataFrame(data) df.repartition(1).write.parquet(dir.getAbsolutePath) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) val datafile = DataFile(file, schema, OapFileFormat.PARQUET_DATA_FILE_CLASSNAME, config) assert(datafile.path == file) assert(datafile.schema == schema) assert(datafile.configuration == config) } withTempPath { dir => val df = spark.createDataFrame(data) df.repartition(1).write.format("orc").save(dir.getAbsolutePath) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) val datafile = DataFile(file, schema, OapFileFormat.ORC_DATA_FILE_CLASSNAME, config) assert(datafile.path == file) assert(datafile.schema == schema) assert(datafile.configuration == config) } // DataFile object is global. After OrcDataFile is added, then need to change to 3 if // we run the whole tests. assert(DataFile.cachedConstructorCount == 3) intercept[OapException] { DataFile("nofile", schema, "NotExistClass", config) assert(DataFile.cachedConstructorCount == 2) } } test("DataFile equals") { val data = (0 to 10).map(i => (i, (i + 'a').toChar.toString)) val schema = new StructType() val config = new Configuration() withTempPath { dir => val df = spark.createDataFrame(data) df.repartition(1).write.parquet(dir.getAbsolutePath) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) val datafile1 = DataFile(file, schema, OapFileFormat.PARQUET_DATA_FILE_CLASSNAME, config) val datafile2 = DataFile(file, schema, OapFileFormat.PARQUET_DATA_FILE_CLASSNAME, config) assert(datafile1.equals(datafile2)) assert(datafile1.hashCode() == datafile2.hashCode()) } withTempPath { dir => val df = spark.createDataFrame(data) df.repartition(1).write.format("oap").save(dir.getAbsolutePath) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) val datafile1 = DataFile(file, schema, OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME, config) val datafile2 = DataFile(file, schema, OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME, config) assert(datafile1.equals(datafile2)) assert(datafile1.hashCode() == datafile2.hashCode()) } } }
Example 150
Source File: SharedOapContext.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test.oap import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.{OapExtensions, SparkSession} import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec, SparkPlan} import org.apache.spark.sql.execution.datasources.oap.{IndexType, OapFileFormat} import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.oap.{OapDriverRuntime, OapRuntime} import org.apache.spark.sql.test.OapSharedSQLContext trait SharedOapContext extends SharedOapContextBase { protected override def createSparkSession: SparkSession = { SparkSession.cleanupAnyExistingSession() val session = SparkSession.builder() .master("local[2]") .appName("test-oap-context") .config(oapSparkConf).getOrCreate() OapRuntime.getOrCreate.asInstanceOf[OapDriverRuntime].setTestSession(session) session } } protected def withFileSystem(f: FileSystem => Unit): Unit = { var fs: FileSystem = null try { fs = FileSystem.get(configuration) f(fs) } finally { if (fs != null) { fs.close() } } } } case class TestPartition(key: String, value: String) case class TestIndex( tableName: String, indexName: String, partitions: TestPartition*)
Example 151
Source File: ArrowFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.arrow import scala.collection.JavaConverters._ import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat.UnsafeItr import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions} import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._ import org.apache.arrow.dataset.scanner.ScanOptions import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileStatus import org.apache.hadoop.mapreduce.Job import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils import org.apache.spark.sql.sources.{DataSourceRegister, Filter} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap; class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable { val batchSize = 4096 def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = { ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava)) } override def inferSchema( sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { convert(files, options) } override def prepareWrite( sparkSession: SparkSession, job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = { throw new UnsupportedOperationException("Write is not supported for Arrow source") } override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true override def buildReaderWithPartitionValues(sparkSession: SparkSession, dataSchema: StructType, partitionSchema: StructType, requiredSchema: StructType, filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { (file: PartitionedFile) => { val sqlConf = sparkSession.sessionState.conf; val enableFilterPushDown = sqlConf.arrowFilterPushDown val factory = ArrowUtils.makeArrowDiscovery( file.filePath, new ArrowOptions( new CaseInsensitiveStringMap( options.asJava).asScala.toMap)) // todo predicate validation / pushdown val dataset = factory.finish(); val filter = if (enableFilterPushDown) { ArrowFilters.translateFilters(filters) } else { org.apache.arrow.dataset.filter.Filter.EMPTY } val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray, filter, batchSize) val scanner = dataset.newScan(scanOptions) val itrList = scanner .scan() .iterator() .asScala .map(task => task.scan()) .toList val itr = itrList .toIterator .flatMap(itr => itr.asScala) .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema)) new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]] } } override def shortName(): String = "arrow" } object ArrowFileFormat { class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] { override def hasNext: Boolean = delegate.hasNext override def next(): T = delegate.next() } }
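Because the class above registers the short name "arrow" through DataSourceRegister, it can be addressed from the DataFrame reader. A hedged sketch, assuming the Arrow data source jar is on the session classpath; the input path is a placeholder.

import org.apache.spark.sql.SparkSession

// Hypothetical driver reading Arrow files through the registered short name.
object ArrowReadUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("arrow-read-example")
      .getOrCreate()

    // Placeholder path to a directory of Arrow files.
    val df = spark.read.format("arrow").load("/tmp/arrow-data")
    df.show(10)
    spark.stop()
  }
}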
Example 152
Source File: WithHDFSSupport.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileUtil import org.apache.hadoop.hdfs.MiniDFSCluster import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, Suite} trait WithHDFSSupport extends BeforeAndAfterAll { self: Suite => protected var sparkSession: SparkSession = _ private var hdfsCluster: MiniDFSCluster = _ protected var hdfsURI: String = _ private def cleanupAnyExistingSession(): Unit = { val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession) if (session.isDefined) { session.get.sessionState.catalog.reset() session.get.stop() SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } } override protected def beforeAll(): Unit = { super.beforeAll() cleanupAnyExistingSession() val baseDir = new File("./target/hdfs/").getAbsoluteFile() FileUtil.fullyDelete(baseDir) val conf = new Configuration() conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath()) val builder = new MiniDFSCluster.Builder(conf) hdfsCluster = builder.build() hdfsURI = s"hdfs://localhost:${hdfsCluster.getNameNodePort()}/" sparkSession = SparkSession.builder() .master("local") .appName(this.getClass.getCanonicalName) .enableHiveSupport() .config("spark.hadoop.fs.defaultFS", hdfsURI) .config("spark.ui.enabled", "false") .getOrCreate() } override protected def afterAll(): Unit = { try { sparkSession.sessionState.catalog.reset() sparkSession.stop() SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } finally { sparkSession = null } System.clearProperty("spark.driver.port") hdfsCluster.shutdown(true) super.afterAll() } }
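A hedged sketch of a suite that mixes in the trait above to get a mini-DFS-backed SparkSession; the suite name and output path are illustrative.

package com.hortonworks.spark.atlas

import org.scalatest.FunSuite

// Hypothetical suite relying on sparkSession and hdfsURI provided by WithHDFSSupport.
class HdfsBackedExampleSuite extends FunSuite with WithHDFSSupport {
  test("writes and reads a Parquet table on the mini HDFS cluster") {
    val spark = sparkSession
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")
    // fs.defaultFS points at hdfsURI, so this path lands on the mini cluster.
    df.write.mode("overwrite").parquet("/tmp/example")

    assert(spark.read.parquet("/tmp/example").count() == 2)
  }
}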
Example 153
Source File: HadoopConfig.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} import scala.language.implicitConversions import org.apache.hadoop.conf.Configuration import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.util.Constants._ class HadoopConfig(config: UserConfig) { def withHadoopConf(conf: Configuration): UserConfig = { config.withBytes(HADOOP_CONF, serializeHadoopConf(conf)) } def hadoopConf: Configuration = deserializeHadoopConf(config.getBytes(HADOOP_CONF).get) private def serializeHadoopConf(conf: Configuration): Array[Byte] = { val out = new ByteArrayOutputStream() val dataOut = new DataOutputStream(out) conf.write(dataOut) dataOut.close() out.toByteArray } private def deserializeHadoopConf(bytes: Array[Byte]): Configuration = { val in = new ByteArrayInputStream(bytes) val dataIn = new DataInputStream(in) val result = new Configuration() result.readFields(dataIn) dataIn.close() result } } object HadoopConfig { def empty: HadoopConfig = new HadoopConfig(UserConfig.empty) def apply(config: UserConfig): HadoopConfig = new HadoopConfig(config) implicit def userConfigToHadoopConfig(userConf: UserConfig): HadoopConfig = { HadoopConfig(userConf) } }
Example 154
Source File: SequenceFileIO.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import org.apache.hadoop.conf.Configuration import org.slf4j.Logger import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.cluster.client.ClientContext import org.apache.gearpump.cluster.main.{ArgumentsParser, CLIOption, ParseResult} import org.apache.gearpump.streaming.partitioner.ShufflePartitioner import org.apache.gearpump.streaming.examples.fsio.HadoopConfig._ import org.apache.gearpump.streaming.{Processor, StreamApplication} import org.apache.gearpump.util.Graph._ import org.apache.gearpump.util.{AkkaApp, Graph, LogUtil} object SequenceFileIO extends AkkaApp with ArgumentsParser { private val LOG: Logger = LogUtil.getLogger(getClass) override val options: Array[(String, CLIOption[Any])] = Array( "source" -> CLIOption[Int]("<sequence file reader number>", required = false, defaultValue = Some(1)), "sink" -> CLIOption[Int]("<sequence file writer number>", required = false, defaultValue = Some(1)), "input" -> CLIOption[String]("<input file path>", required = true), "output" -> CLIOption[String]("<output file directory>", required = true) ) def application(config: ParseResult): StreamApplication = { val spoutNum = config.getInt("source") val boltNum = config.getInt("sink") val input = config.getString("input") val output = config.getString("output") val appConfig = UserConfig.empty.withString(SeqFileStreamProducer.INPUT_PATH, input) .withString(SeqFileStreamProcessor.OUTPUT_PATH, output) val hadoopConfig = appConfig.withHadoopConf(new Configuration()) val partitioner = new ShufflePartitioner() val streamProducer = Processor[SeqFileStreamProducer](spoutNum) val streamProcessor = Processor[SeqFileStreamProcessor](boltNum) val app = StreamApplication("SequenceFileIO", Graph(streamProducer ~ partitioner ~> streamProcessor), hadoopConfig) app } override def main(akkaConf: Config, args: Array[String]): Unit = { val config = parse(args) val context = ClientContext(akkaConf) val appId = context.submit(application(config)) context.close() } }
Example 155
Source File: HadoopConfigSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import org.apache.hadoop.conf.Configuration import org.scalatest.{Matchers, WordSpec} import org.apache.gearpump.cluster.UserConfig class HadoopConfigSpec extends WordSpec with Matchers { "HadoopConfig" should { "serialize and deserialize hadoop configuration properly" in { val hadoopConf = new Configuration() val key = "test_key" val value = "test_value" hadoopConf.set(key, value) val user = UserConfig.empty import org.apache.gearpump.streaming.examples.fsio.HadoopConfig._ assert(user.withHadoopConf(hadoopConf).hadoopConf.get(key) == value) } } }
Example 156
Source File: SeqFileStreamProcessorSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.io.File import java.time.Instant import scala.collection.mutable.ArrayBuffer import akka.actor.ActorSystem import akka.testkit.TestProbe import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile.Reader import org.apache.hadoop.io.{SequenceFile, Text} import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.prop.PropertyChecks import org.scalatest.{BeforeAndAfter, Matchers, PropSpec} import org.apache.gearpump.Message import org.apache.gearpump.cluster.{TestUtil, UserConfig} import org.apache.gearpump.streaming.task.TaskId import org.apache.gearpump.streaming.{MockUtil, Processor} class SeqFileStreamProcessorSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfter { val kvPairs = new ArrayBuffer[(String, String)] val outputDirectory = "SeqFileStreamProcessor_Test" val sequenceFilePath = new Path(outputDirectory + File.separator + TaskId(0, 0)) val hadoopConf = new Configuration() val fs = FileSystem.get(hadoopConf) val textClass = new Text().getClass val _key = new Text() val _value = new Text() val kvGenerator = for { key <- Gen.alphaStr value <- Gen.alphaStr } yield (key, value) before { implicit val system1 = ActorSystem("SeqFileStreamProcessor", TestUtil.DEFAULT_CONFIG) val system2 = ActorSystem("Reporter", TestUtil.DEFAULT_CONFIG) val watcher = TestProbe()(system1) val conf = HadoopConfig(UserConfig.empty.withString(SeqFileStreamProcessor.OUTPUT_PATH, outputDirectory)).withHadoopConf(new Configuration()) val context = MockUtil.mockTaskContext val processorDescription = Processor.ProcessorToProcessorDescription(id = 0, Processor[SeqFileStreamProcessor](1)) val taskId = TaskId(0, 0) when(context.taskId).thenReturn(taskId) val processor = new SeqFileStreamProcessor(context, conf) processor.onStart(Instant.EPOCH) forAll(kvGenerator) { kv => val (key, value) = kv kvPairs.append((key, value)) processor.onNext(Message(key + "++" + value)) } processor.onStop() } property("SeqFileStreamProcessor should write the key-value pairs to a sequence file") { val reader = new SequenceFile.Reader(hadoopConf, Reader.file(sequenceFilePath)) kvPairs.foreach { kv => val (key, value) = kv if (value.length > 0 && reader.next(_key, _value)) { assert(_key.toString == key && _value.toString == value) } } reader.close() } after { fs.deleteOnExit(new Path(outputDirectory)) } }
Example 157
Source File: SeqFileStreamProducerSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.time.Instant import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile.Writer import org.apache.hadoop.io.{SequenceFile, Text} import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.prop.PropertyChecks import org.scalatest.{BeforeAndAfter, Matchers, PropSpec} import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.MockUtil import org.apache.gearpump.streaming.MockUtil._ class SeqFileStreamProducerSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfter { val kvPairs = new ArrayBuffer[(String, String)] val inputFile = "SeqFileStreamProducer_Test" val sequenceFilePath = new Path(inputFile) val hadoopConf = new Configuration() val fs = FileSystem.get(hadoopConf) val textClass = new Text().getClass val _key = new Text() val _value = new Text() val kvGenerator = for { key <- Gen.alphaStr value <- Gen.alphaStr } yield (key, value) before { fs.deleteOnExit(sequenceFilePath) val writer = SequenceFile.createWriter(hadoopConf, Writer.file(sequenceFilePath), Writer.keyClass(textClass), Writer.valueClass(textClass)) forAll(kvGenerator) { kv => _key.set(kv._1) _value.set(kv._2) kvPairs.append((kv._1, kv._2)) writer.append(_key, _value) } writer.close() } property("SeqFileStreamProducer should read the key-value pairs from " + "a sequence file and deliver them") { val conf = HadoopConfig(UserConfig.empty.withString(SeqFileStreamProducer.INPUT_PATH, inputFile)).withHadoopConf(new Configuration()) val context = MockUtil.mockTaskContext val producer = new SeqFileStreamProducer(context, conf) producer.onStart(Instant.EPOCH) producer.onNext(Message("start")) val expected = kvPairs.map(kv => kv._1 + "++" + kv._2).toSet verify(context).output(argMatch[Message](msg => expected.contains(msg.value.asInstanceOf[String]))) } after { fs.deleteOnExit(sequenceFilePath) } }
Example 158
Source File: WindowAverageApp.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.state import akka.actor.ActorSystem import org.apache.hadoop.conf.Configuration import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.cluster.client.ClientContext import org.apache.gearpump.cluster.main.{ArgumentsParser, CLIOption, ParseResult} import org.apache.gearpump.streaming.partitioner.HashPartitioner import org.apache.gearpump.streaming.examples.state.processor.{NumberGeneratorProcessor, WindowAverageProcessor} import org.apache.gearpump.streaming.hadoop.HadoopCheckpointStoreFactory import org.apache.gearpump.streaming.state.impl.{PersistentStateConfig, WindowConfig} import org.apache.gearpump.streaming.{Processor, StreamApplication} import org.apache.gearpump.util.Graph.Node import org.apache.gearpump.util.{AkkaApp, Graph} object WindowAverageApp extends AkkaApp with ArgumentsParser { override val options: Array[(String, CLIOption[Any])] = Array( "gen" -> CLIOption("<how many gen tasks>", required = false, defaultValue = Some(1)), "window" -> CLIOption("<how many window tasks>", required = false, defaultValue = Some(1)), "window_size" -> CLIOption("<window size in milliseconds>", required = false, defaultValue = Some(5000)), "window_step" -> CLIOption("<window step in milliseconds>", required = false, defaultValue = Some(5000)) ) def application(config: ParseResult)(implicit system: ActorSystem): StreamApplication = { val windowSize = config.getInt("window_size") val windowStep = config.getInt("window_step") val checkpointStoreFactory = new HadoopCheckpointStoreFactory("MessageCount", new Configuration) val taskConfig = UserConfig.empty. withBoolean(PersistentStateConfig.STATE_CHECKPOINT_ENABLE, true) .withLong(PersistentStateConfig.STATE_CHECKPOINT_INTERVAL_MS, 1000L) .withValue(PersistentStateConfig.STATE_CHECKPOINT_STORE_FACTORY, checkpointStoreFactory) .withValue(WindowConfig.NAME, WindowConfig(windowSize, windowStep)) val gen = Processor[NumberGeneratorProcessor](config.getInt("gen")) val count = Processor[WindowAverageProcessor](config.getInt("window"), taskConf = taskConfig) val partitioner = new HashPartitioner() val app = StreamApplication("WindowAverage", Graph(gen ~ partitioner ~> count), UserConfig.empty) app } override def main(akkaConf: Config, args: Array[String]): Unit = { val config = parse(args) val context = ClientContext(akkaConf) implicit val system = context.system val appId = context.submit(application(config)) context.close() } }
Example 159
Source File: MessageCountApp.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.state import java.util.Properties import akka.actor.ActorSystem import org.apache.gearpump.streaming.kafka.util.KafkaConfig import org.apache.hadoop.conf.Configuration import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.cluster.client.ClientContext import org.apache.gearpump.cluster.main.{ArgumentsParser, CLIOption, ParseResult} import org.apache.gearpump.streaming.partitioner.HashPartitioner import org.apache.gearpump.streaming.examples.state.processor.CountProcessor import org.apache.gearpump.streaming.hadoop.HadoopCheckpointStoreFactory import org.apache.gearpump.streaming.hadoop.lib.rotation.FileSizeRotation import org.apache.gearpump.streaming.kafka.{KafkaStoreFactory, KafkaSink, KafkaSource} import org.apache.gearpump.streaming.sink.DataSinkProcessor import org.apache.gearpump.streaming.source.DataSourceProcessor import org.apache.gearpump.streaming.state.impl.PersistentStateConfig import org.apache.gearpump.streaming.{Processor, StreamApplication} import org.apache.gearpump.util.Graph.Node import org.apache.gearpump.util.{AkkaApp, Graph} object MessageCountApp extends AkkaApp with ArgumentsParser { val SOURCE_TASK = "sourceTask" val COUNT_TASK = "countTask" val SINK_TASK = "sinkTask" val SOURCE_TOPIC = "sourceTopic" val SINK_TOPIC = "sinkTopic" val ZOOKEEPER_CONNECT = "zookeeperConnect" val BROKER_LIST = "brokerList" val DEFAULT_FS = "defaultFS" override val options: Array[(String, CLIOption[Any])] = Array( SOURCE_TASK -> CLIOption[Int]("<how many kafka source tasks>", required = false, defaultValue = Some(1)), COUNT_TASK -> CLIOption("<how many count tasks>", required = false, defaultValue = Some(1)), SINK_TASK -> CLIOption[Int]("<how many kafka sink tasks>", required = false, defaultValue = Some(1)), SOURCE_TOPIC -> CLIOption[String]("<kafka source topic>", required = true), SINK_TOPIC -> CLIOption[String]("<kafka sink topic>", required = true), ZOOKEEPER_CONNECT -> CLIOption[String]("<Zookeeper connect string, e.g. localhost:2181/kafka>", required = true), BROKER_LIST -> CLIOption[String]("<Kafka broker list, e.g. localhost:9092>", required = true), DEFAULT_FS -> CLIOption[String]("<name of the default file system, e.g. hdfs://localhost:9000>", required = true) ) def application(config: ParseResult)(implicit system: ActorSystem): StreamApplication = { val appName = "MessageCount" val hadoopConfig = new Configuration hadoopConfig.set("fs.defaultFS", config.getString(DEFAULT_FS)) val checkpointStoreFactory = new HadoopCheckpointStoreFactory("MessageCount", hadoopConfig, // Rotates on 1KB new FileSizeRotation(1000)) val taskConfig = UserConfig.empty .withBoolean(PersistentStateConfig.STATE_CHECKPOINT_ENABLE, true) .withLong(PersistentStateConfig.STATE_CHECKPOINT_INTERVAL_MS, 1000L) .withValue(PersistentStateConfig.STATE_CHECKPOINT_STORE_FACTORY, checkpointStoreFactory) val properties = new Properties properties.put(KafkaConfig.ZOOKEEPER_CONNECT_CONFIG, config.getString(ZOOKEEPER_CONNECT)) val brokerList = config.getString(BROKER_LIST) properties.put(KafkaConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList) properties.put(KafkaConfig.CHECKPOINT_STORE_NAME_PREFIX_CONFIG, appName) val kafkaStoreFactory = new KafkaStoreFactory(properties) val sourceTopic = config.getString(SOURCE_TOPIC) val kafkaSource = new KafkaSource(sourceTopic, properties) kafkaSource.setCheckpointStore(kafkaStoreFactory) val sourceProcessor = DataSourceProcessor(kafkaSource, config.getInt(SOURCE_TASK)) val countProcessor = Processor[CountProcessor](config.getInt(COUNT_TASK), taskConf = taskConfig) val kafkaSink = new KafkaSink(config.getString(SINK_TOPIC), properties) val sinkProcessor = DataSinkProcessor(kafkaSink, config.getInt(SINK_TASK)) val partitioner = new HashPartitioner() val graph = Graph(sourceProcessor ~ partitioner ~> countProcessor ~ partitioner ~> sinkProcessor) val app = StreamApplication(appName, graph, UserConfig.empty) app } def main(akkaConf: Config, args: Array[String]): Unit = { val config = parse(args) val context = ClientContext(akkaConf) implicit val system = context.system val appId = context.submit(application(config)) context.close() } }
Example 160
Source File: DFSJarStore.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.jarstore.dfs import java.io.{InputStream, OutputStream} import org.apache.gearpump.util.Constants import org.apache.gearpump.jarstore.JarStore import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import com.typesafe.config.Config import org.apache.hadoop.fs.permission.{FsAction, FsPermission} override def getFile(fileName: String): InputStream = { val filePath = new Path(rootPath, fileName) val fs = filePath.getFileSystem(new Configuration()) fs.open(filePath) } private def createDirIfNotExists(path: Path): Unit = { val fs = path.getFileSystem(new Configuration()) if (!fs.exists(path)) { fs.mkdirs(path, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)) } } }
Example 161
Source File: HadoopCheckpointStoreFactory.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.hadoop import java.io.{ObjectInputStream, ObjectOutputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.gearpump.streaming.hadoop.lib.HadoopUtil import org.apache.gearpump.streaming.hadoop.lib.rotation.{FileSizeRotation, Rotation} import org.apache.gearpump.streaming.transaction.api.{CheckpointStore, CheckpointStoreFactory} object HadoopCheckpointStoreFactory { val VERSION = 1 } class HadoopCheckpointStoreFactory( dir: String, @transient private var hadoopConfig: Configuration, rotation: Rotation = new FileSizeRotation(128 * Math.pow(2, 20).toLong)) extends CheckpointStoreFactory { import org.apache.gearpump.streaming.hadoop.HadoopCheckpointStoreFactory._ private def readObject(in: ObjectInputStream): Unit = { in.defaultReadObject() hadoopConfig = new Configuration(false) hadoopConfig.readFields(in) } override def getCheckpointStore(name: String): CheckpointStore = { val dirPath = new Path(dir + Path.SEPARATOR + s"v$VERSION", name) val fs = HadoopUtil.getFileSystemForPath(dirPath, hadoopConfig) new HadoopCheckpointStore(dirPath, fs, hadoopConfig, rotation) } }
Example 162
Source File: HadoopCheckpointStoreReader.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.hadoop.lib import java.io.EOFException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.gearpump.Time.MilliSeconds class HadoopCheckpointStoreReader( path: Path, hadoopConfig: Configuration) extends Iterator[(MilliSeconds, Array[Byte])] { private val stream = HadoopUtil.getInputStream(path, hadoopConfig) private var nextTimeStamp: Option[MilliSeconds] = None private var nextData: Option[Array[Byte]] = None override def hasNext: Boolean = { if (nextTimeStamp.isDefined) { true } else { try { nextTimeStamp = Some(stream.readLong()) val length = stream.readInt() val buffer = new Array[Byte](length) stream.readFully(buffer) nextData = Some(buffer) true } catch { case e: EOFException => close() false case e: Exception => close() throw e } } } override def next(): (MilliSeconds, Array[Byte]) = { val timeAndData = for { time <- nextTimeStamp data <- nextData } yield (time, data) nextTimeStamp = None nextData = None timeAndData.get } def close(): Unit = { stream.close() } }
Example 163
Source File: HadoopCheckpointStoreWriter.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.hadoop.lib import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.gearpump.Time.MilliSeconds class HadoopCheckpointStoreWriter(path: Path, hadoopConfig: Configuration) { private lazy val stream = HadoopUtil.getOutputStream(path, hadoopConfig) def write(timestamp: MilliSeconds, data: Array[Byte]): Long = { stream.writeLong(timestamp) stream.writeInt(data.length) stream.write(data) stream.hflush() stream.getPos() } def close(): Unit = { stream.close() } }
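This writer pairs with the reader from the previous example; a small round-trip sketch, assuming a local placeholder path that does not yet exist:

import org.apache.gearpump.streaming.hadoop.lib.{HadoopCheckpointStoreReader, HadoopCheckpointStoreWriter}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object CheckpointRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val hadoopConf = new Configuration()
    val path = new Path("file:///tmp/checkpoint-sketch.store") // placeholder path

    // Write one timestamped record, then read it back through the iterator API.
    val writer = new HadoopCheckpointStoreWriter(path, hadoopConf)
    writer.write(0L, Array[Byte](1, 2, 3))
    writer.close()

    val reader = new HadoopCheckpointStoreReader(path, hadoopConf)
    assert(reader.hasNext)
    val (timestamp, data) = reader.next()
    assert(timestamp == 0L && data.sameElements(Array[Byte](1, 2, 3)))
    reader.close()
  }
}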
Example 164
Source File: HadoopUtil.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.hadoop.lib import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.security.UserGroupInformation import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.util.{Constants, FileUtils} private[hadoop] object HadoopUtil { def getOutputStream(path: Path, hadoopConfig: Configuration): FSDataOutputStream = { val dfs = getFileSystemForPath(path, hadoopConfig) val stream: FSDataOutputStream = { if (dfs.isFile(path)) { dfs.append(path) } else { dfs.create(path) } } stream } def getInputStream(path: Path, hadoopConfig: Configuration): FSDataInputStream = { val dfs = getFileSystemForPath(path, hadoopConfig) val stream = dfs.open(path) stream } def getFileSystemForPath(path: Path, hadoopConfig: Configuration): FileSystem = { // For local file systems, return the raw local file system, such calls to flush() // actually flushes the stream. val fs = path.getFileSystem(hadoopConfig) fs match { case localFs: LocalFileSystem => localFs.getRawFileSystem case _ => fs } } def login(userConfig: UserConfig, configuration: Configuration): Unit = { if (UserGroupInformation.isSecurityEnabled) { val principal = userConfig.getString(Constants.GEARPUMP_KERBEROS_PRINCIPAL) val keytabContent = userConfig.getBytes(Constants.GEARPUMP_KEYTAB_FILE) if (principal.isEmpty || keytabContent.isEmpty) { val errorMsg = s"HDFS is security enabled, user should provide kerberos principal in " + s"${Constants.GEARPUMP_KERBEROS_PRINCIPAL} " + s"and keytab file in ${Constants.GEARPUMP_KEYTAB_FILE}" throw new Exception(errorMsg) } val keytabFile = File.createTempFile("login", ".keytab") FileUtils.writeByteArrayToFile(keytabFile, keytabContent.get) keytabFile.setExecutable(false) keytabFile.setWritable(false) keytabFile.setReadable(true, true) UserGroupInformation.setConfiguration(configuration) UserGroupInformation.loginUserFromKeytab(principal.get, keytabFile.getAbsolutePath) keytabFile.delete() } } }
Example 165
Source File: HadoopCheckpointStoreIntegrationSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.hadoop import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.mock.MockitoSugar import org.scalatest.prop.PropertyChecks import org.scalatest.{Matchers, PropSpec} import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.MockUtil import org.apache.gearpump.streaming.hadoop.lib.HadoopUtil import org.apache.gearpump.streaming.hadoop.lib.rotation.FileSizeRotation import org.apache.gearpump.streaming.task.TaskId class HadoopCheckpointStoreIntegrationSpec extends PropSpec with PropertyChecks with MockitoSugar with Matchers { property("HadoopCheckpointStore should persist and recover checkpoints") { val fileSizeGen = Gen.chooseNum[Int](100, 1000) forAll(fileSizeGen) { (fileSize: Int) => val userConfig = UserConfig.empty val taskContext = MockUtil.mockTaskContext val hadoopConfig = new Configuration() when(taskContext.appId).thenReturn(0) when(taskContext.taskId).thenReturn(TaskId(0, 0)) val rootDirName = "test" val rootDir = new Path(rootDirName + Path.SEPARATOR + s"v${HadoopCheckpointStoreFactory.VERSION}") val subDirName = "app0-task0_0" val subDir = new Path(rootDir, subDirName) val fs = HadoopUtil.getFileSystemForPath(rootDir, hadoopConfig) fs.delete(rootDir, true) fs.exists(rootDir) shouldBe false val checkpointStoreFactory = new HadoopCheckpointStoreFactory( rootDirName, hadoopConfig, new FileSizeRotation(fileSize)) val checkpointStore = checkpointStoreFactory.getCheckpointStore(subDirName) checkpointStore.persist(0L, Array(0.toByte)) val tempFile = new Path(subDir, "checkpoints-0.store") fs.exists(tempFile) shouldBe true checkpointStore.persist(1L, Array.fill(fileSize)(0.toByte)) fs.exists(tempFile) shouldBe false fs.exists(new Path(subDir, "checkpoints-0-1.store")) shouldBe true checkpointStore.persist(2L, Array(0.toByte)) val newTempFile = new Path(subDir, "checkpoints-2.store") fs.exists(newTempFile) shouldBe true for (i <- 0 to 2) { val optCp = checkpointStore.recover(i) optCp should not be empty } fs.exists(newTempFile) shouldBe false fs.exists(new Path(subDir, "checkpoints-2-2.store")) shouldBe true checkpointStore.close() fs.delete(rootDir, true) fs.close() } } }
Example 166
Source File: HBaseSinkSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.external.hbase import akka.actor.ActorSystem import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.external.hbase.HBaseSink.{HBaseWriter, HBaseWriterFactory} import org.apache.gearpump.streaming.MockUtil import org.apache.gearpump.streaming.task.TaskContext import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase.TableName import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.util.Bytes import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.mock.MockitoSugar import org.scalatest.prop.PropertyChecks import org.scalatest.{Matchers, PropSpec} class HBaseSinkSpec extends PropSpec with PropertyChecks with Matchers with MockitoSugar { property("HBaseSink should invoke HBaseWriter for writing message to HBase") { val hbaseWriter = mock[HBaseWriter] val hbaseWriterFactory = mock[HBaseWriterFactory] implicit val system: ActorSystem = MockUtil.system val userConfig = UserConfig.empty val tableName = "hbase" when(hbaseWriterFactory.getHBaseWriter(userConfig, tableName)) .thenReturn(hbaseWriter) val hbaseSink = new HBaseSink(userConfig, tableName, hbaseWriterFactory) hbaseSink.open(MockUtil.mockTaskContext) forAll(Gen.alphaStr) { (value: String) => val message = Message(value) hbaseSink.write(message) verify(hbaseWriter, atLeastOnce()).put(value) } hbaseSink.close() verify(hbaseWriter).close() } property("HBaseWriter should insert a row successfully") { val table = mock[Table] val config = mock[Configuration] val connection = mock[Connection] val taskContext = mock[TaskContext] val map = Map[String, String]("HBASESINK" -> "hbasesink", "TABLE_NAME" -> "hbase.table.name", "COLUMN_FAMILY" -> "hbase.table.column.family", "COLUMN_NAME" -> "hbase.table.column.name", "HBASE_USER" -> "hbase.user", "GEARPUMP_KERBEROS_PRINCIPAL" -> "gearpump.kerberos.principal", "GEARPUMP_KEYTAB_FILE" -> "gearpump.keytab.file" ) val userConfig = new UserConfig(map) val tableName = "hbase" val row = "row" val group = "group" val name = "name" val value = "3.0" when(connection.getTable(TableName.valueOf(tableName))).thenReturn(table) val put = new Put(Bytes.toBytes(row)) put.addColumn(Bytes.toBytes(group), Bytes.toBytes(name), Bytes.toBytes(value)) val hbaseWriter = new HBaseWriter(connection, tableName) hbaseWriter.insert(Bytes.toBytes(row), Bytes.toBytes(group), Bytes.toBytes(name), Bytes.toBytes(value)) verify(table).put(MockUtil.argMatch[Put](_.getRow sameElements Bytes.toBytes(row))) } }
Example 167
Source File: MiniClusterUtils.scala From incubator-livy with Apache License 2.0 | 5 votes |
package org.apache.livy.test.framework import java.io._ import java.nio.charset.StandardCharsets.UTF_8 import java.util.Properties import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration trait MiniClusterUtils { protected def saveProperties(props: Map[String, String], dest: File): Unit = { val jprops = new Properties() props.foreach { case (k, v) => jprops.put(k, v) } val tempFile = new File(dest.getAbsolutePath() + ".tmp") val out = new OutputStreamWriter(new FileOutputStream(tempFile), UTF_8) try { jprops.store(out, "Configuration") } finally { out.close() } tempFile.renameTo(dest) } protected def loadProperties(file: File): Map[String, String] = { val in = new InputStreamReader(new FileInputStream(file), UTF_8) val props = new Properties() try { props.load(in) } finally { in.close() } props.asScala.toMap } protected def saveConfig(conf: Configuration, dest: File): Unit = { val redacted = new Configuration(conf) // This setting references a test class that is not available when using a real Spark // installation, so remove it from client configs. redacted.unset("net.topology.node.switch.mapping.impl") val out = new FileOutputStream(dest) try { redacted.writeXml(out) } finally { out.close() } } }
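A small sketch of round-tripping a configuration map through these helpers; the destination file is a placeholder:

import java.io.File

import org.apache.livy.test.framework.MiniClusterUtils

object MiniClusterUtilsSketch extends MiniClusterUtils {
  def main(args: Array[String]): Unit = {
    val dest = new File("/tmp/livy-sketch.properties") // placeholder path
    // Properties are staged to a .tmp file and renamed, then read back as a Map.
    saveProperties(Map("spark.master" -> "local[*]"), dest)
    val loaded = loadProperties(dest)
    assert(loaded("spark.master") == "local[*]")
  }
}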
Example 168
Source File: HDFSCluster.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hdfs.MiniDFSCluster import org.apache.hadoop.test.PathUtils class HDFSCluster extends HDFSClusterLike trait HDFSClusterLike { @transient private var hdfsCluster: MiniDFSCluster = null def startHDFS() = { println("Starting HDFS Cluster...") val baseDir = new File(PathUtils.getTestDir(getClass()), "miniHDFS") val conf = new Configuration() conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath()) val builder = new MiniDFSCluster.Builder(conf) hdfsCluster = builder.nameNodePort(8020).format(true).build() hdfsCluster.waitClusterUp() } def getNameNodeURI(): String = { "hdfs://localhost:" + hdfsCluster.getNameNodePort() } def shutdownHDFS(): Unit = { hdfsCluster.shutdown() } }
Example 169
Source File: HDFSClusterTest.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import java.io.{ BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter} import com.holdenkarau.spark.testing.{RDDComparisons, SharedSparkContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.rdd.RDD import org.scalatest.FunSuite class HDFSClusterTest extends FunSuite with SharedSparkContext with RDDComparisons { var hdfsCluster: HDFSCluster = null override def beforeAll(): Unit = { super.beforeAll() hdfsCluster = new HDFSCluster hdfsCluster.startHDFS() } test("get the namenode uri") { val nameNodeURI = hdfsCluster.getNameNodeURI() assert(nameNodeURI == "hdfs://localhost:8020") } test("read and write from spark to hdfs") { val list = List(1, 2, 3, 4, 5) val numRDD: RDD[Int] = sc.parallelize(list) val path = hdfsCluster.getNameNodeURI() + "/myRDD" numRDD.saveAsTextFile(path) val loadedRDD: RDD[Int] = sc.textFile(path).map(_.toInt) assertRDDEquals(numRDD, loadedRDD) } test("test creating local file to hdfs") { val path = new Path(hdfsCluster.getNameNodeURI() + "/myfile") val fs = FileSystem.get(path.toUri, new Configuration()) val writer = new BufferedWriter(new OutputStreamWriter(fs.create(path))) val writtenString = "hello, it's me" writer.write(writtenString) writer.close() val reader = new BufferedReader(new InputStreamReader(fs.open(path))) val readString = reader.readLine() reader.close() assert(writtenString == readString) } override def afterAll() { hdfsCluster.shutdownHDFS() super.afterAll() } }
Example 170
Source File: OrcFileOperator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.flatMap(getFileReader(_, conf)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
Example 171
Source File: HiveExternalCatalogSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.types.StructType class HiveExternalCatalogSuite extends ExternalCatalogSuite { private val externalCatalog: HiveExternalCatalog = { val catalog = new HiveExternalCatalog(new SparkConf, new Configuration) catalog.client.reset() catalog } protected override val utils: CatalogTestUtils = new CatalogTestUtils { override val tableInputFormat: String = "org.apache.hadoop.mapred.SequenceFileInputFormat" override val tableOutputFormat: String = "org.apache.hadoop.mapred.SequenceFileOutputFormat" override def newEmptyCatalog(): ExternalCatalog = externalCatalog } protected override def resetState(): Unit = { externalCatalog.client.reset() } import utils._ test("list partitions by filter") { val catalog = newBasicCatalog() val selectedPartitions = catalog.listPartitionsByFilter("db2", "tbl2", Seq('a.int === 1)) assert(selectedPartitions.length == 1) assert(selectedPartitions.head.spec == part1.spec) } test("SPARK-18647: do not put provider in table properties for Hive serde table") { val catalog = newBasicCatalog() val hiveTable = CatalogTable( identifier = TableIdentifier("hive_tbl", Some("db1")), tableType = CatalogTableType.MANAGED, storage = storageFormat, schema = new StructType().add("col1", "int").add("col2", "string"), provider = Some("hive")) catalog.createTable(hiveTable, ignoreIfExists = false) val rawTable = externalCatalog.client.getTable("db1", "hive_tbl") assert(!rawTable.properties.contains(HiveExternalCatalog.DATASOURCE_PROVIDER)) assert(externalCatalog.getTable("db1", "hive_tbl").provider == Some(DDLUtils.HIVE_PROVIDER)) } }
Example 172
Source File: HiveClientBuilder.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.util.VersionInfo import org.apache.spark.SparkConf import org.apache.spark.util.Utils private[client] class HiveClientBuilder { private val sparkConf = new SparkConf() // In order to speed up test execution during development or in Jenkins, you can specify the path // of an existing Ivy cache: private val ivyPath: Option[String] = { sys.env.get("SPARK_VERSIONS_SUITE_IVY_PATH").orElse( Some(new File(sys.props("java.io.tmpdir"), "hive-ivy-cache").getAbsolutePath)) } private def buildConf() = { lazy val warehousePath = Utils.createTempDir() lazy val metastorePath = Utils.createTempDir() metastorePath.delete() Map( "javax.jdo.option.ConnectionURL" -> s"jdbc:derby:;databaseName=$metastorePath;create=true", "hive.metastore.warehouse.dir" -> warehousePath.toString) } def buildClient(version: String, hadoopConf: Configuration): HiveClient = { IsolatedClientLoader.forVersion( hiveMetastoreVersion = version, hadoopVersion = VersionInfo.getVersion, sparkConf = sparkConf, hadoopConf = hadoopConf, config = buildConf(), ivyPath = ivyPath).createClient() } }
Example 173
Source File: HiveClientSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.conf.HiveConf import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal} import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.types.IntegerType class HiveClientSuite extends SparkFunSuite { private val clientBuilder = new HiveClientBuilder private val tryDirectSqlKey = HiveConf.ConfVars.METASTORE_TRY_DIRECT_SQL.varname test(s"getPartitionsByFilter returns all partitions when $tryDirectSqlKey=false") { val testPartitionCount = 5 val storageFormat = CatalogStorageFormat( locationUri = None, inputFormat = None, outputFormat = None, serde = None, compressed = false, properties = Map.empty) val hadoopConf = new Configuration() hadoopConf.setBoolean(tryDirectSqlKey, false) val client = clientBuilder.buildClient(HiveUtils.hiveExecutionVersion, hadoopConf) client.runSqlHive("CREATE TABLE test (value INT) PARTITIONED BY (part INT)") val partitions = (1 to testPartitionCount).map { part => CatalogTablePartition(Map("part" -> part.toString), storageFormat) } client.createPartitions( "default", "test", partitions, ignoreIfExists = false) val filteredPartitions = client.getPartitionsByFilter(client.getTable("default", "test"), Seq(EqualTo(AttributeReference("part", IntegerType)(), Literal(3)))) assert(filteredPartitions.size == testPartitionCount) } }
Example 174
Source File: CompressionCodecs.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress._ import org.apache.spark.util.Utils object CompressionCodecs { private val shortCompressionCodecNames = Map( "none" -> null, "uncompressed" -> null, "bzip2" -> classOf[BZip2Codec].getName, "deflate" -> classOf[DeflateCodec].getName, "gzip" -> classOf[GzipCodec].getName, "lz4" -> classOf[Lz4Codec].getName, "snappy" -> classOf[SnappyCodec].getName) def setCodecConfiguration(conf: Configuration, codec: String): Unit = { if (codec != null) { conf.set("mapreduce.output.fileoutputformat.compress", "true") conf.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.toString) conf.set("mapreduce.output.fileoutputformat.compress.codec", codec) conf.set("mapreduce.map.output.compress", "true") conf.set("mapreduce.map.output.compress.codec", codec) } else { // This infers the option `compression` is set to `uncompressed` or `none`. conf.set("mapreduce.output.fileoutputformat.compress", "false") conf.set("mapreduce.map.output.compress", "false") } } }
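A short illustration of what the helper does to a Hadoop Configuration, sketched with the gzip codec; the sketch is placed under an org.apache.spark.sql package in case the object is package-private, since Spark normally reaches it through its internal option handling:

package org.apache.spark.sql.sketch

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.sql.catalyst.util.CompressionCodecs

object CompressionCodecsSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    // Turns on block-compressed output for both the job output and the map output.
    CompressionCodecs.setCodecConfiguration(conf, classOf[GzipCodec].getName)
    assert(conf.get("mapreduce.output.fileoutputformat.compress") == "true")
    assert(conf.get("mapreduce.output.fileoutputformat.compress.codec") == classOf[GzipCodec].getName)
  }
}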
Example 175
Source File: HadoopFileLinesReader.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.Closeable import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader} import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl class HadoopFileLinesReader( file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable { private val iterator = { val fileSplit = new FileSplit( new Path(new URI(file.filePath)), file.start, file.length, // TODO: Implement Locality Array.empty) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) val reader = new LineRecordReader() reader.initialize(fileSplit, hadoopAttemptContext) new RecordReaderIterator(reader) } override def hasNext: Boolean = iterator.hasNext override def next(): Text = iterator.next() override def close(): Unit = iterator.close() }
Example 176
Source File: StreamMetadata.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: FSDataOutputStream = null try { val fs = FileSystem.get(hadoopConf) output = fs.create(metadataFile) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case NonFatal(e) => logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } finally { IOUtils.closeQuietly(output) } } }
Example 177
Source File: StreamMetadataSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.File import java.util.UUID import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.streaming.StreamTest class StreamMetadataSuite extends StreamTest { test("writing and reading") { withTempDir { dir => val id = UUID.randomUUID.toString val metadata = StreamMetadata(id) val file = new Path(new File(dir, "test").toString) StreamMetadata.write(metadata, file, hadoopConf) val readMetadata = StreamMetadata.read(file, hadoopConf) assert(readMetadata.nonEmpty) assert(readMetadata.get.id === id) } } test("read Spark 2.1.0 format") { // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0 assert( readForResource("query-metadata-logs-version-2.1.0.txt") === StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e")) } private def readForResource(fileName: String): StreamMetadata = { val input = getClass.getResource(s"/structured-streaming/$fileName") StreamMetadata.read(new Path(input.toString), hadoopConf).get } private val hadoopConf = new Configuration() }
Example 178
Source File: HBaseCredentialProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import scala.reflect.runtime.universe import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging private[security] class HBaseCredentialProvider extends ServiceCredentialProvider with Logging { override def serviceName: String = "hbase" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val obtainToken = mirror.classLoader. loadClass("org.apache.hadoop.hbase.security.token.TokenUtil"). getMethod("obtainToken", classOf[Configuration]) logDebug("Attempting to fetch HBase security token.") val token = obtainToken.invoke(null, hbaseConf(hadoopConf)) .asInstanceOf[Token[_ <: TokenIdentifier]] logInfo(s"Get token from HBase: ${token.toString}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => logDebug(s"Failed to get token from service $serviceName", e) } None } override def credentialsRequired(hadoopConf: Configuration): Boolean = { hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos" } private def hbaseConf(conf: Configuration): Configuration = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val confCreate = mirror.classLoader. loadClass("org.apache.hadoop.hbase.HBaseConfiguration"). getMethod("create", classOf[Configuration]) confCreate.invoke(null, conf).asInstanceOf[Configuration] } catch { case NonFatal(e) => logDebug("Fail to invoke HBaseConfiguration", e) conf } } }
Example 179
Source File: HDFSCredentialProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).flatMap { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val hdfsToken = creds.getAllTokens.asScala .find(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) hdfsToken.map { t => val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 180
Source File: HDFSCredentialProviderSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, PrivateMethodTester} import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} class HDFSCredentialProviderSuite extends SparkFunSuite with PrivateMethodTester with Matchers { private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer) private def getTokenRenewer( hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = { hdfsCredentialProvider invokePrivate _getTokenRenewer(conf) } private var hdfsCredentialProvider: HDFSCredentialProvider = null override def beforeAll() { super.beforeAll() if (hdfsCredentialProvider == null) { hdfsCredentialProvider = new HDFSCredentialProvider() } } override def afterAll() { if (hdfsCredentialProvider != null) { hdfsCredentialProvider = null } super.afterAll() } test("check token renewer") { val hadoopConf = new Configuration() hadoopConf.set("yarn.resourcemanager.address", "myrm:8033") hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:[email protected]") val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf) renewer should be ("yarn/myrm:[email protected]") } test("check token renewer default") { val hadoopConf = new Configuration() val caught = intercept[SparkException] { getTokenRenewer(hdfsCredentialProvider, hadoopConf) } assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer") } }
Example 181
Source File: FileBasedWriteAheadLogRandomReader.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.Closeable import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration private[streaming] class FileBasedWriteAheadLogRandomReader(path: String, conf: Configuration) extends Closeable { private val instream = HdfsUtils.getInputStream(path, conf) private var closed = (instream == null) // the file may be deleted as we're opening the stream def read(segment: FileBasedWriteAheadLogSegment): ByteBuffer = synchronized { assertOpen() instream.seek(segment.offset) val nextLength = instream.readInt() HdfsUtils.checkState(nextLength == segment.length, s"Expected message length to be ${segment.length}, but was $nextLength") val buffer = new Array[Byte](nextLength) instream.readFully(buffer) ByteBuffer.wrap(buffer) } override def close(): Unit = synchronized { closed = true instream.close() } private def assertOpen() { HdfsUtils.checkState(!closed, "Stream is closed. Create a new Reader to read from the file.") } }
Example 182
Source File: HdfsUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.{FileNotFoundException, IOException} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ private[streaming] object HdfsUtils { def getOutputStream(path: String, conf: Configuration): FSDataOutputStream = { val dfsPath = new Path(path) val dfs = getFileSystemForPath(dfsPath, conf) // If the file exists and we have append support, append instead of creating a new file val stream: FSDataOutputStream = { if (dfs.isFile(dfsPath)) { if (conf.getBoolean("hdfs.append.support", false) || dfs.isInstanceOf[RawLocalFileSystem]) { dfs.append(dfsPath) } else { throw new IllegalStateException("File exists and there is no append support!") } } else { dfs.create(dfsPath) } } stream } def getInputStream(path: String, conf: Configuration): FSDataInputStream = { val dfsPath = new Path(path) val dfs = getFileSystemForPath(dfsPath, conf) try { dfs.open(dfsPath) } catch { case _: FileNotFoundException => null case e: IOException => // If we are really unlucky, the file may be deleted as we're opening the stream. // This can happen as clean up is performed by daemon threads that may be left over from // previous runs. if (!dfs.isFile(dfsPath)) null else throw e } } def checkState(state: Boolean, errorMsg: => String) { if (!state) { throw new IllegalStateException(errorMsg) } } def checkFileExists(path: String, conf: Configuration): Boolean = { val hdpPath = new Path(path) val fs = getFileSystemForPath(hdpPath, conf) fs.isFile(hdpPath) } }
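A hedged write/read sketch against the local file system; because HdfsUtils is private[streaming], the sketch assumes it is compiled under the org.apache.spark.streaming package, and the placeholder path must not already exist:

package org.apache.spark.streaming.util

import org.apache.hadoop.conf.Configuration

object HdfsUtilsSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val path = "file:///tmp/hdfs-utils-sketch.bin" // placeholder, created on first write

    val out = HdfsUtils.getOutputStream(path, conf)
    out.writeInt(42)
    out.close()

    val in = HdfsUtils.getInputStream(path, conf)
    assert(in.readInt() == 42)
    in.close()

    assert(HdfsUtils.checkFileExists(path, conf))
  }
}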
Example 183
Source File: FileBasedWriteAheadLogWriter.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io._ import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.util.Utils def write(data: ByteBuffer): FileBasedWriteAheadLogSegment = synchronized { assertOpen() data.rewind() // Rewind to ensure all data in the buffer is retrieved val lengthToWrite = data.remaining() val segment = new FileBasedWriteAheadLogSegment(path, nextOffset, lengthToWrite) stream.writeInt(lengthToWrite) Utils.writeByteBuffer(data, stream: OutputStream) flush() nextOffset = stream.getPos() segment } override def close(): Unit = synchronized { closed = true stream.close() } private def flush() { stream.hflush() // Useful for local file system where hflush/sync does not work (HADOOP-7844) stream.getWrappedStream.flush() } private def assertOpen() { HdfsUtils.checkState(!closed, "Stream is closed. Create a new Writer to write to file.") } }
Example 184
Source File: FileBasedWriteAheadLogReader.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.{Closeable, EOFException, IOException} import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.internal.Logging private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration) extends Iterator[ByteBuffer] with Closeable with Logging { private val instream = HdfsUtils.getInputStream(path, conf) private var closed = (instream == null) // the file may be deleted as we're opening the stream private var nextItem: Option[ByteBuffer] = None override def hasNext: Boolean = synchronized { if (closed) { return false } if (nextItem.isDefined) { // handle the case where hasNext is called without calling next true } else { try { val length = instream.readInt() val buffer = new Array[Byte](length) instream.readFully(buffer) nextItem = Some(ByteBuffer.wrap(buffer)) logTrace("Read next item " + nextItem.get) true } catch { case e: EOFException => logDebug("Error reading next item, EOF reached", e) close() false case e: IOException => logWarning("Error while trying to read data. If the file was deleted, " + "this should be okay.", e) close() if (HdfsUtils.checkFileExists(path, conf)) { // If file exists, this could be a legitimate error throw e } else { // File was deleted. This can occur when the daemon cleanup thread takes time to // delete the file during recovery. false } case e: Exception => logWarning("Error while trying to read data from HDFS.", e) close() throw e } } } override def next(): ByteBuffer = synchronized { val data = nextItem.getOrElse { close() throw new IllegalStateException( "next called without calling hasNext or after hasNext returned false") } nextItem = None // Ensure the next hasNext call loads new data. data } override def close(): Unit = synchronized { if (!closed) { instream.close() } closed = true } }
Example 185
Source File: SerializableWritable.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.ObjectWritable import org.apache.hadoop.io.Writable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils @DeveloperApi class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable { def value: T = t override def toString: String = t.toString private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException { out.defaultWriteObject() new ObjectWritable(t).write(out) } private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException { in.defaultReadObject() val ow = new ObjectWritable() ow.setConf(new Configuration(false)) ow.readFields(in) t = ow.get().asInstanceOf[T] } }
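A minimal round-trip sketch using plain Java serialization, which exercises the custom writeObject/readObject methods shown above:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.io.Text
import org.apache.spark.SerializableWritable

object SerializableWritableSketch {
  def main(args: Array[String]): Unit = {
    val wrapped = new SerializableWritable(new Text("hello"))

    // Serialize the wrapper; the Text is written through ObjectWritable.
    val bytes = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(bytes)
    out.writeObject(wrapped)
    out.close()

    // Deserialize and recover the original Writable value.
    val in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray))
    val restored = in.readObject().asInstanceOf[SerializableWritable[Text]]
    assert(restored.value.toString == "hello")
  }
}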
Example 186
Source File: SerializableConfiguration.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.io.{ObjectInputStream, ObjectOutputStream} import org.apache.hadoop.conf.Configuration private[spark] class SerializableConfiguration(@transient var value: Configuration) extends Serializable { private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException { out.defaultWriteObject() value.write(out) } private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException { value = new Configuration(false) value.readFields(in) } }
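A typical pattern, sketched: wrap the Hadoop configuration once on the driver so a task closure can carry it to executors; since the class is private[spark], the sketch assumes it is compiled under an org.apache.spark subpackage, and the looked-up keys are arbitrary examples:

package org.apache.spark.sketch

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.SerializableConfiguration

object SerializableConfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("serializable-conf-sketch")
      .master("local[*]")
      .getOrCreate()

    // Wrap once on the driver; the wrapper is what the closure captures and ships.
    val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)

    val values = spark.sparkContext.parallelize(Seq("fs.defaultFS", "io.file.buffer.size")).map { key =>
      // Each task sees a deserialized copy of the original Configuration.
      key -> serializableConf.value.get(key)
    }.collect()

    values.foreach(println)
    spark.stop()
  }
}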
Example 187
Source File: WholeTextFileRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
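WholeTextFileRDD is internal; the public entry point that builds it is SparkContext.wholeTextFiles, which yields one (path, fileContent) pair per file. A brief usage sketch (the input directory is hypothetical):

import org.apache.spark.sql.SparkSession

object WholeTextFilesUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("whole-text-files").getOrCreate()

    // Each element is (file path, entire file content), one record per file.
    val files = spark.sparkContext.wholeTextFiles("/tmp/input-dir", minPartitions = 2)
    files.map { case (path, content) => s"$path -> ${content.length} chars" }
      .collect()
      .foreach(println)

    spark.stop()
  }
}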
Example 188
Source File: BinaryFileRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.StreamFileInputFormat

private[spark] class BinaryFileRDD[T](
    @transient private val sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(sc, jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
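Similarly, BinaryFileRDD backs the public SparkContext.binaryFiles API, which returns (path, PortableDataStream) pairs for whole-file binary reads. A brief usage sketch (hypothetical input directory):

import org.apache.spark.sql.SparkSession

object BinaryFilesUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("binary-files").getOrCreate()

    // Each element is (file path, PortableDataStream); toArray() materializes the file's bytes.
    val sizes = spark.sparkContext.binaryFiles("/tmp/binary-input", minPartitions = 2)
      .map { case (path, stream) => path -> stream.toArray().length }
      .collect()

    sizes.foreach { case (path, numBytes) => println(s"$path: $numBytes bytes") }
    spark.stop()
  }
}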
Example 189
Source File: SerializableWritable.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {
  def value = t

  override def toString = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration())
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
}
Example 190
Source File: SparkHadoopMapReduceUtil.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.mapreduce

import java.lang.{Boolean => JBoolean, Integer => JInteger}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

private[spark] trait SparkHadoopMapReduceUtil {
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.JobContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.JobContext") // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.TaskAttemptContext") // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }

  def newTaskAttemptID(
      jtIdentifier: String,
      jobId: Int,
      isMap: Boolean,
      taskId: Int,
      attemptId: Int) = {
    val klass = Class.forName("org.apache.hadoop.mapreduce.TaskAttemptID")
    try {
      // First, attempt to use the old-style constructor that takes a boolean isMap
      // (not available in YARN)
      val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], classOf[Boolean],
        classOf[Int], classOf[Int])
      ctor.newInstance(jtIdentifier, new JInteger(jobId), new JBoolean(isMap), new JInteger(taskId),
        new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
    } catch {
      case exc: NoSuchMethodException => {
        // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
        val taskTypeClass = Class.forName("org.apache.hadoop.mapreduce.TaskType")
          .asInstanceOf[Class[Enum[_]]]
        val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
          taskTypeClass, if (isMap) "MAP" else "REDUCE")
        val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
          classOf[Int], classOf[Int])
        ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
          new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
      }
    }
  }

  private def firstAvailableClass(first: String, second: String): Class[_] = {
    try {
      Class.forName(first)
    } catch {
      case e: ClassNotFoundException =>
        Class.forName(second)
    }
  }
}
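On Hadoop 2+ the reflection above resolves to the task.* implementation classes and the TaskType-based TaskAttemptID constructor. A direct, non-reflective sketch of the same construction, assuming a Hadoop 2.x or later dependency on the classpath (the job identifier is made up for illustration):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskType}
import org.apache.hadoop.mapreduce.task.{JobContextImpl, TaskAttemptContextImpl}

object MapReduceContextSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()

    // Equivalent of newJobContext on Hadoop 2+: construct JobContextImpl directly.
    val jobId = new JobID("local-job", 0)
    val jobContext = new JobContextImpl(conf, jobId)

    // Equivalent of newTaskAttemptID / newTaskAttemptContext with the TaskType constructor.
    val attemptId = new TaskAttemptID("local-job", 0, TaskType.MAP, 0, 0)
    val taskContext = new TaskAttemptContextImpl(conf, attemptId)

    println(jobContext.getJobID)
    println(taskContext.getTaskAttemptID)
  }
}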
Example 191
Source File: BinaryFileRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.hadoop.conf.{ Configurable, Configuration }
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._

import org.apache.spark.input.StreamFileInputFormat
import org.apache.spark.{ Partition, SparkContext }

private[spark] class BinaryFileRDD[T](
    sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    @transient conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = newJobContext(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 192
Source File: CarbonHiveMetastoreListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.hive

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.metastore.MetaStorePreEventListener
import org.apache.hadoop.hive.metastore.api.{FieldSchema, MetaException}
import org.apache.hadoop.hive.metastore.events._
import org.apache.hadoop.hive.metastore.events.PreEventContext.PreEventType._
import org.apache.spark.sql.types.{DataType, StructField, StructType}

class CarbonHiveMetastoreListener(conf: Configuration) extends MetaStorePreEventListener(conf) {

  override def onEvent(preEventContext: PreEventContext): Unit = {
    preEventContext.getEventType match {
      case CREATE_TABLE =>
        val table = preEventContext.asInstanceOf[PreCreateTableEvent].getTable
        val tableProps = table.getParameters
        if (tableProps != null &&
            (tableProps.get("spark.sql.sources.provider") == "org.apache.spark.sql.CarbonSource"
              || tableProps.get("spark.sql.sources.provider").equalsIgnoreCase("carbondata"))) {
          val numSchemaParts = tableProps.get("spark.sql.sources.schema.numParts")
          if (numSchemaParts != null && !numSchemaParts.isEmpty) {
            val parts = (0 until numSchemaParts.toInt).map { index =>
              val part = tableProps.get(s"spark.sql.sources.schema.part.${index}")
              if (part == null) {
                throw new MetaException(s"spark.sql.sources.schema.part.${index} is missing!")
              }
              part
            }
            // Stick all parts back to a single schema string.
            val schema = DataType.fromJson(parts.mkString).asInstanceOf[StructType]
            val hiveSchema = schema.map(toHiveColumn).asJava
            table.getSd.setCols(hiveSchema)
            table.getSd.setInputFormat("org.apache.carbondata.hive.MapredCarbonInputFormat")
            table.getSd.setOutputFormat("org.apache.carbondata.hive.MapredCarbonOutputFormat")
            val serdeInfo = table.getSd.getSerdeInfo
            serdeInfo.setSerializationLib("org.apache.carbondata.hive.CarbonHiveSerDe")
            val tablePath = serdeInfo.getParameters.get("tablePath")
            if (tablePath != null) {
              table.getSd.setLocation(tablePath)
            }
          }
        }
      case ALTER_TABLE =>
        val table = preEventContext.asInstanceOf[PreAlterTableEvent].getNewTable
        val tableProps = table.getParameters
        if (tableProps != null &&
            (tableProps.get("spark.sql.sources.provider") == "org.apache.spark.sql.CarbonSource"
              || tableProps.get("spark.sql.sources.provider").equalsIgnoreCase("carbondata"))) {
          val numSchemaParts = tableProps.get("spark.sql.sources.schema.numParts")
          if (numSchemaParts != null && !numSchemaParts.isEmpty) {
            val schemaParts = (0 until numSchemaParts.toInt).map { index =>
              val schemaPart = tableProps.get(s"spark.sql.sources.schema.part.$index")
              if (schemaPart == null) {
                throw new MetaException(s"spark.sql.sources.schema.part.$index is missing!")
              }
              schemaPart
            }
            // Stick all schemaParts back to a single schema string.
            val schema = DataType.fromJson(schemaParts.mkString).asInstanceOf[StructType]
            val hiveSchema = schema.map(toHiveColumn).asJava
            table.getSd.setCols(hiveSchema)
          }
        }
      case _ =>
        // do nothing
    }
  }

  private def toHiveColumn(c: StructField): FieldSchema = {
    val typeString = if (c.metadata.contains("HIVE_TYPE_STRING")) {
      c.metadata.getString("HIVE_TYPE_STRING")
    } else {
      c.dataType.catalogString
    }
    new FieldSchema(c.name, typeString, c.getComment().orNull)
  }
}
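To take effect, a MetaStorePreEventListener has to be registered with the metastore through the hive.metastore.pre.event.listeners property, which is usually set in hive-site.xml. A small hedged sketch of setting it programmatically on a Hadoop Configuration; the class name matches the listener above, everything else is illustrative:

import org.apache.hadoop.conf.Configuration

object RegisterMetastoreListener {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()

    // The metastore instantiates every listener listed here and calls onEvent before DDL events.
    conf.set("hive.metastore.pre.event.listeners",
      "org.apache.carbondata.hive.CarbonHiveMetastoreListener")

    println(conf.get("hive.metastore.pre.event.listeners"))
  }
}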
Example 193
Source File: CarbonLoadParams.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.management

import java.text.SimpleDateFormat
import java.util

import scala.collection.mutable

import org.apache.hadoop.conf.Configuration
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.command.UpdateTableModel
import org.apache.spark.sql.execution.datasources.LogicalRelation

import org.apache.carbondata.core.indexstore.PartitionSpec
import org.apache.carbondata.core.statusmanager.SegmentStatus
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.events.OperationContext
import org.apache.carbondata.processing.loading.model.CarbonLoadModel

case class CarbonLoadParams(
    sparkSession: SparkSession,
    tableName: String,
    sizeInBytes: Long,
    isOverwriteTable: Boolean,
    carbonLoadModel: CarbonLoadModel,
    hadoopConf: Configuration,
    logicalPartitionRelation: LogicalRelation,
    dateFormat : SimpleDateFormat,
    timeStampFormat : SimpleDateFormat,
    optionsOriginal: Map[String, String],
    finalPartition : Map[String, Option[String]],
    currPartitions: util.List[PartitionSpec],
    partitionStatus : SegmentStatus,
    var dataFrame: Option[DataFrame],
    scanResultRDD : Option[RDD[InternalRow]],
    updateModel: Option[UpdateTableModel],
    operationContext: OperationContext) {
}
Example 194
Source File: FileUtils.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import java.io.{File, IOException}

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkContext

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.datastore.filesystem.CarbonFile
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.DatabaseLocationProvider
import org.apache.carbondata.core.util.CarbonUtil
import org.apache.carbondata.events.{CreateDatabasePostExecutionEvent, OperationContext, OperationListenerBus}
import org.apache.carbondata.processing.exception.DataLoadingException

object FileUtils {

  def getPaths(inputPath: String): String = {
    getPaths(inputPath, FileFactory.getConfiguration)
  }

  def getPaths(inputPath: String, hadoopConf: Configuration): String = {
    if (inputPath == null || inputPath.isEmpty) {
      throw new DataLoadingException("Input file path cannot be empty.")
    } else {
      val stringBuild = new StringBuilder()
      val filePaths = inputPath.split(",").map(_.trim)
      for (i <- 0 until filePaths.size) {
        val filePath = CarbonUtil.checkAndAppendHDFSUrl(filePaths(i))
        val carbonFile = FileFactory.getCarbonFile(filePath, hadoopConf)
        if (!carbonFile.exists()) {
          throw new DataLoadingException(
            s"The input file does not exist: ${CarbonUtil.removeAKSK(filePaths(i))}")
        }
        getPathsFromCarbonFile(carbonFile, stringBuild, hadoopConf)
      }
      if (stringBuild.nonEmpty) {
        stringBuild.substring(0, stringBuild.size - 1)
      } else {
        throw new DataLoadingException("Please check your input path and make sure " +
          "that files end with '.csv' and content is not empty.")
      }
    }
  }

  def getSpaceOccupied(inputPath: String, hadoopConfiguration: Configuration): Long = {
    var size : Long = 0
    if (inputPath == null || inputPath.isEmpty) {
      size
    } else {
      val filePaths = inputPath.split(",")
      for (i <- 0 until filePaths.size) {
        val carbonFile = FileFactory.getCarbonFile(filePaths(i), hadoopConfiguration)
        size = size + carbonFile.getSize
      }
      size
    }
  }

  def createDatabaseDirectory(dbName: String, storePath: String, sparkContext: SparkContext) {
    val databasePath: String =
      storePath + File.separator + DatabaseLocationProvider.get().provide(dbName.toLowerCase)
    FileFactory.mkdirs(databasePath)
    val operationContext = new OperationContext
    val createDatabasePostExecutionEvent = new CreateDatabasePostExecutionEvent(dbName,
      databasePath, sparkContext)
    OperationListenerBus.getInstance.fireEvent(createDatabasePostExecutionEvent, operationContext)
  }
}
Example 195
Source File: TableLoader.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import java.util.Properties

import scala.collection.{immutable, mutable}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.command.management.CarbonLoadDataCommand

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.util.CarbonProperties

// scalastyle:off
object TableLoader {

  def extractOptions(propertiesFile: String): immutable.Map[String, String] = {
    val props = new Properties
    val path = new Path(propertiesFile)
    val fs = path.getFileSystem(FileFactory.getConfiguration)
    props.load(fs.open(path))
    val elments = props.entrySet().iterator()
    val map = new mutable.HashMap[String, String]()
    System.out.println("properties file:")
    while (elments.hasNext) {
      val elment = elments.next()
      System.out.println(s"${elment.getKey}=${elment.getValue}")
      map.put(elment.getKey.asInstanceOf[String], elment.getValue.asInstanceOf[String])
    }

    immutable.Map(map.toSeq: _*)
  }

  def extractStorePath(map: immutable.Map[String, String]): String = {
    map.get(CarbonCommonConstants.STORE_LOCATION) match {
      case Some(path) => path
      case None => throw new Exception(s"${CarbonCommonConstants.STORE_LOCATION} can't be empty")
    }
  }

  def loadTable(spark: SparkSession, dbName: Option[String], tableName: String, inputPaths: String,
      options: scala.collection.immutable.Map[String, String]): Unit = {
    CarbonLoadDataCommand(dbName, tableName, inputPaths, Nil, options, false).run(spark)
  }

  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      System.err.println("Usage: TableLoader <properties file> <table name> <input files>")
      System.exit(1)
    }
    System.out.println("parameter list:")
    args.foreach(System.out.println)

    val map = extractOptions(TableAPIUtil.escape(args(0)))
    val storePath = extractStorePath(map)
    System.out.println(s"${CarbonCommonConstants.STORE_LOCATION}:$storePath")

    val (dbName, tableName) = TableAPIUtil.parseSchemaName(TableAPIUtil.escape(args(1)))
    System.out.println(s"table name: $dbName.$tableName")

    val inputPaths = TableAPIUtil.escape(args(2))

    val spark = TableAPIUtil.spark(storePath, s"TableLoader: $dbName.$tableName")

    loadTable(spark, Option(dbName), tableName, inputPaths, map)
  }
}
Example 196
Source File: CarbonDeltaRowScanRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.rdd

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.conf.Configuration
import org.apache.spark.Partition
import org.apache.spark.sql.SparkSession

import org.apache.carbondata.converter.SparkDataTypeConverterImpl
import org.apache.carbondata.core.index.IndexFilter
import org.apache.carbondata.core.indexstore.PartitionSpec
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
import org.apache.carbondata.core.metadata.schema.table.{CarbonTable, TableInfo}
import org.apache.carbondata.core.mutate.CarbonUpdateUtil
import org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager
import org.apache.carbondata.core.util.DataTypeConverter
import org.apache.carbondata.hadoop.{CarbonMultiBlockSplit, CarbonProjection}
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat
import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport
import org.apache.carbondata.spark.InitInputMetrics

class CarbonDeltaRowScanRDD[T: ClassTag](
    @transient private val spark: SparkSession,
    @transient private val serializedTableInfo: Array[Byte],
    @transient private val tableInfo: TableInfo,
    @transient override val partitionNames: Seq[PartitionSpec],
    override val columnProjection: CarbonProjection,
    var filter: IndexFilter,
    identifier: AbsoluteTableIdentifier,
    inputMetricsStats: InitInputMetrics,
    override val dataTypeConverterClz: Class[_ <: DataTypeConverter] =
      classOf[SparkDataTypeConverterImpl],
    override val readSupportClz: Class[_ <: CarbonReadSupport[_]] =
      SparkReadSupport.readSupportClass,
    deltaVersionToRead: String)
  extends CarbonScanRDD[T](
    spark,
    columnProjection,
    filter,
    identifier,
    serializedTableInfo,
    tableInfo,
    inputMetricsStats,
    partitionNames,
    dataTypeConverterClz,
    readSupportClz) {

  override def internalGetPartitions: Array[Partition] = {
    val table = CarbonTable.buildFromTableInfo(getTableInfo)
    val updateStatusManager = new SegmentUpdateStatusManager(table, deltaVersionToRead)
    val parts = super.internalGetPartitions
    parts.map { p =>
      val partition = p.asInstanceOf[CarbonSparkPartition]
      val splits = partition.multiBlockSplit.getAllSplits.asScala.filter { s =>
        updateStatusManager.getDetailsForABlock(
          CarbonUpdateUtil.getSegmentBlockNameKey(s.getSegmentId, s.getBlockPath,
            table.isHivePartitionTable)) != null
      }.asJava
      new CarbonSparkPartition(partition.rddId, partition.index,
        new CarbonMultiBlockSplit(splits, partition.multiBlockSplit.getLocations))
    }.filter(p => p.multiBlockSplit.getAllSplits.size() > 0).zipWithIndex.map { case (p, index) =>
      new CarbonSparkPartition(p.rddId, index, p.multiBlockSplit)
    }.asInstanceOf[Array[Partition]]
  }

  override def createInputFormat(conf: Configuration): CarbonTableInputFormat[Object] = {
    val format = super.createInputFormat(conf)
    conf.set("updateDeltaVersion", deltaVersionToRead)
    conf.set("readDeltaOnly", "true")
    format
  }
}
Example 197
Source File: CarbonRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.rdd

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.conf.Configuration
import org.apache.spark.{Dependency, OneToOneDependency, Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.util.SparkSQLUtil

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.metadata.schema.table.TableInfo
import org.apache.carbondata.core.util._

abstract class CarbonRDDWithTableInfo[T: ClassTag](
    @transient private val ss: SparkSession,
    @transient private var deps: Seq[Dependency[_]],
    serializedTableInfo: Array[Byte]) extends CarbonRDD[T](ss, deps) {

  def this(@transient sparkSession: SparkSession, @transient oneParent: RDD[_],
      serializedTableInfo: Array[Byte]) = {
    this (sparkSession, List(new OneToOneDependency(oneParent)), serializedTableInfo)
  }

  def getTableInfo: TableInfo = TableInfo.deserialize(serializedTableInfo)
}
Example 198
Source File: TestDataLoadWithFileName.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.dataload

import scala.collection.JavaConverters._

import java.io.{File, FilenameFilter}

import org.apache.hadoop.conf.Configuration

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.reader.CarbonIndexFileReader
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.core.util.path.CarbonTablePath
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll

import org.apache.carbondata.core.index.Segment
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.{CarbonMetadata, SegmentFileStore}

class TestDataLoadWithFileName extends QueryTest with BeforeAndAfterAll {
  var originVersion = ""

  override def beforeAll() {
    originVersion =
      CarbonProperties.getInstance.getProperty(CarbonCommonConstants.CARBON_DATA_FILE_VERSION)
  }

  test("Check the file_name in carbonindex with v3 format") {
    CarbonProperties.getInstance.addProperty(CarbonCommonConstants.CARBON_DATA_FILE_VERSION, "3")
    sql("DROP TABLE IF EXISTS test_table_v3")
    sql(
      """
        | CREATE TABLE test_table_v3(id int, name string, city string, age int)
        | STORED AS carbondata
      """.stripMargin)
    val testData = s"$resourcesPath/sample.csv"
    sql(s"LOAD DATA LOCAL INPATH '$testData' into table test_table_v3")
    val indexReader = new CarbonIndexFileReader()
    val carbonTable = CarbonMetadata.getInstance().getCarbonTable("default", "test_table_v3")
    val segmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, "0")

    val carbonIndexPaths = if (FileFactory.isFileExist(segmentDir)) {
      new File(segmentDir)
        .listFiles(new FilenameFilter {
          override def accept(dir: File, name: String): Boolean = {
            name.endsWith(CarbonTablePath.getCarbonIndexExtension)
          }
        })
    } else {
      val segment = Segment.getSegment("0", carbonTable.getTablePath)
      val store = new SegmentFileStore(carbonTable.getTablePath, segment.getSegmentFileName)
      store.readIndexFiles(new Configuration(false))
      store.getIndexCarbonFiles.asScala.map(f => new File(f.getAbsolutePath)).toArray
    }
    for (carbonIndexPath <- carbonIndexPaths) {
      indexReader.openThriftReader(carbonIndexPath.getCanonicalPath)
      assert(indexReader.readIndexHeader().getVersion === 3)
      while (indexReader.hasNext) {
        val readBlockIndexInfo = indexReader.readBlockIndexInfo()
        assert(readBlockIndexInfo.getFile_name.startsWith(CarbonTablePath.getCarbonDataPrefix))
        assert(readBlockIndexInfo.getFile_name.endsWith(CarbonTablePath.getCarbonDataExtension))
      }
    }
  }

  override protected def afterAll() {
    sql("DROP TABLE IF EXISTS test_table_v1")
    sql("DROP TABLE IF EXISTS test_table_v2")
    sql("DROP TABLE IF EXISTS test_table_v3")
    CarbonProperties.getInstance.addProperty(CarbonCommonConstants.CARBON_DATA_FILE_VERSION,
      originVersion)
  }
}
Example 199
Source File: HadoopFileExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.{SaveMode, SparkSession}

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
import org.apache.carbondata.hadoop.CarbonProjection

// scalastyle:off println
object HadoopFileExample {

  def main(args: Array[String]): Unit = {
    val spark = ExampleUtils.createSparkSession("HadoopFileExample")
    val rootPath = new File(this.getClass.getResource("/").getPath
                            + "../../../..").getCanonicalPath
    val storeLocation: String = rootPath + "/examples/spark/target/store/default"
    exampleBody(spark, storeLocation)
    spark.close()
  }

  def exampleBody(spark : SparkSession, storeLocation : String): Unit = {
    import spark.implicits._
    val df = spark.sparkContext.parallelize(1 to 1000)
      .map(x => ("a", "b", x))
      .toDF("c1", "c2", "c3")

    df.write.format("carbondata")
      .option("tableName", "Hadoopfile_table")
      .option("compress", "true")
      .mode(SaveMode.Overwrite).save()

    // read two columns
    val projection = new CarbonProjection
    projection.addColumn("c1")  // column c1
    projection.addColumn("c3")  // column c3

    val conf = new Configuration()
    CarbonInputFormat.setColumnProjection(conf, projection)
    CarbonInputFormat.setDatabaseName(conf, "default")
    CarbonInputFormat.setTableName(conf, "Hadoopfile_table")

    val input = spark.sparkContext.newAPIHadoopFile(s"${storeLocation}/Hadoopfile_table",
      classOf[CarbonTableInputFormat[Array[Object]]],
      classOf[Void],
      classOf[Array[Object]],
      conf)
    val result = input.map(x => x._2.toList).collect
    result.foreach(x => println(x.mkString(", ")))

    // delete carbondata file
    ExampleUtils.cleanSampleCarbonFile(spark, "Hadoopfile_table")
  }
}
// scalastyle:on println
Example 200
Source File: FlinkExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples

import org.apache.flink.api.java.ExecutionEnvironment
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.Job

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.carbondata.hadoop.CarbonProjection
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}

// Write carbondata file by spark and read it by flink
// scalastyle:off println
object FlinkExample {

  def main(args: Array[String]): Unit = {
    // write carbondata file by spark
    val cc = ExampleUtils.createCarbonSession("FlinkExample")
    val path = ExampleUtils.writeSampleCarbonFile(cc, "carbon1")

    // read two columns by flink
    val projection = new CarbonProjection
    projection.addColumn("c1")  // column c1
    projection.addColumn("c3")  // column c3
    val conf = new Configuration()
    CarbonInputFormat.setColumnProjection(conf, projection)

    val env = ExecutionEnvironment.getExecutionEnvironment
    val ds = env.readHadoopFile(
      new CarbonTableInputFormat[Array[Object]],
      classOf[Void],
      classOf[Array[Object]],
      path,
      new Job(conf)
    )

    // print result
    val result = ds.collect()
    for (i <- 0 until result.size()) {
      println(result.get(i).f1.mkString(","))
    }

    // delete carbondata file
    ExampleUtils.cleanSampleCarbonFile(cc, "carbon1")
  }
}
// scalastyle:on println