org.apache.spark.sql.types.AtomicType Scala Examples
The following examples show how to use org.apache.spark.sql.types.AtomicType.
The source file and the project it comes from are noted above each example.
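In Spark SQL, AtomicType is the base class of the primitive (non-complex) column types such as IntegerType and StringType, and in the examples below it appears almost exclusively as an upper type bound on generic columnar code. As a minimal sketch of what such a bound buys you (the object name is made up; only public DataType members are used):

import org.apache.spark.sql.types.{AtomicType, IntegerType, StringType}

// Hypothetical helper: accepts any atomic type and reports its public metadata.
object AtomicTypeBoundDemo {
  def describe[T <: AtomicType](dt: T): String =
    s"${dt.typeName}: default size ${dt.defaultSize} bytes"

  def main(args: Array[String]): Unit = {
    println(describe(IntegerType)) // integer: default size 4 bytes
    println(describe(StringType))
  }
}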
Example 1
Source File: CompressibleColumnBuilder.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder}
import org.apache.spark.sql.types.AtomicType
import org.apache.spark.unsafe.Platform

private[columnar] trait CompressibleColumnBuilder[T <: AtomicType]
  extends ColumnBuilder with Logging {

  this: NativeColumnBuilder[T] with WithCompressionSchemes =>

  var compressionEncoders: Seq[Encoder[T]] = _

  abstract override def initialize(
      initialSize: Int,
      columnName: String,
      useCompression: Boolean): Unit = {

    compressionEncoders =
      if (useCompression) {
        schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
      } else {
        Seq(PassThrough.encoder(columnType))
      }
    super.initialize(initialSize, columnName, useCompression)
  }

  // The various compression schemes, while saving memory use, cause all of the data within
  // the row to become unaligned, thus causing crashes. Until a way of fixing the compression
  // is found to also allow aligned accesses this must be disabled for SPARC.
  protected def isWorthCompressing(encoder: Encoder[T]) = {
    CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8
  }

  private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
    compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal))
  }

  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
    super.appendFrom(row, ordinal)
    if (!row.isNullAt(ordinal)) {
      gatherCompressibilityStats(row, ordinal)
    }
  }

  override def build(): ByteBuffer = {
    val nonNullBuffer = buildNonNulls()
    val encoder: Encoder[T] = {
      val candidate = compressionEncoders.minBy(_.compressionRatio)
      if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
    }

    // Header = null count + null positions
    val headerSize = 4 + nulls.limit()
    val compressedSize = if (encoder.compressedSize == 0) {
      nonNullBuffer.remaining()
    } else {
      encoder.compressedSize
    }

    val compressedBuffer = ByteBuffer
      // Reserves 4 bytes for compression scheme ID
      .allocate(headerSize + 4 + compressedSize)
      .order(ByteOrder.nativeOrder)
      // Write the header
      .putInt(nullCount)
      .put(nulls)

    logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
    encoder.compress(nonNullBuffer, compressedBuffer)
  }
}

private[columnar] object CompressibleColumnBuilder {
  val unaligned = Platform.unaligned()
}
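In practice this trait is not instantiated directly; it is mixed into the concrete builders created by the ColumnBuilder factory, and the result is read back through the matching ColumnAccessor (see Example 11). The sketch below is only illustrative: it assumes the Spark 2.3.x factory methods, is placed in the same package so the package-private classes are visible, and the object name is made up.

package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.execution.columnar.{ColumnAccessor, ColumnBuilder}
import org.apache.spark.sql.types.IntegerType

object CompressedIntColumnDemo {
  def main(args: Array[String]): Unit = {
    // Build a compressed INT column; long runs of equal values favour run-length encoding.
    val builder = ColumnBuilder(IntegerType, 16, "demo", useCompression = true)
    val row = new GenericInternalRow(1)
    (1 to 100).foreach { i =>
      row(0) = i / 10
      builder.appendFrom(row, 0)
    }
    val buffer = builder.build()

    // Read the values back through the matching accessor.
    val accessor = ColumnAccessor(IntegerType, buffer)
    val out = new GenericInternalRow(1)
    while (accessor.hasNext) {
      accessor.extractTo(out, 0)
    }
    println(s"compressed column: ${buffer.capacity()} bytes, last value read: ${out.getInt(0)}")
  }
}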
Example 2
Source File: CompressionScheme.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.MutableRow
import org.apache.spark.sql.columnar.{ColumnType, NativeColumnType}
import org.apache.spark.sql.types.AtomicType

private[sql] trait Encoder[T <: AtomicType] {
  def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {}

  def compressedSize: Int

  def uncompressedSize: Int

  def compressionRatio: Double = {
    if (uncompressedSize > 0) compressedSize.toDouble / uncompressedSize else 1.0
  }

  def compress(from: ByteBuffer, to: ByteBuffer): ByteBuffer
}

private[sql] trait Decoder[T <: AtomicType] {
  def next(row: MutableRow, ordinal: Int): Unit

  def hasNext: Boolean
}

private[sql] trait CompressionScheme {
  def typeId: Int

  def supports(columnType: ColumnType[_, _]): Boolean

  def encoder[T <: AtomicType](columnType: NativeColumnType[T]): Encoder[T]

  def decoder[T <: AtomicType](buffer: ByteBuffer, columnType: NativeColumnType[T]): Decoder[T]
}

private[sql] trait WithCompressionSchemes {
  def schemes: Seq[CompressionScheme]
}

private[sql] trait AllCompressionSchemes extends WithCompressionSchemes {
  override val schemes: Seq[CompressionScheme] = CompressionScheme.all
}

private[sql] object CompressionScheme {
  val all: Seq[CompressionScheme] =
    Seq(PassThrough, RunLengthEncoding, DictionaryEncoding, BooleanBitSet, IntDelta, LongDelta)

  private val typeIdToScheme = all.map(scheme => scheme.typeId -> scheme).toMap

  def apply(typeId: Int): CompressionScheme = {
    typeIdToScheme.getOrElse(typeId,
      throw new UnsupportedOperationException(s"Unrecognized compression scheme type ID: $typeId"))
  }

  def columnHeaderSize(columnBuffer: ByteBuffer): Int = {
    val header = columnBuffer.duplicate().order(ByteOrder.nativeOrder)
    val nullCount = header.getInt(4)
    // Column type ID + null count + null positions
    4 + 4 + 4 * nullCount
  }
}
Example 3
Source File: TestCompressibleColumnBuilder.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar.compression

import org.apache.spark.sql.columnar._
import org.apache.spark.sql.types.AtomicType

class TestCompressibleColumnBuilder[T <: AtomicType](
    override val columnStats: ColumnStats,
    override val columnType: NativeColumnType[T],
    override val schemes: Seq[CompressionScheme])
  extends NativeColumnBuilder(columnStats, columnType)
  with NullableColumnBuilder
  with CompressibleColumnBuilder[T] {

  override protected def isWorthCompressing(encoder: Encoder[T]) = true
}

object TestCompressibleColumnBuilder {
  def apply[T <: AtomicType](
      columnStats: ColumnStats,
      columnType: NativeColumnType[T],
      scheme: CompressionScheme): TestCompressibleColumnBuilder[T] = {

    val builder = new TestCompressibleColumnBuilder(columnStats, columnType, Seq(scheme))
    builder.initialize(0, "", useCompression = true)
    builder
  }
}
Example 4
Source File: ColumnarTestUtils.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar

import java.sql.Timestamp

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
import org.apache.spark.sql.types.{UTF8String, DataType, Decimal, AtomicType}

object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericMutableRow = {
    val row = new GenericMutableRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  def makeRandomValue[T <: DataType, JvmType](columnType: ColumnType[T, JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case LONG => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case FIXED_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case STRING => UTF8String(Random.nextString(Random.nextInt(32)))
      case BOOLEAN => Random.nextBoolean()
      case BINARY => randomBytes(Random.nextInt(32))
      case DATE => Random.nextInt()
      case TIMESTAMP =>
        val timestamp = new Timestamp(Random.nextLong())
        timestamp.setNanos(Random.nextInt(999999999))
        timestamp
      case _ =>
        // Using a random one-element map instead of an arbitrary object
        Map(Random.nextInt() -> Random.nextString(Random.nextInt(32)))
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_ <: DataType, _],
      tail: ColumnType[_ <: DataType, _]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_ <: DataType, _]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  def makeUniqueRandomValues[T <: DataType, JvmType](
      columnType: ColumnType[T, JvmType],
      count: Int): Seq[JvmType] = {

    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_ <: DataType, _],
      tail: ColumnType[_ <: DataType, _]*): Row = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_ <: DataType, _]]): Row = {
    val row = new GenericMutableRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = {

    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericMutableRow(1)
      row(0) = value
      row
    }

    (values, rows)
  }
}
Example 5
Source File: CompressibleColumnBuilder.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.columnar.{ColumnBuilder, NativeColumnBuilder}
import org.apache.spark.sql.types.AtomicType

private[sql] trait CompressibleColumnBuilder[T <: AtomicType]
  extends ColumnBuilder with Logging {

  this: NativeColumnBuilder[T] with WithCompressionSchemes =>

  var compressionEncoders: Seq[Encoder[T]] = _

  abstract override def initialize(
      initialSize: Int,
      columnName: String,
      useCompression: Boolean): Unit = {

    compressionEncoders =
      if (useCompression) {
        schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
      } else {
        Seq(PassThrough.encoder(columnType))
      }
    super.initialize(initialSize, columnName, useCompression)
  }

  protected def isWorthCompressing(encoder: Encoder[T]) = {
    encoder.compressionRatio < 0.8
  }

  private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
    var i = 0
    while (i < compressionEncoders.length) {
      compressionEncoders(i).gatherCompressibilityStats(row, ordinal)
      i += 1
    }
  }

  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
    super.appendFrom(row, ordinal)
    if (!row.isNullAt(ordinal)) {
      gatherCompressibilityStats(row, ordinal)
    }
  }

  override def build(): ByteBuffer = {
    val nonNullBuffer = buildNonNulls()
    val typeId = nonNullBuffer.getInt()
    val encoder: Encoder[T] = {
      val candidate = compressionEncoders.minBy(_.compressionRatio)
      if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
    }

    // Header = column type ID + null count + null positions
    val headerSize = 4 + 4 + nulls.limit()
    val compressedSize = if (encoder.compressedSize == 0) {
      nonNullBuffer.remaining()
    } else {
      encoder.compressedSize
    }

    val compressedBuffer = ByteBuffer
      // Reserves 4 bytes for compression scheme ID
      .allocate(headerSize + 4 + compressedSize)
      .order(ByteOrder.nativeOrder)
      // Write the header
      .putInt(typeId)
      .putInt(nullCount)
      .put(nulls)

    logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
    encoder.compress(nonNullBuffer, compressedBuffer)
  }
}
Example 6
Source File: CompressibleColumnAccessor.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar.compression

import org.apache.spark.sql.catalyst.expressions.MutableRow
import org.apache.spark.sql.columnar.{ColumnAccessor, NativeColumnAccessor}
import org.apache.spark.sql.types.AtomicType

private[sql] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor {
  this: NativeColumnAccessor[T] =>

  private var decoder: Decoder[T] = _

  abstract override protected def initialize(): Unit = {
    super.initialize()
    decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType)
  }

  abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext

  override def extractSingle(row: MutableRow, ordinal: Int): Unit = {
    decoder.next(row, ordinal)
  }
}
Example 7
Source File: CompressionScheme.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.MutableRow
import org.apache.spark.sql.columnar.{ColumnType, NativeColumnType}
import org.apache.spark.sql.types.AtomicType

private[sql] trait Encoder[T <: AtomicType] {
  def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {}

  def compressedSize: Int

  def uncompressedSize: Int

  def compressionRatio: Double = {
    if (uncompressedSize > 0) compressedSize.toDouble / uncompressedSize else 1.0
  }

  def compress(from: ByteBuffer, to: ByteBuffer): ByteBuffer
}

private[sql] trait Decoder[T <: AtomicType] {
  def next(row: MutableRow, ordinal: Int): Unit

  def hasNext: Boolean
}

private[sql] trait CompressionScheme {
  def typeId: Int

  def supports(columnType: ColumnType[_]): Boolean

  def encoder[T <: AtomicType](columnType: NativeColumnType[T]): Encoder[T]

  def decoder[T <: AtomicType](buffer: ByteBuffer, columnType: NativeColumnType[T]): Decoder[T]
}

private[sql] trait WithCompressionSchemes {
  def schemes: Seq[CompressionScheme]
}

private[sql] trait AllCompressionSchemes extends WithCompressionSchemes {
  override val schemes: Seq[CompressionScheme] = CompressionScheme.all
}

private[sql] object CompressionScheme {
  val all: Seq[CompressionScheme] =
    Seq(PassThrough, RunLengthEncoding, DictionaryEncoding, BooleanBitSet, IntDelta, LongDelta)

  private val typeIdToScheme = all.map(scheme => scheme.typeId -> scheme).toMap

  def apply(typeId: Int): CompressionScheme = {
    typeIdToScheme.getOrElse(typeId,
      throw new UnsupportedOperationException(s"Unrecognized compression scheme type ID: $typeId"))
  }

  def columnHeaderSize(columnBuffer: ByteBuffer): Int = {
    val header = columnBuffer.duplicate().order(ByteOrder.nativeOrder)
    val nullCount = header.getInt(4)
    // Column type ID + null count + null positions
    4 + 4 + 4 * nullCount
  }
}
Example 8
Source File: TestCompressibleColumnBuilder.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar.compression

import org.apache.spark.sql.columnar._
import org.apache.spark.sql.types.AtomicType

// Test compressible column builder
class TestCompressibleColumnBuilder[T <: AtomicType](
    override val columnStats: ColumnStats,
    override val columnType: NativeColumnType[T],
    override val schemes: Seq[CompressionScheme])
  extends NativeColumnBuilder(columnStats, columnType)
  with NullableColumnBuilder
  with CompressibleColumnBuilder[T] {

  override protected def isWorthCompressing(encoder: Encoder[T]) = true
}

object TestCompressibleColumnBuilder {
  def apply[T <: AtomicType](
      columnStats: ColumnStats,
      columnType: NativeColumnType[T],
      scheme: CompressionScheme): TestCompressibleColumnBuilder[T] = {

    val builder = new TestCompressibleColumnBuilder(columnStats, columnType, Seq(scheme))
    builder.initialize(0, "", useCompression = true)
    builder
  }
}
Example 9
Source File: ColumnarTestUtils.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
import org.apache.spark.sql.types.{DataType, Decimal, AtomicType}
import org.apache.spark.unsafe.types.UTF8String

// Columnar test utilities
object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericMutableRow = {
    val row = new GenericMutableRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  // Generate a random value
  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case BOOLEAN => Random.nextBoolean()
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case DATE => Random.nextInt()
      case LONG => Random.nextLong()
      case TIMESTAMP => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
      case BINARY => randomBytes(Random.nextInt(32))
      case FIXED_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case _ =>
        // Using a random one-element map instead of an arbitrary object
        Map(Random.nextInt() -> Random.nextString(Random.nextInt(32)))
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_],
      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  // Generate unique random values
  def makeUniqueRandomValues[JvmType](
      columnType: ColumnType[JvmType],
      count: Int): Seq[JvmType] = {

    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_],
      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
    val row = new GenericMutableRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  // Generate unique values and single-value rows
  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = {

    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericMutableRow(1)
      row(0) = value
      row
    }

    (values, rows)
  }
}
Example 10
Source File: CompressibleColumnAccessor.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar.compression

import org.apache.spark.sql.catalyst.expressions.MutableRow
import org.apache.spark.sql.columnar.{ColumnAccessor, NativeColumnAccessor}
import org.apache.spark.sql.types.AtomicType

private[sql] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor {
  this: NativeColumnAccessor[T] =>

  private var decoder: Decoder[T] = _

  abstract override protected def initialize(): Unit = {
    super.initialize()
    decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType)
  }

  abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext

  override def extractSingle(row: MutableRow, ordinal: Int): Unit = {
    decoder.next(row, ordinal)
  }
}
Example 11
Source File: CompressibleColumnAccessor.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor}
import org.apache.spark.sql.execution.vectorized.WritableColumnVector
import org.apache.spark.sql.types.AtomicType

private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor {
  this: NativeColumnAccessor[T] =>

  private var decoder: Decoder[T] = _

  abstract override protected def initialize(): Unit = {
    super.initialize()
    decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType)
  }

  abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext

  override def extractSingle(row: InternalRow, ordinal: Int): Unit = {
    decoder.next(row, ordinal)
  }

  def decompress(columnVector: WritableColumnVector, capacity: Int): Unit =
    decoder.decompress(columnVector, capacity)
}
Example 12
Source File: TestCompressibleColumnBuilder.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.execution.columnar._
import org.apache.spark.sql.types.{AtomicType, DataType}

class TestCompressibleColumnBuilder[T <: AtomicType](
    override val columnStats: ColumnStats,
    override val columnType: NativeColumnType[T],
    override val schemes: Seq[CompressionScheme])
  extends NativeColumnBuilder(columnStats, columnType)
  with NullableColumnBuilder
  with CompressibleColumnBuilder[T] {

  override protected def isWorthCompressing(encoder: Encoder[T]) = true
}

object TestCompressibleColumnBuilder {
  def apply[T <: AtomicType](
      columnStats: ColumnStats,
      columnType: NativeColumnType[T],
      scheme: CompressionScheme): TestCompressibleColumnBuilder[T] = {

    val builder = new TestCompressibleColumnBuilder(columnStats, columnType, Seq(scheme))
    builder.initialize(0, "", useCompression = true)
    builder
  }
}

object ColumnBuilderHelper {
  def apply(
      dataType: DataType, batchSize: Int, name: String, useCompression: Boolean): ColumnBuilder = {
    ColumnBuilder(dataType, batchSize, name, useCompression)
  }
}
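A sketch of how such a test builder is typically driven, forcing a specific scheme regardless of the measured compression ratio. It assumes the Spark 2.3.x test fixtures (IntColumnStats, INT, RunLengthEncoding) and placement in the same package; the object name is hypothetical.

package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.execution.columnar.{INT, IntColumnStats}

object RunLengthBuilderDemo {
  def main(args: Array[String]): Unit = {
    // isWorthCompressing always returns true here, so RunLengthEncoding is used unconditionally.
    val builder = TestCompressibleColumnBuilder(new IntColumnStats, INT, RunLengthEncoding)
    val row = new GenericInternalRow(1)
    Seq(1, 1, 1, 2, 2, 3).foreach { v =>
      row(0) = v
      builder.appendFrom(row, 0)
    }
    val compressed = builder.build()
    println(s"run-length encoded column occupies ${compressed.remaining()} bytes")
  }
}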
Example 13
Source File: ColumnarTestUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{AtomicType, Decimal}
import org.apache.spark.unsafe.types.UTF8String

object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericInternalRow = {
    val row = new GenericInternalRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case NULL => null
      case BOOLEAN => Random.nextBoolean()
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case LONG => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
      case BINARY => randomBytes(Random.nextInt(32))
      case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale)
      case STRUCT(_) =>
        new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10))))
      case ARRAY(_) =>
        new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt()))
      case MAP(_) =>
        ArrayBasedMapData(
          Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)))))
      case _ => throw new IllegalArgumentException(s"Unknown column type $columnType")
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_],
      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  def makeUniqueRandomValues[JvmType](
      columnType: ColumnType[JvmType],
      count: Int): Seq[JvmType] = {

    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_],
      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
    val row = new GenericInternalRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = {

    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericInternalRow(1)
      row(0) = value
      row
    }

    (values, rows)
  }
}
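These helpers are intended to be called from other suites in the same package. A brief sketch of typical usage (the object name is made up; INT and STRING are the package-private ColumnType instances assumed from Spark 2.3.x):

package org.apache.spark.sql.execution.columnar

object ColumnarTestUtilsDemo {
  def main(args: Array[String]): Unit = {
    // Ten distinct random integers plus the matching single-column rows.
    val (values, rows) = ColumnarTestUtils.makeUniqueValuesAndSingleValueRows(INT, 10)
    println(s"${values.size} unique values, ${rows.size} rows")

    // One random row laid out as (INT, STRING).
    val row = ColumnarTestUtils.makeRandomRow(INT, STRING)
    println(s"random row has ${row.numFields} fields")
  }
}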
Example 14
Source File: CompressibleColumnBuilder.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder}
import org.apache.spark.sql.types.AtomicType

private[columnar] trait CompressibleColumnBuilder[T <: AtomicType]
  extends ColumnBuilder with Logging {

  this: NativeColumnBuilder[T] with WithCompressionSchemes =>

  var compressionEncoders: Seq[Encoder[T]] = _

  abstract override def initialize(
      initialSize: Int,
      columnName: String,
      useCompression: Boolean): Unit = {

    compressionEncoders =
      if (useCompression) {
        schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
      } else {
        Seq(PassThrough.encoder(columnType))
      }
    super.initialize(initialSize, columnName, useCompression)
  }

  protected def isWorthCompressing(encoder: Encoder[T]) = {
    encoder.compressionRatio < 0.8
  }

  private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
    var i = 0
    while (i < compressionEncoders.length) {
      compressionEncoders(i).gatherCompressibilityStats(row, ordinal)
      i += 1
    }
  }

  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
    super.appendFrom(row, ordinal)
    if (!row.isNullAt(ordinal)) {
      gatherCompressibilityStats(row, ordinal)
    }
  }

  override def build(): ByteBuffer = {
    val nonNullBuffer = buildNonNulls()
    val encoder: Encoder[T] = {
      val candidate = compressionEncoders.minBy(_.compressionRatio)
      if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
    }

    // Header = null count + null positions
    val headerSize = 4 + nulls.limit()
    val compressedSize = if (encoder.compressedSize == 0) {
      nonNullBuffer.remaining()
    } else {
      encoder.compressedSize
    }

    val compressedBuffer = ByteBuffer
      // Reserves 4 bytes for compression scheme ID
      .allocate(headerSize + 4 + compressedSize)
      .order(ByteOrder.nativeOrder)
      // Write the header
      .putInt(nullCount)
      .put(nulls)

    logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
    encoder.compress(nonNullBuffer, compressedBuffer)
  }
}
Example 15
Source File: CompressibleColumnAccessor.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.catalyst.expressions.MutableRow
import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor}
import org.apache.spark.sql.types.AtomicType

private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor {
  this: NativeColumnAccessor[T] =>

  private var decoder: Decoder[T] = _

  abstract override protected def initialize(): Unit = {
    super.initialize()
    decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType)
  }

  abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext

  override def extractSingle(row: MutableRow, ordinal: Int): Unit = {
    decoder.next(row, ordinal)
  }
}
Example 16
Source File: CompressionScheme.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.MutableRow
import org.apache.spark.sql.execution.columnar.{ColumnType, NativeColumnType}
import org.apache.spark.sql.types.AtomicType

private[columnar] trait Encoder[T <: AtomicType] {
  def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {}

  def compressedSize: Int

  def uncompressedSize: Int

  def compressionRatio: Double = {
    if (uncompressedSize > 0) compressedSize.toDouble / uncompressedSize else 1.0
  }

  def compress(from: ByteBuffer, to: ByteBuffer): ByteBuffer
}

private[columnar] trait Decoder[T <: AtomicType] {
  def next(row: MutableRow, ordinal: Int): Unit

  def hasNext: Boolean
}

private[columnar] trait CompressionScheme {
  def typeId: Int

  def supports(columnType: ColumnType[_]): Boolean

  def encoder[T <: AtomicType](columnType: NativeColumnType[T]): Encoder[T]

  def decoder[T <: AtomicType](buffer: ByteBuffer, columnType: NativeColumnType[T]): Decoder[T]
}

private[columnar] trait WithCompressionSchemes {
  def schemes: Seq[CompressionScheme]
}

private[columnar] trait AllCompressionSchemes extends WithCompressionSchemes {
  override val schemes: Seq[CompressionScheme] = CompressionScheme.all
}

private[columnar] object CompressionScheme {
  val all: Seq[CompressionScheme] =
    Seq(PassThrough, RunLengthEncoding, DictionaryEncoding, BooleanBitSet, IntDelta, LongDelta)

  private val typeIdToScheme = all.map(scheme => scheme.typeId -> scheme).toMap

  def apply(typeId: Int): CompressionScheme = {
    typeIdToScheme.getOrElse(typeId,
      throw new UnsupportedOperationException(s"Unrecognized compression scheme type ID: $typeId"))
  }

  def columnHeaderSize(columnBuffer: ByteBuffer): Int = {
    val header = columnBuffer.duplicate().order(ByteOrder.nativeOrder)
    val nullCount = header.getInt()
    // null count + null positions
    4 + 4 * nullCount
  }
}
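Given a column buffer produced by the matching CompressibleColumnBuilder (null count + null positions header, followed by a 4-byte scheme ID and the compressed body), columnHeaderSize can be used to locate and recover the scheme. A small sketch under those assumptions (same package, buffer positioned at the start of the column, hypothetical object name):

package org.apache.spark.sql.execution.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

object SchemeInspector {
  // Looks up which CompressionScheme was chosen for an already-built column buffer.
  def schemeOf(columnBuffer: ByteBuffer): CompressionScheme = {
    val headerSize = CompressionScheme.columnHeaderSize(columnBuffer) // null count + null positions
    val schemeId = columnBuffer.duplicate().order(ByteOrder.nativeOrder).getInt(headerSize)
    CompressionScheme(schemeId) // e.g. RunLengthEncoding
  }
}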
Example 17
Source File: TestCompressibleColumnBuilder.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.execution.columnar._
import org.apache.spark.sql.types.AtomicType

class TestCompressibleColumnBuilder[T <: AtomicType](
    override val columnStats: ColumnStats,
    override val columnType: NativeColumnType[T],
    override val schemes: Seq[CompressionScheme])
  extends NativeColumnBuilder(columnStats, columnType)
  with NullableColumnBuilder
  with CompressibleColumnBuilder[T] {

  override protected def isWorthCompressing(encoder: Encoder[T]) = true
}

object TestCompressibleColumnBuilder {
  def apply[T <: AtomicType](
      columnStats: ColumnStats,
      columnType: NativeColumnType[T],
      scheme: CompressionScheme): TestCompressibleColumnBuilder[T] = {

    val builder = new TestCompressibleColumnBuilder(columnStats, columnType, Seq(scheme))
    builder.initialize(0, "", useCompression = true)
    builder
  }
}
Example 18
Source File: ColumnarTestUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, GenericMutableRow}
import org.apache.spark.sql.catalyst.util.{GenericArrayData, ArrayBasedMapData}
import org.apache.spark.sql.types.{AtomicType, Decimal}
import org.apache.spark.unsafe.types.UTF8String

object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericMutableRow = {
    val row = new GenericMutableRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case NULL => null
      case BOOLEAN => Random.nextBoolean()
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case LONG => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
      case BINARY => randomBytes(Random.nextInt(32))
      case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale)
      case STRUCT(_) =>
        new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10))))
      case ARRAY(_) =>
        new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt()))
      case MAP(_) =>
        ArrayBasedMapData(
          Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)))))
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_],
      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  def makeUniqueRandomValues[JvmType](
      columnType: ColumnType[JvmType],
      count: Int): Seq[JvmType] = {

    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_],
      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
    val row = new GenericMutableRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = {

    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericMutableRow(1)
      row(0) = value
      row
    }

    (values, rows)
  }
}
Example 19
Source File: CompressibleColumnAccessor.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor}
import org.apache.spark.sql.types.AtomicType

private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor {
  this: NativeColumnAccessor[T] =>

  private var decoder: Decoder[T] = _

  abstract override protected def initialize(): Unit = {
    super.initialize()
    decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType)
  }

  abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext

  override def extractSingle(row: InternalRow, ordinal: Int): Unit = {
    decoder.next(row, ordinal)
  }
}
Example 20
Source File: CompressibleColumnAccessor.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor}
import org.apache.spark.sql.types.AtomicType

private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor {
  this: NativeColumnAccessor[T] =>

  private var decoder: Decoder[T] = _

  abstract override protected def initialize(): Unit = {
    super.initialize()
    decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType)
  }

  abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext

  override def extractSingle(row: InternalRow, ordinal: Int): Unit = {
    decoder.next(row, ordinal)
  }
}
Example 21
Source File: TestCompressibleColumnBuilder.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.execution.columnar._
import org.apache.spark.sql.types.AtomicType

class TestCompressibleColumnBuilder[T <: AtomicType](
    override val columnStats: ColumnStats,
    override val columnType: NativeColumnType[T],
    override val schemes: Seq[CompressionScheme])
  extends NativeColumnBuilder(columnStats, columnType)
  with NullableColumnBuilder
  with CompressibleColumnBuilder[T] {

  override protected def isWorthCompressing(encoder: Encoder[T]) = true
}

object TestCompressibleColumnBuilder {
  def apply[T <: AtomicType](
      columnStats: ColumnStats,
      columnType: NativeColumnType[T],
      scheme: CompressionScheme): TestCompressibleColumnBuilder[T] = {

    val builder = new TestCompressibleColumnBuilder(columnStats, columnType, Seq(scheme))
    builder.initialize(0, "", useCompression = true)
    builder
  }
}
Example 22
Source File: ColumnarTestUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{AtomicType, Decimal}
import org.apache.spark.unsafe.types.UTF8String

object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericInternalRow = {
    val row = new GenericInternalRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case NULL => null
      case BOOLEAN => Random.nextBoolean()
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case LONG => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
      case BINARY => randomBytes(Random.nextInt(32))
      case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale)
      case STRUCT(_) =>
        new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10))))
      case ARRAY(_) =>
        new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt()))
      case MAP(_) =>
        ArrayBasedMapData(
          Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)))))
      case _ => throw new IllegalArgumentException(s"Unknown column type $columnType")
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_],
      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  def makeUniqueRandomValues[JvmType](
      columnType: ColumnType[JvmType],
      count: Int): Seq[JvmType] = {

    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_],
      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
    val row = new GenericInternalRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = {

    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericInternalRow(1)
      row(0) = value
      row
    }

    (values, rows)
  }
}
Example 23
Source File: CompressibleColumnBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder}
import org.apache.spark.sql.types.AtomicType
import org.apache.spark.unsafe.Platform

private[columnar] trait CompressibleColumnBuilder[T <: AtomicType]
  extends ColumnBuilder with Logging {

  this: NativeColumnBuilder[T] with WithCompressionSchemes =>

  var compressionEncoders: Seq[Encoder[T]] = _

  abstract override def initialize(
      initialSize: Int,
      columnName: String,
      useCompression: Boolean): Unit = {

    compressionEncoders =
      if (useCompression) {
        schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
      } else {
        Seq(PassThrough.encoder(columnType))
      }
    super.initialize(initialSize, columnName, useCompression)
  }

  // The various compression schemes, while saving memory use, cause all of the data within
  // the row to become unaligned, thus causing crashes. Until a way of fixing the compression
  // is found to also allow aligned accesses this must be disabled for SPARC.
  protected def isWorthCompressing(encoder: Encoder[T]) = {
    CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8
  }

  private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
    compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal))
  }

  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
    super.appendFrom(row, ordinal)
    if (!row.isNullAt(ordinal)) {
      gatherCompressibilityStats(row, ordinal)
    }
  }

  override def build(): ByteBuffer = {
    val nonNullBuffer = buildNonNulls()
    val encoder: Encoder[T] = {
      val candidate = compressionEncoders.minBy(_.compressionRatio)
      if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
    }

    // Header = null count + null positions
    val headerSize = 4 + nulls.limit()
    val compressedSize = if (encoder.compressedSize == 0) {
      nonNullBuffer.remaining()
    } else {
      encoder.compressedSize
    }

    val compressedBuffer = ByteBuffer
      // Reserves 4 bytes for compression scheme ID
      .allocate(headerSize + 4 + compressedSize)
      .order(ByteOrder.nativeOrder)
      // Write the header
      .putInt(nullCount)
      .put(nulls)

    logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
    encoder.compress(nonNullBuffer, compressedBuffer)
  }
}

private[columnar] object CompressibleColumnBuilder {
  val unaligned = Platform.unaligned()
}
Example 24
Source File: CompressibleColumnAccessor.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor}
import org.apache.spark.sql.execution.vectorized.WritableColumnVector
import org.apache.spark.sql.types.AtomicType

private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor {
  this: NativeColumnAccessor[T] =>

  private var decoder: Decoder[T] = _

  abstract override protected def initialize(): Unit = {
    super.initialize()
    decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType)
  }

  abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext

  override def extractSingle(row: InternalRow, ordinal: Int): Unit = {
    decoder.next(row, ordinal)
  }

  def decompress(columnVector: WritableColumnVector, capacity: Int): Unit =
    decoder.decompress(columnVector, capacity)
}
Example 25
Source File: TestCompressibleColumnBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.execution.columnar._
import org.apache.spark.sql.types.{AtomicType, DataType}

class TestCompressibleColumnBuilder[T <: AtomicType](
    override val columnStats: ColumnStats,
    override val columnType: NativeColumnType[T],
    override val schemes: Seq[CompressionScheme])
  extends NativeColumnBuilder(columnStats, columnType)
  with NullableColumnBuilder
  with CompressibleColumnBuilder[T] {

  override protected def isWorthCompressing(encoder: Encoder[T]) = true
}

object TestCompressibleColumnBuilder {
  def apply[T <: AtomicType](
      columnStats: ColumnStats,
      columnType: NativeColumnType[T],
      scheme: CompressionScheme): TestCompressibleColumnBuilder[T] = {

    val builder = new TestCompressibleColumnBuilder(columnStats, columnType, Seq(scheme))
    builder.initialize(0, "", useCompression = true)
    builder
  }
}

object ColumnBuilderHelper {
  def apply(
      dataType: DataType, batchSize: Int, name: String, useCompression: Boolean): ColumnBuilder = {
    ColumnBuilder(dataType, batchSize, name, useCompression)
  }
}
Example 26
Source File: ColumnarTestUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{AtomicType, Decimal}
import org.apache.spark.unsafe.types.UTF8String

object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericInternalRow = {
    val row = new GenericInternalRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case NULL => null
      case BOOLEAN => Random.nextBoolean()
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case LONG => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
      case BINARY => randomBytes(Random.nextInt(32))
      case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale)
      case STRUCT(_) =>
        new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10))))
      case ARRAY(_) =>
        new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt()))
      case MAP(_) =>
        ArrayBasedMapData(
          Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)))))
      case _ => throw new IllegalArgumentException(s"Unknown column type $columnType")
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_],
      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  def makeUniqueRandomValues[JvmType](
      columnType: ColumnType[JvmType],
      count: Int): Seq[JvmType] = {

    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_],
      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
    val row = new GenericInternalRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = {

    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericInternalRow(1)
      row(0) = value
      row
    }

    (values, rows)
  }
}
Example 27
Source File: CompressibleColumnBuilder.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder}
import org.apache.spark.sql.types.AtomicType
import org.apache.spark.unsafe.Platform

private[columnar] trait CompressibleColumnBuilder[T <: AtomicType]
  extends ColumnBuilder with Logging {

  this: NativeColumnBuilder[T] with WithCompressionSchemes =>

  var compressionEncoders: Seq[Encoder[T]] = _

  abstract override def initialize(
      initialSize: Int,
      columnName: String,
      useCompression: Boolean): Unit = {

    compressionEncoders =
      if (useCompression) {
        schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
      } else {
        Seq(PassThrough.encoder(columnType))
      }
    super.initialize(initialSize, columnName, useCompression)
  }

  // The various compression schemes, while saving memory use, cause all of the data within
  // the row to become unaligned, thus causing crashes. Until a way of fixing the compression
  // is found to also allow aligned accesses this must be disabled for SPARC.
  protected def isWorthCompressing(encoder: Encoder[T]) = {
    CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8
  }

  private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
    compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal))
  }

  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
    super.appendFrom(row, ordinal)
    if (!row.isNullAt(ordinal)) {
      gatherCompressibilityStats(row, ordinal)
    }
  }

  override def build(): ByteBuffer = {
    val nonNullBuffer = buildNonNulls()
    val encoder: Encoder[T] = {
      val candidate = compressionEncoders.minBy(_.compressionRatio)
      if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
    }

    // Header = null count + null positions
    val headerSize = 4 + nulls.limit()
    val compressedSize = if (encoder.compressedSize == 0) {
      nonNullBuffer.remaining()
    } else {
      encoder.compressedSize
    }

    val compressedBuffer = ByteBuffer
      // Reserves 4 bytes for compression scheme ID
      .allocate(headerSize + 4 + compressedSize)
      .order(ByteOrder.nativeOrder)
      // Write the header
      .putInt(nullCount)
      .put(nulls)

    logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
    encoder.compress(nonNullBuffer, compressedBuffer)
  }
}

private[columnar] object CompressibleColumnBuilder {
  val unaligned = Platform.unaligned()
}
Example 28
Source File: CompressibleColumnBuilder.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder}
import org.apache.spark.sql.types.AtomicType
import org.apache.spark.unsafe.Platform

private[columnar] trait CompressibleColumnBuilder[T <: AtomicType]
  extends ColumnBuilder with Logging {

  this: NativeColumnBuilder[T] with WithCompressionSchemes =>

  var compressionEncoders: Seq[Encoder[T]] = _

  abstract override def initialize(
      initialSize: Int,
      columnName: String,
      useCompression: Boolean): Unit = {

    compressionEncoders =
      if (useCompression) {
        schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
      } else {
        Seq(PassThrough.encoder(columnType))
      }
    super.initialize(initialSize, columnName, useCompression)
  }

  // The various compression schemes, while saving memory use, cause all of the data within
  // the row to become unaligned, thus causing crashes. Until a way of fixing the compression
  // is found to also allow aligned accesses this must be disabled for SPARC.
  protected def isWorthCompressing(encoder: Encoder[T]) = {
    CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8
  }

  private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
    compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal))
  }

  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
    super.appendFrom(row, ordinal)
    if (!row.isNullAt(ordinal)) {
      gatherCompressibilityStats(row, ordinal)
    }
  }

  override def build(): ByteBuffer = {
    val nonNullBuffer = buildNonNulls()
    val encoder: Encoder[T] = {
      val candidate = compressionEncoders.minBy(_.compressionRatio)
      if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
    }

    // Header = null count + null positions
    val headerSize = 4 + nulls.limit()
    val compressedSize = if (encoder.compressedSize == 0) {
      nonNullBuffer.remaining()
    } else {
      encoder.compressedSize
    }

    val compressedBuffer = ByteBuffer
      // Reserves 4 bytes for compression scheme ID
      .allocate(headerSize + 4 + compressedSize)
      .order(ByteOrder.nativeOrder)
      // Write the header
      .putInt(nullCount)
      .put(nulls)

    logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
    encoder.compress(nonNullBuffer, compressedBuffer)
  }
}

private[columnar] object CompressibleColumnBuilder {
  val unaligned = Platform.unaligned()
}
Example 29
Source File: TestCompressibleColumnBuilder.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.execution.columnar._
import org.apache.spark.sql.types.AtomicType

class TestCompressibleColumnBuilder[T <: AtomicType](
    override val columnStats: ColumnStats,
    override val columnType: NativeColumnType[T],
    override val schemes: Seq[CompressionScheme])
  extends NativeColumnBuilder(columnStats, columnType)
  with NullableColumnBuilder
  with CompressibleColumnBuilder[T] {

  override protected def isWorthCompressing(encoder: Encoder[T]) = true
}

object TestCompressibleColumnBuilder {
  def apply[T <: AtomicType](
      columnStats: ColumnStats,
      columnType: NativeColumnType[T],
      scheme: CompressionScheme): TestCompressibleColumnBuilder[T] = {

    val builder = new TestCompressibleColumnBuilder(columnStats, columnType, Seq(scheme))
    builder.initialize(0, "", useCompression = true)
    builder
  }
}
Example 30
Source File: ColumnarTestUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{AtomicType, Decimal}
import org.apache.spark.unsafe.types.UTF8String

object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericInternalRow = {
    val row = new GenericInternalRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case NULL => null
      case BOOLEAN => Random.nextBoolean()
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case LONG => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
      case BINARY => randomBytes(Random.nextInt(32))
      case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale)
      case STRUCT(_) =>
        new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10))))
      case ARRAY(_) =>
        new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt()))
      case MAP(_) =>
        ArrayBasedMapData(
          Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)))))
      case _ => throw new IllegalArgumentException(s"Unknown column type $columnType")
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_],
      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  def makeUniqueRandomValues[JvmType](
      columnType: ColumnType[JvmType],
      count: Int): Seq[JvmType] = {

    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_],
      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
    val row = new GenericInternalRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = {

    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericInternalRow(1)
      row(0) = value
      row
    }

    (values, rows)
  }
}
Example 31
Source File: CompressibleColumnBuilder.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder}
import org.apache.spark.sql.types.AtomicType
import org.apache.spark.unsafe.Platform

private[columnar] trait CompressibleColumnBuilder[T <: AtomicType]
  extends ColumnBuilder with Logging {

  this: NativeColumnBuilder[T] with WithCompressionSchemes =>

  var compressionEncoders: Seq[Encoder[T]] = _

  abstract override def initialize(
      initialSize: Int,
      columnName: String,
      useCompression: Boolean): Unit = {

    compressionEncoders =
      if (useCompression) {
        schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
      } else {
        Seq(PassThrough.encoder(columnType))
      }
    super.initialize(initialSize, columnName, useCompression)
  }

  // The various compression schemes, while saving memory use, cause all of the data within
  // the row to become unaligned, thus causing crashes. Until a way of fixing the compression
  // is found to also allow aligned accesses this must be disabled for SPARC.
  protected def isWorthCompressing(encoder: Encoder[T]) = {
    CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8
  }

  private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
    compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal))
  }

  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
    super.appendFrom(row, ordinal)
    if (!row.isNullAt(ordinal)) {
      gatherCompressibilityStats(row, ordinal)
    }
  }

  override def build(): ByteBuffer = {
    val nonNullBuffer = buildNonNulls()
    val encoder: Encoder[T] = {
      val candidate = compressionEncoders.minBy(_.compressionRatio)
      if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
    }

    // Header = null count + null positions
    val headerSize = 4 + nulls.limit()
    val compressedSize = if (encoder.compressedSize == 0) {
      nonNullBuffer.remaining()
    } else {
      encoder.compressedSize
    }

    val compressedBuffer = ByteBuffer
      // Reserves 4 bytes for compression scheme ID
      .allocate(headerSize + 4 + compressedSize)
      .order(ByteOrder.nativeOrder)
      // Write the header
      .putInt(nullCount)
      .put(nulls)

    logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
    encoder.compress(nonNullBuffer, compressedBuffer)
  }
}

private[columnar] object CompressibleColumnBuilder {
  val unaligned = Platform.unaligned()
}
Example 32
Source File: CompressibleColumnAccessor.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor}
import org.apache.spark.sql.types.AtomicType

private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor {
  this: NativeColumnAccessor[T] =>

  private var decoder: Decoder[T] = _

  abstract override protected def initialize(): Unit = {
    super.initialize()
    decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType)
  }

  abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext

  override def extractSingle(row: InternalRow, ordinal: Int): Unit = {
    decoder.next(row, ordinal)
  }
}
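A decode loop over this trait might look like the following sketch; accessor is assumed to be a concrete NativeColumnAccessor with CompressibleColumnAccessor mixed in, already initialized over a compressed buffer:

import org.apache.spark.sql.catalyst.expressions.GenericInternalRow

val row = new GenericInternalRow(1)
while (accessor.hasNext) {
  // Delegates to decoder.next, which writes the next decompressed value into ordinal 0.
  accessor.extractSingle(row, 0)
  // ... consume the value now stored at ordinal 0 of `row` ...
}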
Example 33
Source File: TestCompressibleColumnBuilder.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.execution.columnar._
import org.apache.spark.sql.types.AtomicType

class TestCompressibleColumnBuilder[T <: AtomicType](
    override val columnStats: ColumnStats,
    override val columnType: NativeColumnType[T],
    override val schemes: Seq[CompressionScheme])
  extends NativeColumnBuilder(columnStats, columnType)
  with NullableColumnBuilder
  with CompressibleColumnBuilder[T] {

  override protected def isWorthCompressing(encoder: Encoder[T]) = true
}

object TestCompressibleColumnBuilder {
  def apply[T <: AtomicType](
      columnStats: ColumnStats,
      columnType: NativeColumnType[T],
      scheme: CompressionScheme): TestCompressibleColumnBuilder[T] = {

    val builder = new TestCompressibleColumnBuilder(columnStats, columnType, Seq(scheme))
    builder.initialize(0, "", useCompression = true)
    builder
  }
}
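Putting the test builder to work could look like this hedged sketch (it assumes the calling code lives in the same columnar.compression package, so the package-private IntColumnStats and INT members are visible):

import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.execution.columnar.{INT, IntColumnStats}

// Force run-length encoding on an Int column regardless of compression ratio.
val builder = TestCompressibleColumnBuilder(new IntColumnStats, INT, RunLengthEncoding)

val row = new GenericInternalRow(1)
Seq(1, 1, 1, 2, 2).foreach { v =>
  row(0) = v                 // GenericInternalRow.update
  builder.appendFrom(row, 0)
}
val compressed = builder.build()  // header + scheme ID + RLE payload

Because isWorthCompressing always returns true here, the chosen scheme is exercised even on data where it would not pay off.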
Example 34
Source File: ColumnarTestUtils.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{AtomicType, Decimal}
import org.apache.spark.unsafe.types.UTF8String

object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericInternalRow = {
    val row = new GenericInternalRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case NULL => null
      case BOOLEAN => Random.nextBoolean()
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case LONG => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
      case BINARY => randomBytes(Random.nextInt(32))
      case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale)
      case STRUCT(_) =>
        new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10))))
      case ARRAY(_) =>
        new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt()))
      case MAP(_) =>
        ArrayBasedMapData(
          Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)))))
      case _ => throw new IllegalArgumentException(s"Unknown column type $columnType")
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_],
      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  def makeUniqueRandomValues[JvmType](
      columnType: ColumnType[JvmType],
      count: Int): Seq[JvmType] = {
    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_],
      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
    val row = new GenericInternalRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = {
    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericInternalRow(1)
      row(0) = value
      row
    }
    (values, rows)
  }
}
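For compression round-trip tests, the last helper above is the usual entry point; a hypothetical call for a string column (STRING assumed from the columnar package):

import org.apache.spark.sql.execution.columnar.{ColumnarTestUtils, STRING}

// Ten distinct UTF8String values, each wrapped in its own single-column row.
val (values, rows) =
  ColumnarTestUtils.makeUniqueValuesAndSingleValueRows(STRING, count = 10)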
Example 35
Source File: CompressibleColumnBuilder.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.columnar.{ColumnBuilder, NativeColumnBuilder}
import org.apache.spark.sql.types.AtomicType

private[sql] trait CompressibleColumnBuilder[T <: AtomicType]
  extends ColumnBuilder with Logging {

  this: NativeColumnBuilder[T] with WithCompressionSchemes =>

  var compressionEncoders: Seq[Encoder[T]] = _

  abstract override def initialize(
      initialSize: Int,
      columnName: String,
      useCompression: Boolean): Unit = {

    compressionEncoders =
      if (useCompression) {
        schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
      } else {
        Seq(PassThrough.encoder(columnType))
      }
    super.initialize(initialSize, columnName, useCompression)
  }

  protected def isWorthCompressing(encoder: Encoder[T]) = {
    encoder.compressionRatio < 0.8
  }

  private def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {
    var i = 0
    while (i < compressionEncoders.length) {
      compressionEncoders(i).gatherCompressibilityStats(row, ordinal)
      i += 1
    }
  }

  abstract override def appendFrom(row: Row, ordinal: Int): Unit = {
    super.appendFrom(row, ordinal)
    if (!row.isNullAt(ordinal)) {
      gatherCompressibilityStats(row, ordinal)
    }
  }

  override def build(): ByteBuffer = {
    val nonNullBuffer = buildNonNulls()
    val typeId = nonNullBuffer.getInt()
    val encoder: Encoder[T] = {
      val candidate = compressionEncoders.minBy(_.compressionRatio)
      if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
    }

    // Header = column type ID + null count + null positions
    val headerSize = 4 + 4 + nulls.limit()
    val compressedSize = if (encoder.compressedSize == 0) {
      nonNullBuffer.remaining()
    } else {
      encoder.compressedSize
    }

    val compressedBuffer = ByteBuffer
      // Reserves 4 bytes for compression scheme ID
      .allocate(headerSize + 4 + compressedSize)
      .order(ByteOrder.nativeOrder)
      // Write the header
      .putInt(typeId)
      .putInt(nullCount)
      .put(nulls)

    logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
    encoder.compress(nonNullBuffer, compressedBuffer)
  }
}
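Unlike the newer builder in Example 31, this older variant also writes the column type ID ahead of the null-count header. A hedged reader sketch of that layout, again assuming builder is a concrete NativeColumnBuilder using this trait:

import java.nio.ByteOrder

// [column type ID: Int][null count: Int][null positions: Int * count][scheme ID: Int][payload]
val buf = builder.build().duplicate().order(ByteOrder.nativeOrder)
buf.rewind()
val columnTypeId = buf.getInt()
val nullCount    = buf.getInt()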