java.nio.channels.Channels Scala Examples
The following examples show how to use java.nio.channels.Channels.
Each example lists its original project, source file, and license above the code.
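Before the examples, here is a minimal sketch of what java.nio.channels.Channels provides: static factory methods that bridge classic java.io streams and NIO channels in both directions (newChannel, newInputStream, newOutputStream). The snippet below is illustrative only, the file names are placeholders, and it shows the same stream-to-channel wrapping pattern that several of the examples rely on.

import java.io.{FileInputStream, FileOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

object ChannelsSketch {
  def main(args: Array[String]): Unit = {
    // Wrap ordinary java.io streams as NIO channels (file names are placeholders).
    val in = Channels.newChannel(new FileInputStream("input.bin"))
    val out = Channels.newChannel(new FileOutputStream("output.bin"))
    try {
      val buf = ByteBuffer.allocate(8192)
      // read() returns -1 at end of stream; flip/clear cycle the buffer between reads and writes.
      while (in.read(buf) != -1) {
        buf.flip()
        while (buf.hasRemaining) out.write(buf)
        buf.clear()
      }
    } finally {
      in.close()
      out.close()
    }
  }
}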
Example 1
Source File: SerializableBuffer.scala From SparkCore with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 2
Source File: ProtobufUtilTest.scala From scio with Apache License 2.0
package com.spotify.scio.util

import java.io.File
import java.nio.channels.Channels
import java.nio.file.Files

import com.spotify.scio.ScioContext
import com.spotify.scio.avro._
import com.spotify.scio.coders.Coder
import com.spotify.scio.proto.Track.TrackPB
import org.apache.avro.file.DataFileStream
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.beam.sdk.io.{FileSystems, LocalResources}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

import scala.jdk.CollectionConverters._

class ProtobufUtilTest extends AnyFlatSpec with Matchers {

  "ProtobufUtil" should "convert Message -> GenericRecords that can be written and read" in {
    val sc = ScioContext()

    val dir = Files.createTempDirectory("protobuf-util-")
    val (path1, path2) = (new File(s"$dir/1"), new File(s"$dir/2"))
    path1.deleteOnExit()
    path2.deleteOnExit()
    dir.toFile.deleteOnExit()

    implicit val grCoder: Coder[GenericRecord] = ProtobufUtil.AvroMessageCoder

    val messages = sc
      .parallelize(1 to 10)
      .map(i => TrackPB.newBuilder().setTrackId(i.toString).build())

    messages
      .map(ProtobufUtil.toAvro[TrackPB])
      .saveAsAvroFile(
        path1.getPath,
        suffix = ".protobuf",
        metadata = ProtobufUtil.schemaMetadataOf[TrackPB],
        schema = ProtobufUtil.AvroMessageSchema,
        numShards = 1
      )

    val protoWriteTap = messages.saveAsProtobufFile(path2.getPath, numShards = 1)

    val result = sc.run().waitUntilDone()

    val (tapFromAvroWrite, tapFromProtoWrite) = (
      ObjectFileTap[TrackPB](ScioUtil.addPartSuffix(path1.getPath)),
      protoWriteTap.get(result)
    )

    tapFromAvroWrite.value.toList should contain theSameElementsAs tapFromProtoWrite.value.toList
    getMetadata(path1) should contain theSameElementsAs getMetadata(path2)
  }

  private def getMetadata(dir: File): Map[String, AnyRef] = {
    val files = dir.listFiles()
    if (files.length != 1) {
      fail(s"Directory $dir should contain 1 Avro file. Instead, found ${files.toList}")
    }

    val dfs = new DataFileStream[GenericRecord](
      Channels.newInputStream(FileSystems.open(LocalResources.fromFile(files(0), false))),
      new GenericDatumReader[GenericRecord]
    )

    dfs.getMetaKeys.asScala.map(k => (k, dfs.getMetaString(k))).toMap
  }
}
Example 3
Source File: TFRecordCodec.scala From scio with Apache License 2.0
package com.spotify.scio.tensorflow

import java.io.{InputStream, PushbackInputStream}
import java.nio.channels.Channels
import java.nio.{ByteBuffer, ByteOrder}
import java.util.zip.GZIPInputStream

import org.apache.beam.sdk.io.Compression
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.hash.Hashing
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.primitives.Ints
import org.apache.commons.compress.compressors.deflate._
import org.apache.commons.compress.compressors.gzip._

private object TFRecordCodec {
  private val headerLength: Int =
    (java.lang.Long.SIZE + java.lang.Integer.SIZE) / java.lang.Byte.SIZE
  private val footerLength: Int = java.lang.Integer.SIZE / java.lang.Byte.SIZE
  private val crc32c = Hashing.crc32c()

  private def mask(crc: Int): Int = ((crc >>> 15) | (crc << 17)) + 0xa282ead8

  def read(input: InputStream): Array[Byte] = {
    val headerBytes = readFully(input, headerLength)
    if (headerBytes != null) {
      val headerBuf = ByteBuffer.wrap(headerBytes).order(ByteOrder.LITTLE_ENDIAN)
      val length = headerBuf.getLong
      val maskedCrc32OfLength = headerBuf.getInt
      require(hashLong(length) == maskedCrc32OfLength, "Invalid masked CRC32 of length")

      val data = readFully(input, length.toInt)

      val footerBytes = readFully(input, footerLength)
      val footerBuf = ByteBuffer.wrap(footerBytes).order(ByteOrder.LITTLE_ENDIAN)
      val maskedCrc32OfData = footerBuf.getInt
      require(hashBytes(data) == maskedCrc32OfData, "Invalid masked CRC32 of data")
      data
    } else {
      null
    }
  }

  // InflaterInputStream#read may not fill a buffer fully even when there are more data available
  private def readFully(input: InputStream, length: Int): Array[Byte] = {
    val data = Array.ofDim[Byte](length)
    var n = 0
    var off = 0
    do {
      n = input.read(data, off, data.length - off)
      if (n > 0) {
        off += n
      }
    } while (n > 0 && off < data.length)
    if (n <= 0) null else data
  }

  def wrapInputStream(stream: InputStream, compression: Compression): InputStream = {
    val deflateParam = new DeflateParameters()
    deflateParam.setWithZlibHeader(true)

    compression match {
      case Compression.AUTO =>
        val pushback = new PushbackInputStream(stream, 2)
        if (isInflaterInputStream(pushback)) {
          new DeflateCompressorInputStream(pushback, deflateParam)
        } else if (isGzipInputStream(pushback)) {
          new GzipCompressorInputStream(pushback)
        } else {
          pushback
        }
      case Compression.UNCOMPRESSED => stream
      case _ =>
        Channels.newInputStream(compression.readDecompressed(Channels.newChannel(stream)))
    }
  }

  private def hashLong(x: Long): Int = mask(crc32c.hashLong(x).asInt())
  private def hashBytes(x: Array[Byte]): Int = mask(crc32c.hashBytes(x).asInt())

  private def isGzipInputStream(pushback: PushbackInputStream): Boolean = {
    val b1 = pushback.read()
    val b2 = pushback.read()
    if (b2 != -1) pushback.unread(b2)
    if (b1 != -1) pushback.unread(b1)
    val zero: Byte = 0x00
    val header = Ints.fromBytes(zero, zero, b2.toByte, b1.toByte)
    (b1 != -1 && b2 != -1) && header == GZIPInputStream.GZIP_MAGIC
  }

  private def isInflaterInputStream(pushback: PushbackInputStream): Boolean = {
    val b1 = pushback.read()
    val b2 = pushback.read()
    if (b2 != -1) pushback.unread(b2)
    if (b1 != -1) pushback.unread(b1)
    (b1 != -1 && b2 != -1) && (b1 == 0x78 && (b1 * 256 + b2) % 31 == 0)
  }
}
Example 4
Source File: AttachmentService.scala From BacklogMigration-Redmine with MIT License
package com.nulabinc.backlog.r2b.exporter.service

import java.io.{File, FileOutputStream}
import java.net.{HttpURLConnection, URL}
import java.nio.channels.Channels

import com.nulabinc.backlog.migration.common.utils.ControlUtil.using
import com.nulabinc.backlog.migration.common.utils.Logging

object AttachmentService extends Logging {

  private val MAX_REDIRECT_COUNT = 10

  def download(url: URL, file: File): Unit = {
    val redirected = followRedirect(url)
    doDownload(redirected, file)
  }

  private def doDownload(url: URL, file: File): Unit =
    try {
      val rbc = Channels.newChannel(url.openStream())
      val fos = new FileOutputStream(file)
      fos.getChannel.transferFrom(rbc, 0, java.lang.Long.MAX_VALUE)
      rbc.close()
      fos.close()
    } catch {
      case e: Throwable => logger.warn("Download attachment failed: " + e.getMessage)
    }

  private def followRedirect(url: URL, count: Int = 0): URL =
    url.openConnection match {
      case http: HttpURLConnection =>
        http.setRequestMethod("GET")
        http.connect()
        using(http) { connection =>
          connection.getResponseCode match {
            case 301 | 302 | 303 =>
              val newUrl = new URL(connection.getHeaderField("Location"))
              if (count < MAX_REDIRECT_COUNT) followRedirect(newUrl, count + 1) else newUrl
            case _ =>
              url
          }
        }
      case _ => url
    }
}
Example 5
Source File: SerializableBuffer.scala From BigDatalog with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 6
Source File: SerializableBuffer.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 7
Source File: SerializableBuffer.scala From spark1.52 with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    // A buffer must exist before we can read or write; ByteBuffer's static allocate() method allocates one.
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 8
Source File: SerializableBuffer.scala From iolap with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 9
Source File: SerializableBuffer.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 10
Source File: SerializableBuffer.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 11
Source File: SerializableBuffer.scala From sparkoscope with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 12
Source File: PythonSQLUtils.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.api.python

import java.io.InputStream
import java.nio.channels.Channels

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.python.PythonRDDServer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.execution.arrow.ArrowConverters
import org.apache.spark.sql.types.DataType

private[sql] object PythonSQLUtils {
  def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText)

  // This is needed when generating SQL documentation for built-in functions.
  def listBuiltinFunctionInfos(): Array[ExpressionInfo] = {
    FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray
  }

  private[sql] class ArrowRDDServer(sqlContext: SQLContext) extends PythonRDDServer {

    override protected def streamToRDD(input: InputStream): RDD[Array[Byte]] = {
      // Create array to consume iterator so that we can safely close the inputStream
      val batches = ArrowConverters.getBatchesFromStream(Channels.newChannel(input)).toArray
      // Parallelize the record batches to create an RDD
      JavaRDD.fromRDD(sqlContext.sparkContext.parallelize(batches, batches.length))
    }
  }
}
Example 13
Source File: CreateJacksonParser.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.catalyst.json

import java.io.{ByteArrayInputStream, InputStream, InputStreamReader}
import java.nio.channels.Channels
import java.nio.charset.Charset

import com.fasterxml.jackson.core.{JsonFactory, JsonParser}
import org.apache.hadoop.io.Text
import sun.nio.cs.StreamDecoder

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.unsafe.types.UTF8String

private[sql] object CreateJacksonParser extends Serializable {
  def string(jsonFactory: JsonFactory, record: String): JsonParser = {
    jsonFactory.createParser(record)
  }

  def utf8String(jsonFactory: JsonFactory, record: UTF8String): JsonParser = {
    val bb = record.getByteBuffer
    assert(bb.hasArray)

    val bain = new ByteArrayInputStream(
      bb.array(), bb.arrayOffset() + bb.position(), bb.remaining())

    jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))
  }

  def text(jsonFactory: JsonFactory, record: Text): JsonParser = {
    jsonFactory.createParser(record.getBytes, 0, record.getLength)
  }

  // Jackson parsers can be ranked according to their performance:
  // 1. Array based with actual encoding UTF-8 in the array. This is the fastest parser
  //    but it doesn't allow to set encoding explicitly. Actual encoding is detected automatically
  //    by checking leading bytes of the array.
  // 2. InputStream based with actual encoding UTF-8 in the stream. Encoding is detected
  //    automatically by analyzing first bytes of the input stream.
  // 3. Reader based parser. This is the slowest parser used here but it allows to create
  //    a reader with specific encoding.
  // The method creates a reader for an array with given encoding and sets size of internal
  // decoding buffer according to size of input array.
  private def getStreamDecoder(enc: String, in: Array[Byte], length: Int): StreamDecoder = {
    val bais = new ByteArrayInputStream(in, 0, length)
    val byteChannel = Channels.newChannel(bais)
    val decodingBufferSize = Math.min(length, 8192)
    val decoder = Charset.forName(enc).newDecoder()

    StreamDecoder.forDecoder(byteChannel, decoder, decodingBufferSize)
  }

  def text(enc: String, jsonFactory: JsonFactory, record: Text): JsonParser = {
    val sd = getStreamDecoder(enc, record.getBytes, record.getLength)
    jsonFactory.createParser(sd)
  }

  def inputStream(jsonFactory: JsonFactory, is: InputStream): JsonParser = {
    jsonFactory.createParser(is)
  }

  def inputStream(enc: String, jsonFactory: JsonFactory, is: InputStream): JsonParser = {
    jsonFactory.createParser(new InputStreamReader(is, enc))
  }

  def internalRow(jsonFactory: JsonFactory, row: InternalRow): JsonParser = {
    val ba = row.getBinary(0)
    jsonFactory.createParser(ba, 0, ba.length)
  }

  def internalRow(enc: String, jsonFactory: JsonFactory, row: InternalRow): JsonParser = {
    val binary = row.getBinary(0)
    val sd = getStreamDecoder(enc, binary, binary.length)
    jsonFactory.createParser(sd)
  }
}
Example 14
Source File: GcsStore.scala From fs2-blobstore with Apache License 2.0
package blobstore.gcs

import java.nio.channels.Channels
import java.time.Instant
import java.util.Date

import blobstore.{Path, Store}
import cats.effect.{Blocker, ContextShift, Sync}
import com.google.api.gax.paging.Page
import com.google.cloud.storage.{Acl, Blob, BlobId, BlobInfo, Storage}
import com.google.cloud.storage.Storage.{BlobListOption, CopyRequest}
import fs2.{Chunk, Pipe, Stream}

import scala.jdk.CollectionConverters._

final class GcsStore[F[_]](storage: Storage, blocker: Blocker, acls: List[Acl] = Nil)(
  implicit F: Sync[F], CS: ContextShift[F]
) extends Store[F] {

  private def _chunk(pg: Page[Blob]): Chunk[Path] = {
    val (dirs, files) = pg.getValues.asScala.toSeq.partition(_.isDirectory)
    val dirPaths = Chunk.seq(dirs.map(b =>
      Path(root = b.getBucket, key = b.getName.stripSuffix("/"), size = None, isDir = true, lastModified = None)
    ))
    val filePaths = Chunk.seq(files.map { b =>
      val size = Option(b.getSize: java.lang.Long).map(_.toLong) // Prevent throwing NPE (see https://github.com/scala/bug/issues/9634)
      val lastModified = Option(b.getUpdateTime: java.lang.Long).map(millis => Date.from(Instant.ofEpochMilli(millis))) // Prevent throwing NPE (see https://github.com/scala/bug/issues/9634)
      Path(b.getBucket, key = b.getName, size = size, isDir = false, lastModified = lastModified)
    })
    Chunk.concat(List(dirPaths, filePaths))
  }

  def list(path: Path): fs2.Stream[F, Path] = {
    Stream.unfoldChunkEval[F, () => Option[Page[Blob]], Path] { () =>
      Some(storage.list(path.root, BlobListOption.currentDirectory(), BlobListOption.prefix(path.key)))
    } { getPage =>
      blocker.delay {
        getPage().map { pg =>
          if (pg.hasNextPage) {
            (_chunk(pg), () => Some(pg.getNextPage))
          } else {
            (_chunk(pg), () => None)
          }
        }
      }
    }
  }

  def get(path: Path, chunkSize: Int): fs2.Stream[F, Byte] = {
    val is = blocker.delay(Channels.newInputStream(storage.get(path.root, path.key).reader()))
    fs2.io.readInputStream(is, chunkSize, blocker, closeAfterUse = true)
  }

  def put(path: Path): Pipe[F, Byte, Unit] = {
    val fos = Sync[F].delay {
      val builder = {
        val b = BlobInfo.newBuilder(path.root, path.key)
        if (acls.nonEmpty) b.setAcl(acls.asJava) else b
      }
      val blobInfo = builder.build()
      val writer = storage.writer(blobInfo)
      Channels.newOutputStream(writer)
    }
    fs2.io.writeOutputStream(fos, blocker, closeAfterUse = true)
  }

  def move(src: Path, dst: Path): F[Unit] = F.productR(copy(src, dst))(remove(src))

  def copy(src: Path, dst: Path): F[Unit] = {
    val req = CopyRequest.newBuilder().setSource(src.root, src.key).setTarget(BlobId.of(dst.root, dst.key)).build()
    F.void(blocker.delay(storage.copy(req).getResult))
  }

  def remove(path: Path): F[Unit] = F.void(blocker.delay(storage.delete(path.root, path.key)))
}

object GcsStore {
  def apply[F[_]](
    storage: Storage,
    blocker: Blocker,
    acls: List[Acl]
  )(implicit F: Sync[F], CS: ContextShift[F]): GcsStore[F] =
    new GcsStore(storage, blocker, acls)
}
Example 15
Source File: ConcatArrowAndExplodeSpec.scala From flint with Apache License 2.0
package com.twosigma.flint.timeseries

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels
import java.util.concurrent.TimeUnit

import com.twosigma.flint.arrow.ArrowUtils
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.arrow.vector.{ BigIntVector, Float8Vector, VectorSchemaRoot }
import org.apache.spark.sql.functions.{ array, col, lit, struct }
import org.apache.spark.sql.types._

class ConcatArrowAndExplodeSpec extends TimeSeriesSuite {

  "ConcatArrowAndExplode" should "work" in {
    val batchSize = 10
    var df = spark.range(1000, 2000, 1000).toDF("time")
    val columns = (0 until batchSize).map(v => struct((df("time") + v).as("time"), lit(v.toDouble).as("v")))
    df = df.withColumn("base_rows", array(columns: _*))

    val allocator = new RootAllocator(Long.MaxValue)

    val schema1 = StructType(Seq(StructField("v1", DoubleType)))
    val root1 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema1), allocator)
    val vector1 = root1.getVector("v1").asInstanceOf[Float8Vector]
    vector1.allocateNew()
    for (i <- 0 until batchSize) {
      vector1.set(i, i + 10.0)
    }
    vector1.setValueCount(batchSize)

    val out1 = new ByteArrayOutputStream()
    val arrowWriter1 = new ArrowFileWriter(root1, null, Channels.newChannel(out1))
    arrowWriter1.writeBatch()
    arrowWriter1.close()
    root1.close()

    df = df.withColumn("f1_schema", struct(lit(0.0).as("v1")))
    df = df.withColumn("f1_data", lit(out1.toByteArray))

    val schema2 = StructType(Seq(StructField("v2", DoubleType), StructField("v3", LongType)))
    val root2 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema2), allocator)
    val vector2 = root2.getVector("v2").asInstanceOf[Float8Vector]
    val vector3 = root2.getVector("v3").asInstanceOf[BigIntVector]
    vector2.allocateNew()
    vector3.allocateNew()
    for (i <- 0 until batchSize) {
      vector2.set(i, i + 20.0)
    }
    vector2.setValueCount(batchSize)
    for (i <- 0 until batchSize) {
      vector3.set(i, i + 30L)
    }
    vector3.setValueCount(batchSize)

    val out2 = new ByteArrayOutputStream()
    val arrowWriter2 = new ArrowFileWriter(root2, null, Channels.newChannel(out2))
    arrowWriter2.writeBatch()
    arrowWriter2.close()
    root2.close()

    df = df.withColumn("f2_schema", struct(lit(0.0).as("v2"), lit(0L).as("v3")))
    df = df.withColumn("f2_data", lit(out2.toByteArray))

    var tsrdd = TimeSeriesRDD.fromDF(df)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    tsrdd = tsrdd.concatArrowAndExplode("base_rows", Seq("f1_schema", "f2_schema"), Seq("f1_data", "f2_data"))
    tsrdd.toDF.show()

    var expected = spark.range(1000, 1000 + batchSize).toDF("time")
    expected = expected.withColumn("v", col("time") - 1000.0)
    expected = expected.withColumn("v1", col("time") - 1000 + 10.0)
    expected = expected.withColumn("v2", col("time") - 1000 + 20.0)
    expected = expected.withColumn("v3", col("time") - 1000 + 30)

    val expectedTsrdd = TimeSeriesRDD.fromDF(expected)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    assertEquals(tsrdd, expectedTsrdd)
  }
}
Example 16
Source File: ArrowSummarizer.scala From flint with Apache License 2.0
package com.twosigma.flint.rdd.function.summarize.summarizer

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels
import java.util

import com.twosigma.flint.arrow.{ ArrowFieldWriter, ArrowPayload, ArrowUtils, ArrowWriter }
import org.apache.arrow.memory.{ BufferAllocator, RootAllocator }
import org.apache.arrow.vector.VectorSchemaRoot
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.sql.types.StructType

import scala.collection.JavaConverters._

case class ArrowSummarizer(inputSchema: StructType, outputSchema: StructType, includeBaseRows: Boolean)
  extends Summarizer[InternalRow, ArrowSummarizerState, ArrowSummarizerResult] {

  private[this] val size = outputSchema.size
  require(size > 0, "Cannot create summarizer with no input columns")

  // This function will allocate memory from the BufferAllocator to initialize arrow vectors.
  override def zero(): ArrowSummarizerState = {
    new ArrowSummarizerState(false, null, null, null, null)
  }

  private def init(u: ArrowSummarizerState): Unit = {
    if (!u.initialized) {
      val arrowSchema = ArrowUtils.toArrowSchema(outputSchema)
      val allocator = new RootAllocator(Int.MaxValue)
      val root = VectorSchemaRoot.create(arrowSchema, allocator)
      val arrowWriter = ArrowWriter.create(inputSchema, outputSchema, root)

      u.initialized = true
      u.baseRows = new util.ArrayList[InternalRow]()
      u.allocator = allocator
      u.root = root
      u.arrowWriter = arrowWriter
    }
  }

  override def add(u: ArrowSummarizerState, row: InternalRow): ArrowSummarizerState = {
    if (!u.initialized) {
      init(u)
    }

    if (includeBaseRows) {
      u.baseRows.add(row)
    }
    u.arrowWriter.write(row)
    u
  }

  override def merge(
    u1: ArrowSummarizerState,
    u2: ArrowSummarizerState
  ): ArrowSummarizerState = throw new UnsupportedOperationException()

  // This can only be called once
  override def render(u: ArrowSummarizerState): ArrowSummarizerResult = {
    if (u.initialized) {
      val out = new ByteArrayOutputStream()
      val writer = new ArrowFileWriter(u.root, null, Channels.newChannel(out))

      u.arrowWriter.finish()
      writer.writeBatch()

      writer.close()
      u.root.close()
      u.allocator.close()

      val rows = u.baseRows.toArray.asInstanceOf[Array[Any]]
      ArrowSummarizerResult(rows, out.toByteArray)
    } else {
      ArrowSummarizerResult(Array.empty, Array.empty)
    }
  }

  override def close(u: ArrowSummarizerState): Unit = {
    if (u.initialized) {
      u.arrowWriter.reset()
      u.root.close()
      u.allocator.close()
    }
  }
}
Example 17
Source File: ArrowConverters.scala From flint with Apache License 2.0
package com.twosigma.flint.arrow

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels

import org.apache.arrow.memory.BufferAllocator
import org.apache.arrow.vector._
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel
import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.types._
import com.twosigma.flint.util.Utils
import org.apache.arrow.vector.ipc.{ ArrowFileReader, ArrowFileWriter }
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch

trait ClosableIterator[T] extends Iterator[T] with AutoCloseable

class ConcatClosableIterator[T](iters: Iterator[ClosableIterator[T]])
  extends ClosableIterator[T] {
  var curIter: ClosableIterator[T] = _

  private def advance(): Unit = {
    require(curIter == null || !curIter.hasNext, "Should not advance if curIter is not empty")
    require(iters.hasNext, "Should not advance if iters doesn't have next")
    closeCurrent()
    curIter = iters.next()
  }

  private def closeCurrent(): Unit = if (curIter != null) curIter.close()

  override def close(): Unit = closeCurrent()

  override def hasNext: Boolean = {
    if (curIter == null || !curIter.hasNext) {
      if (iters.hasNext) {
        advance()
        hasNext
      } else {
        false
      }
    } else {
      true
    }
  }

  override def next(): T = curIter.next()
}

object ArrowConverters {

  def byteArrayToBatch(
    batchBytes: Array[Byte],
    allocator: BufferAllocator
  ): ArrowRecordBatch = {
    val in = new ByteArrayReadableSeekableByteChannel(batchBytes)
    val reader = new ArrowFileReader(in, allocator)

    // Read a batch from a byte stream, ensure the reader is closed
    Utils.tryWithSafeFinally {
      val root = reader.getVectorSchemaRoot // throws IOException
      val unloader = new VectorUnloader(root)
      reader.loadNextBatch() // throws IOException
      unloader.getRecordBatch
    } {
      reader.close()
    }
  }
}
Example 18
Source File: FileManager.scala From slide-desktop with GNU General Public License v2.0
package slide

import java.io.{File, FileOutputStream}
import java.net.{URL, URLConnection}
import java.nio.channels.{Channels, ReadableByteChannel}

class FileManager {

  var currentFile: String = ""
  var numberOfDownloads: Int = 0

  def downloadFile(dlsite: String, path: String): Unit = {
    val url: URL = new URL(dlsite)
    val file: File = new File(path)

    if (isConnected(url)) {
      currentFile = path
      onDownloadStart()

      new Thread(new Runnable {
        override def run(): Unit = {
          try {
            val rbc: ReadableByteChannel = Channels.newChannel(url.openStream())
            val fos: FileOutputStream = new FileOutputStream(file)

            fos.getChannel.transferFrom(rbc, 0, java.lang.Long.MAX_VALUE)
            fos.close()

            numberOfDownloads += 1
            onDownloadFinished()
          } catch {
            case e: Exception =>
              println("Error: Could not download ADB, please run as Administrator")
          }
        }
      }).start()
    }
  }

  def isConnected(site: URL): Boolean = {
    try {
      // test connection
      val conn: URLConnection = site.openConnection()
      conn.setConnectTimeout(5000)
      conn.getContent
      true
    } catch {
      case e: Exception => false
    }
  }

  def onDownloadStart(): Unit = {}

  def onDownloadFinished(): Unit = {}

  // var onDownloadStart: () => Unit = null
  // var onDownloadFinished: () => Unit = null
}