java.nio.channels.Channels Scala Examples
The following examples show how to use java.nio.channels.Channels.
Each example lists its original project, source file, and license above the code.
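Before the examples, here is a minimal sketch of what java.nio.channels.Channels provides: static factory methods that bridge classic java.io streams and NIO channels in both directions (newChannel, newInputStream, newOutputStream). The snippet below is illustrative only, the file names are placeholders, and it shows the same stream-to-channel wrapping pattern that several of the examples rely on.

import java.io.{FileInputStream, FileOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

object ChannelsSketch {
  def main(args: Array[String]): Unit = {
    // Wrap ordinary java.io streams as NIO channels (file names are placeholders).
    val in = Channels.newChannel(new FileInputStream("input.bin"))
    val out = Channels.newChannel(new FileOutputStream("output.bin"))
    try {
      val buf = ByteBuffer.allocate(8192)
      // read() returns -1 at end of stream; flip/clear cycle the buffer between reads and writes.
      while (in.read(buf) != -1) {
        buf.flip()
        while (buf.hasRemaining) out.write(buf)
        buf.clear()
      }
    } finally {
      in.close()
      out.close()
    }
  }
}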
Example 1
Source File: SerializableBuffer.scala From SparkCore with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 2
Source File: ProtobufUtilTest.scala From scio with Apache License 2.0
package com.spotify.scio.util

import java.io.File
import java.nio.channels.Channels
import java.nio.file.Files

import com.spotify.scio.ScioContext
import com.spotify.scio.avro._
import com.spotify.scio.coders.Coder
import com.spotify.scio.proto.Track.TrackPB
import org.apache.avro.file.DataFileStream
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.beam.sdk.io.{FileSystems, LocalResources}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

import scala.jdk.CollectionConverters._

class ProtobufUtilTest extends AnyFlatSpec with Matchers {

  "ProtobufUtil" should "convert Message -> GenericRecords that can be written and read" in {
    val sc = ScioContext()

    val dir = Files.createTempDirectory("protobuf-util-")
    val (path1, path2) = (new File(s"$dir/1"), new File(s"$dir/2"))
    path1.deleteOnExit()
    path2.deleteOnExit()
    dir.toFile.deleteOnExit()

    implicit val grCoder: Coder[GenericRecord] = ProtobufUtil.AvroMessageCoder

    val messages = sc
      .parallelize(1 to 10)
      .map(i => TrackPB.newBuilder().setTrackId(i.toString).build())

    messages
      .map(ProtobufUtil.toAvro[TrackPB])
      .saveAsAvroFile(
        path1.getPath,
        suffix = ".protobuf",
        metadata = ProtobufUtil.schemaMetadataOf[TrackPB],
        schema = ProtobufUtil.AvroMessageSchema,
        numShards = 1
      )

    val protoWriteTap = messages.saveAsProtobufFile(path2.getPath, numShards = 1)

    val result = sc.run().waitUntilDone()

    val (tapFromAvroWrite, tapFromProtoWrite) = (
      ObjectFileTap[TrackPB](ScioUtil.addPartSuffix(path1.getPath)),
      protoWriteTap.get(result)
    )

    tapFromAvroWrite.value.toList should contain theSameElementsAs tapFromProtoWrite.value.toList
    getMetadata(path1) should contain theSameElementsAs getMetadata(path2)
  }

  private def getMetadata(dir: File): Map[String, AnyRef] = {
    val files = dir.listFiles()
    if (files.length != 1) {
      fail(s"Directory $dir should contain 1 Avro file. Instead, found ${files.toList}")
    }

    val dfs = new DataFileStream[GenericRecord](
      Channels.newInputStream(FileSystems.open(LocalResources.fromFile(files(0), false))),
      new GenericDatumReader[GenericRecord]
    )

    dfs.getMetaKeys.asScala.map(k => (k, dfs.getMetaString(k))).toMap
  }
}
Example 3
Source File: TFRecordCodec.scala From scio with Apache License 2.0
package com.spotify.scio.tensorflow

import java.io.{InputStream, PushbackInputStream}
import java.nio.channels.Channels
import java.nio.{ByteBuffer, ByteOrder}
import java.util.zip.GZIPInputStream

import org.apache.beam.sdk.io.Compression
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.hash.Hashing
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.primitives.Ints
import org.apache.commons.compress.compressors.deflate._
import org.apache.commons.compress.compressors.gzip._

private object TFRecordCodec {
  private val headerLength: Int =
    (java.lang.Long.SIZE + java.lang.Integer.SIZE) / java.lang.Byte.SIZE
  private val footerLength: Int = java.lang.Integer.SIZE / java.lang.Byte.SIZE
  private val crc32c = Hashing.crc32c()

  private def mask(crc: Int): Int = ((crc >>> 15) | (crc << 17)) + 0xa282ead8

  def read(input: InputStream): Array[Byte] = {
    val headerBytes = readFully(input, headerLength)
    if (headerBytes != null) {
      val headerBuf = ByteBuffer.wrap(headerBytes).order(ByteOrder.LITTLE_ENDIAN)
      val length = headerBuf.getLong
      val maskedCrc32OfLength = headerBuf.getInt
      require(hashLong(length) == maskedCrc32OfLength, "Invalid masked CRC32 of length")

      val data = readFully(input, length.toInt)

      val footerBytes = readFully(input, footerLength)
      val footerBuf = ByteBuffer.wrap(footerBytes).order(ByteOrder.LITTLE_ENDIAN)
      val maskedCrc32OfData = footerBuf.getInt
      require(hashBytes(data) == maskedCrc32OfData, "Invalid masked CRC32 of data")
      data
    } else {
      null
    }
  }

  // InflaterInputStream#read may not fill a buffer fully even when there are more data available
  private def readFully(input: InputStream, length: Int): Array[Byte] = {
    val data = Array.ofDim[Byte](length)
    var n = 0
    var off = 0
    do {
      n = input.read(data, off, data.length - off)
      if (n > 0) {
        off += n
      }
    } while (n > 0 && off < data.length)
    if (n <= 0) null else data
  }

  def wrapInputStream(stream: InputStream, compression: Compression): InputStream = {
    val deflateParam = new DeflateParameters()
    deflateParam.setWithZlibHeader(true)

    compression match {
      case Compression.AUTO =>
        val pushback = new PushbackInputStream(stream, 2)
        if (isInflaterInputStream(pushback)) {
          new DeflateCompressorInputStream(pushback, deflateParam)
        } else if (isGzipInputStream(pushback)) {
          new GzipCompressorInputStream(pushback)
        } else {
          pushback
        }
      case Compression.UNCOMPRESSED => stream
      case _ =>
        Channels.newInputStream(compression.readDecompressed(Channels.newChannel(stream)))
    }
  }

  private def hashLong(x: Long): Int = mask(crc32c.hashLong(x).asInt())
  private def hashBytes(x: Array[Byte]): Int = mask(crc32c.hashBytes(x).asInt())

  private def isGzipInputStream(pushback: PushbackInputStream): Boolean = {
    val b1 = pushback.read()
    val b2 = pushback.read()
    if (b2 != -1) pushback.unread(b2)
    if (b1 != -1) pushback.unread(b1)
    val zero: Byte = 0x00
    val header = Ints.fromBytes(zero, zero, b2.toByte, b1.toByte)
    (b1 != -1 && b2 != -1) && header == GZIPInputStream.GZIP_MAGIC
  }

  private def isInflaterInputStream(pushback: PushbackInputStream): Boolean = {
    val b1 = pushback.read()
    val b2 = pushback.read()
    if (b2 != -1) pushback.unread(b2)
    if (b1 != -1) pushback.unread(b1)
    (b1 != -1 && b2 != -1) && (b1 == 0x78 && (b1 * 256 + b2) % 31 == 0)
  }
}
Example 4
Source File: AttachmentService.scala From BacklogMigration-Redmine with MIT License
package com.nulabinc.backlog.r2b.exporter.service

import java.io.{File, FileOutputStream}
import java.net.{HttpURLConnection, URL}
import java.nio.channels.Channels

import com.nulabinc.backlog.migration.common.utils.ControlUtil.using
import com.nulabinc.backlog.migration.common.utils.Logging

object AttachmentService extends Logging {

  private val MAX_REDIRECT_COUNT = 10

  def download(url: URL, file: File): Unit = {
    val redirected = followRedirect(url)
    doDownload(redirected, file)
  }

  private def doDownload(url: URL, file: File): Unit =
    try {
      val rbc = Channels.newChannel(url.openStream())
      val fos = new FileOutputStream(file)
      fos.getChannel.transferFrom(rbc, 0, java.lang.Long.MAX_VALUE)
      rbc.close()
      fos.close()
    } catch {
      case e: Throwable => logger.warn("Download attachment failed: " + e.getMessage)
    }

  private def followRedirect(url: URL, count: Int = 0): URL =
    url.openConnection match {
      case http: HttpURLConnection =>
        http.setRequestMethod("GET")
        http.connect()
        using(http) { connection =>
          connection.getResponseCode match {
            case 301 | 302 | 303 =>
              val newUrl = new URL(connection.getHeaderField("Location"))
              if (count < MAX_REDIRECT_COUNT) followRedirect(newUrl, count + 1) else newUrl
            case _ =>
              url
          }
        }
      case _ => url
    }
}
Example 5
Source File: SerializableBuffer.scala From BigDatalog with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 6
Source File: SerializableBuffer.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 7
Source File: SerializableBuffer.scala From spark1.52 with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    // A buffer must exist before we can read or write; ByteBuffer's static allocate() method allocates one.
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 8
Source File: SerializableBuffer.scala From iolap with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 9
Source File: SerializableBuffer.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 10
Source File: SerializableBuffer.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 11
Source File: SerializableBuffer.scala From sparkoscope with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

private[spark] class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
}
Example 12
Source File: PythonSQLUtils.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.api.python

import java.io.InputStream
import java.nio.channels.Channels

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.python.PythonRDDServer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.execution.arrow.ArrowConverters
import org.apache.spark.sql.types.DataType

private[sql] object PythonSQLUtils {
  def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText)

  // This is needed when generating SQL documentation for built-in functions.
  def listBuiltinFunctionInfos(): Array[ExpressionInfo] = {
    FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray
  }

  private[sql] class ArrowRDDServer(sqlContext: SQLContext) extends PythonRDDServer {

    override protected def streamToRDD(input: InputStream): RDD[Array[Byte]] = {
      // Create array to consume iterator so that we can safely close the inputStream
      val batches = ArrowConverters.getBatchesFromStream(Channels.newChannel(input)).toArray
      // Parallelize the record batches to create an RDD
      JavaRDD.fromRDD(sqlContext.sparkContext.parallelize(batches, batches.length))
    }
  }
}
Example 13
Source File: CreateJacksonParser.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.catalyst.json

import java.io.{ByteArrayInputStream, InputStream, InputStreamReader}
import java.nio.channels.Channels
import java.nio.charset.Charset

import com.fasterxml.jackson.core.{JsonFactory, JsonParser}
import org.apache.hadoop.io.Text
import sun.nio.cs.StreamDecoder

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.unsafe.types.UTF8String

private[sql] object CreateJacksonParser extends Serializable {
  def string(jsonFactory: JsonFactory, record: String): JsonParser = {
    jsonFactory.createParser(record)
  }

  def utf8String(jsonFactory: JsonFactory, record: UTF8String): JsonParser = {
    val bb = record.getByteBuffer
    assert(bb.hasArray)

    val bain = new ByteArrayInputStream(
      bb.array(), bb.arrayOffset() + bb.position(), bb.remaining())

    jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))
  }

  def text(jsonFactory: JsonFactory, record: Text): JsonParser = {
    jsonFactory.createParser(record.getBytes, 0, record.getLength)
  }

  // Jackson parsers can be ranked according to their performance:
  // 1. Array based with actual encoding UTF-8 in the array. This is the fastest parser
  //    but it doesn't allow to set encoding explicitly. Actual encoding is detected automatically
  //    by checking leading bytes of the array.
  // 2. InputStream based with actual encoding UTF-8 in the stream. Encoding is detected
  //    automatically by analyzing first bytes of the input stream.
  // 3. Reader based parser. This is the slowest parser used here but it allows to create
  //    a reader with specific encoding.
  // The method creates a reader for an array with given encoding and sets size of internal
  // decoding buffer according to size of input array.
  private def getStreamDecoder(enc: String, in: Array[Byte], length: Int): StreamDecoder = {
    val bais = new ByteArrayInputStream(in, 0, length)
    val byteChannel = Channels.newChannel(bais)
    val decodingBufferSize = Math.min(length, 8192)
    val decoder = Charset.forName(enc).newDecoder()

    StreamDecoder.forDecoder(byteChannel, decoder, decodingBufferSize)
  }

  def text(enc: String, jsonFactory: JsonFactory, record: Text): JsonParser = {
    val sd = getStreamDecoder(enc, record.getBytes, record.getLength)
    jsonFactory.createParser(sd)
  }

  def inputStream(jsonFactory: JsonFactory, is: InputStream): JsonParser = {
    jsonFactory.createParser(is)
  }

  def inputStream(enc: String, jsonFactory: JsonFactory, is: InputStream): JsonParser = {
    jsonFactory.createParser(new InputStreamReader(is, enc))
  }

  def internalRow(jsonFactory: JsonFactory, row: InternalRow): JsonParser = {
    val ba = row.getBinary(0)
    jsonFactory.createParser(ba, 0, ba.length)
  }

  def internalRow(enc: String, jsonFactory: JsonFactory, row: InternalRow): JsonParser = {
    val binary = row.getBinary(0)
    val sd = getStreamDecoder(enc, binary, binary.length)
    jsonFactory.createParser(sd)
  }
}
Example 14
Source File: GcsStore.scala From fs2-blobstore with Apache License 2.0
package blobstore.gcs

import java.nio.channels.Channels
import java.time.Instant
import java.util.Date

import blobstore.{Path, Store}
import cats.effect.{Blocker, ContextShift, Sync}
import com.google.api.gax.paging.Page
import com.google.cloud.storage.{Acl, Blob, BlobId, BlobInfo, Storage}
import com.google.cloud.storage.Storage.{BlobListOption, CopyRequest}
import fs2.{Chunk, Pipe, Stream}

import scala.jdk.CollectionConverters._

final class GcsStore[F[_]](storage: Storage, blocker: Blocker, acls: List[Acl] = Nil)(
  implicit F: Sync[F], CS: ContextShift[F]
) extends Store[F] {

  private def _chunk(pg: Page[Blob]): Chunk[Path] = {
    val (dirs, files) = pg.getValues.asScala.toSeq.partition(_.isDirectory)
    val dirPaths = Chunk.seq(dirs.map(b =>
      Path(root = b.getBucket, key = b.getName.stripSuffix("/"), size = None, isDir = true, lastModified = None)
    ))
    val filePaths = Chunk.seq(files.map { b =>
      val size = Option(b.getSize: java.lang.Long).map(_.toLong) // Prevent throwing NPE (see https://github.com/scala/bug/issues/9634)
      val lastModified = Option(b.getUpdateTime: java.lang.Long).map(millis => Date.from(Instant.ofEpochMilli(millis))) // Prevent throwing NPE (see https://github.com/scala/bug/issues/9634)
      Path(b.getBucket, key = b.getName, size = size, isDir = false, lastModified = lastModified)
    })
    Chunk.concat(List(dirPaths, filePaths))
  }

  def list(path: Path): fs2.Stream[F, Path] = {
    Stream.unfoldChunkEval[F, () => Option[Page[Blob]], Path] { () =>
      Some(storage.list(path.root, BlobListOption.currentDirectory(), BlobListOption.prefix(path.key)))
    } { getPage =>
      blocker.delay {
        getPage().map { pg =>
          if (pg.hasNextPage) {
            (_chunk(pg), () => Some(pg.getNextPage))
          } else {
            (_chunk(pg), () => None)
          }
        }
      }
    }
  }

  def get(path: Path, chunkSize: Int): fs2.Stream[F, Byte] = {
    val is = blocker.delay(Channels.newInputStream(storage.get(path.root, path.key).reader()))
    fs2.io.readInputStream(is, chunkSize, blocker, closeAfterUse = true)
  }

  def put(path: Path): Pipe[F, Byte, Unit] = {
    val fos = Sync[F].delay {
      val builder = {
        val b = BlobInfo.newBuilder(path.root, path.key)
        if (acls.nonEmpty) b.setAcl(acls.asJava) else b
      }
      val blobInfo = builder.build()
      val writer = storage.writer(blobInfo)
      Channels.newOutputStream(writer)
    }
    fs2.io.writeOutputStream(fos, blocker, closeAfterUse = true)
  }

  def move(src: Path, dst: Path): F[Unit] = F.productR(copy(src, dst))(remove(src))

  def copy(src: Path, dst: Path): F[Unit] = {
    val req = CopyRequest.newBuilder().setSource(src.root, src.key).setTarget(BlobId.of(dst.root, dst.key)).build()
    F.void(blocker.delay(storage.copy(req).getResult))
  }

  def remove(path: Path): F[Unit] = F.void(blocker.delay(storage.delete(path.root, path.key)))
}

object GcsStore {
  def apply[F[_]](
    storage: Storage,
    blocker: Blocker,
    acls: List[Acl]
  )(implicit F: Sync[F], CS: ContextShift[F]): GcsStore[F] =
    new GcsStore(storage, blocker, acls)
}
Example 15
Source File: ConcatArrowAndExplodeSpec.scala From flint with Apache License 2.0
package com.twosigma.flint.timeseries

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels
import java.util.concurrent.TimeUnit

import com.twosigma.flint.arrow.ArrowUtils
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.arrow.vector.{ BigIntVector, Float8Vector, VectorSchemaRoot }
import org.apache.spark.sql.functions.{ array, col, lit, struct }
import org.apache.spark.sql.types._

class ConcatArrowAndExplodeSpec extends TimeSeriesSuite {

  "ConcatArrowAndExplode" should "work" in {
    val batchSize = 10
    var df = spark.range(1000, 2000, 1000).toDF("time")
    val columns = (0 until batchSize).map(v => struct((df("time") + v).as("time"), lit(v.toDouble).as("v")))
    df = df.withColumn("base_rows", array(columns: _*))

    val allocator = new RootAllocator(Long.MaxValue)

    val schema1 = StructType(Seq(StructField("v1", DoubleType)))
    val root1 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema1), allocator)
    val vector1 = root1.getVector("v1").asInstanceOf[Float8Vector]
    vector1.allocateNew()
    for (i <- 0 until batchSize) {
      vector1.set(i, i + 10.0)
    }
    vector1.setValueCount(batchSize)

    val out1 = new ByteArrayOutputStream()
    val arrowWriter1 = new ArrowFileWriter(root1, null, Channels.newChannel(out1))
    arrowWriter1.writeBatch()
    arrowWriter1.close()
    root1.close()

    df = df.withColumn("f1_schema", struct(lit(0.0).as("v1")))
    df = df.withColumn("f1_data", lit(out1.toByteArray))

    val schema2 = StructType(Seq(StructField("v2", DoubleType), StructField("v3", LongType)))
    val root2 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema2), allocator)
    val vector2 = root2.getVector("v2").asInstanceOf[Float8Vector]
    val vector3 = root2.getVector("v3").asInstanceOf[BigIntVector]
    vector2.allocateNew()
    vector3.allocateNew()
    for (i <- 0 until batchSize) {
      vector2.set(i, i + 20.0)
    }
    vector2.setValueCount(batchSize)
    for (i <- 0 until batchSize) {
      vector3.set(i, i + 30L)
    }
    vector3.setValueCount(batchSize)

    val out2 = new ByteArrayOutputStream()
    val arrowWriter2 = new ArrowFileWriter(root2, null, Channels.newChannel(out2))
    arrowWriter2.writeBatch()
    arrowWriter2.close()
    root2.close()

    df = df.withColumn("f2_schema", struct(lit(0.0).as("v2"), lit(0L).as("v3")))
    df = df.withColumn("f2_data", lit(out2.toByteArray))

    var tsrdd = TimeSeriesRDD.fromDF(df)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    tsrdd = tsrdd.concatArrowAndExplode("base_rows", Seq("f1_schema", "f2_schema"), Seq("f1_data", "f2_data"))
    tsrdd.toDF.show()

    var expected = spark.range(1000, 1000 + batchSize).toDF("time")
    expected = expected.withColumn("v", col("time") - 1000.0)
    expected = expected.withColumn("v1", col("time") - 1000 + 10.0)
    expected = expected.withColumn("v2", col("time") - 1000 + 20.0)
    expected = expected.withColumn("v3", col("time") - 1000 + 30)

    val expectedTsrdd = TimeSeriesRDD.fromDF(expected)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    assertEquals(tsrdd, expectedTsrdd)
  }
}
Example 16
Source File: ArrowSummarizer.scala From flint with Apache License 2.0
package com.twosigma.flint.rdd.function.summarize.summarizer

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels
import java.util

import com.twosigma.flint.arrow.{ ArrowFieldWriter, ArrowPayload, ArrowUtils, ArrowWriter }
import org.apache.arrow.memory.{ BufferAllocator, RootAllocator }
import org.apache.arrow.vector.VectorSchemaRoot
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.sql.types.StructType

import scala.collection.JavaConverters._

case class ArrowSummarizer(inputSchema: StructType, outputSchema: StructType, includeBaseRows: Boolean)
  extends Summarizer[InternalRow, ArrowSummarizerState, ArrowSummarizerResult] {

  private[this] val size = outputSchema.size
  require(size > 0, "Cannot create summarizer with no input columns")

  // This function will allocate memory from the BufferAllocator to initialize arrow vectors.
  override def zero(): ArrowSummarizerState = {
    new ArrowSummarizerState(false, null, null, null, null)
  }

  private def init(u: ArrowSummarizerState): Unit = {
    if (!u.initialized) {
      val arrowSchema = ArrowUtils.toArrowSchema(outputSchema)
      val allocator = new RootAllocator(Int.MaxValue)
      val root = VectorSchemaRoot.create(arrowSchema, allocator)
      val arrowWriter = ArrowWriter.create(inputSchema, outputSchema, root)

      u.initialized = true
      u.baseRows = new util.ArrayList[InternalRow]()
      u.allocator = allocator
      u.root = root
      u.arrowWriter = arrowWriter
    }
  }

  override def add(u: ArrowSummarizerState, row: InternalRow): ArrowSummarizerState = {
    if (!u.initialized) {
      init(u)
    }

    if (includeBaseRows) {
      u.baseRows.add(row)
    }
    u.arrowWriter.write(row)
    u
  }

  override def merge(
    u1: ArrowSummarizerState,
    u2: ArrowSummarizerState
  ): ArrowSummarizerState = throw new UnsupportedOperationException()

  // This can only be called once
  override def render(u: ArrowSummarizerState): ArrowSummarizerResult = {
    if (u.initialized) {
      val out = new ByteArrayOutputStream()
      val writer = new ArrowFileWriter(u.root, null, Channels.newChannel(out))

      u.arrowWriter.finish()
      writer.writeBatch()

      writer.close()
      u.root.close()
      u.allocator.close()

      val rows = u.baseRows.toArray.asInstanceOf[Array[Any]]
      ArrowSummarizerResult(rows, out.toByteArray)
    } else {
      ArrowSummarizerResult(Array.empty, Array.empty)
    }
  }

  override def close(u: ArrowSummarizerState): Unit = {
    if (u.initialized) {
      u.arrowWriter.reset()
      u.root.close()
      u.allocator.close()
    }
  }
}
Example 17
Source File: ArrowConverters.scala From flint with Apache License 2.0
package com.twosigma.flint.arrow

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels

import org.apache.arrow.memory.BufferAllocator
import org.apache.arrow.vector._
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel
import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.types._
import com.twosigma.flint.util.Utils
import org.apache.arrow.vector.ipc.{ ArrowFileReader, ArrowFileWriter }
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch

trait ClosableIterator[T] extends Iterator[T] with AutoCloseable

class ConcatClosableIterator[T](iters: Iterator[ClosableIterator[T]])
  extends ClosableIterator[T] {
  var curIter: ClosableIterator[T] = _

  private def advance(): Unit = {
    require(curIter == null || !curIter.hasNext, "Should not advance if curIter is not empty")
    require(iters.hasNext, "Should not advance if iters doesn't have next")
    closeCurrent()
    curIter = iters.next()
  }

  private def closeCurrent(): Unit = if (curIter != null) curIter.close()

  override def close(): Unit = closeCurrent()

  override def hasNext: Boolean = {
    if (curIter == null || !curIter.hasNext) {
      if (iters.hasNext) {
        advance()
        hasNext
      } else {
        false
      }
    } else {
      true
    }
  }

  override def next(): T = curIter.next()
}

object ArrowConverters {

  def byteArrayToBatch(
    batchBytes: Array[Byte],
    allocator: BufferAllocator
  ): ArrowRecordBatch = {
    val in = new ByteArrayReadableSeekableByteChannel(batchBytes)
    val reader = new ArrowFileReader(in, allocator)

    // Read a batch from a byte stream, ensure the reader is closed
    Utils.tryWithSafeFinally {
      val root = reader.getVectorSchemaRoot // throws IOException
      val unloader = new VectorUnloader(root)
      reader.loadNextBatch() // throws IOException
      unloader.getRecordBatch
    } {
      reader.close()
    }
  }
}
Example 18
Source File: FileManager.scala From slide-desktop with GNU General Public License v2.0
package slide

import java.io.{File, FileOutputStream}
import java.net.{URL, URLConnection}
import java.nio.channels.{Channels, ReadableByteChannel}

class FileManager {

  var currentFile: String = ""
  var numberOfDownloads: Int = 0

  def downloadFile(dlsite: String, path: String): Unit = {
    val url: URL = new URL(dlsite)
    val file: File = new File(path)

    if (isConnected(url)) {
      currentFile = path
      onDownloadStart()

      new Thread(new Runnable {
        override def run(): Unit = {
          try {
            val rbc: ReadableByteChannel = Channels.newChannel(url.openStream())
            val fos: FileOutputStream = new FileOutputStream(file)

            fos.getChannel.transferFrom(rbc, 0, java.lang.Long.MAX_VALUE)
            fos.close()

            numberOfDownloads += 1
            onDownloadFinished()
          } catch {
            case e: Exception =>
              println("Error: Could not download ADB, please run as Administrator")
          }
        }
      }).start()
    }
  }

  def isConnected(site: URL): Boolean = {
    try {
      // test connection
      val conn: URLConnection = site.openConnection()
      conn.setConnectTimeout(5000)
      conn.getContent
      true
    } catch {
      case e: Exception => false
    }
  }

  def onDownloadStart(): Unit = {}

  def onDownloadFinished(): Unit = {}

  // var onDownloadStart: () => Unit = null
  // var onDownloadFinished: () => Unit = null
}