org.apache.hadoop.hbase.util.Bytes Scala Examples
The following examples show how to use org.apache.hadoop.hbase.util.Bytes.
Each example is taken from an open-source project; the source file, project name, and license are noted above each example.
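Before the full examples, here is a minimal sketch of the round-trip conversions that Bytes provides (the object name and values are illustrative only):

// Minimal sketch: encoding values to byte arrays and back with org.apache.hadoop.hbase.util.Bytes.
import org.apache.hadoop.hbase.util.Bytes

object BytesRoundTrip extends App {
  val rowKey: Array[Byte] = Bytes.toBytes("row-001") // String -> bytes
  val counter: Array[Byte] = Bytes.toBytes(42L)      // Long -> bytes

  println(Bytes.toString(rowKey)) // "row-001"
  println(Bytes.toLong(counter))  // 42

  // Byte-wise lexicographic comparison, the same ordering HBase uses for row keys
  println(Bytes.compareTo(Bytes.toBytes("a"), Bytes.toBytes("b")) < 0) // true
}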
Example 1
Source File: HBaseLocalClient.scala From gimel with Apache License 2.0
package com.paypal.gimel.hbase.utilities

import java.io.File

import scala.collection.mutable.ArrayBuffer

import com.google.common.io.Files
import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf
import org.apache.spark.sql.util._
import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}

import com.paypal.gimel.common.catalog.Field
import com.paypal.gimel.hbase.DataSet

class HBaseLocalClient extends FunSuite with Matchers with BeforeAndAfterAll {

  var sparkSession: SparkSession = _
  var dataSet: DataSet = _
  val hbaseTestingUtility = new HBaseTestingUtility()
  val tableName = "test_table"
  val cfs = Array("personal", "professional")
  val columns = Array("id", "name", "age", "address", "company", "designation", "salary")
  val fields = columns.map(col => new Field(col))

  val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)]

  protected override def beforeAll(): Unit = {
    val tempDir: File = Files.createTempDir
    tempDir.deleteOnExit
    hbaseTestingUtility.startMiniCluster()
    SparkHBaseConf.conf = hbaseTestingUtility.getConfiguration
    createTable(tableName, cfs)
    val conf = new SparkConf
    conf.set(SparkHBaseConf.testConf, "true")
    sparkSession = SparkSession.builder()
      .master("local")
      .appName("HBase Test")
      .config(conf)
      .getOrCreate()
    val listener = new QueryExecutionListener {
      // Only test successful case here, so no need to implement `onFailure`
      override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {}

      override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = {
        metrics += ((funcName, qe, duration))
      }
    }
    sparkSession.listenerManager.register(listener)
    sparkSession.sparkContext.setLogLevel("ERROR")
    dataSet = new DataSet(sparkSession)
  }

  protected override def afterAll(): Unit = {
    hbaseTestingUtility.shutdownMiniCluster()
    sparkSession.close()
  }

  def createTable(name: String, cfs: Array[String]) {
    val tName = Bytes.toBytes(name)
    val bcfs = cfs.map(Bytes.toBytes(_))
    try {
      hbaseTestingUtility.deleteTable(TableName.valueOf(tName))
    } catch {
      case _: Throwable => println("No table = " + name + " found")
    }
    hbaseTestingUtility.createMultiRegionTable(TableName.valueOf(tName), bcfs)
  }

  // Mocks data for testing
  def mockDataInDataFrame(numberOfRows: Int): DataFrame = {
    def stringed(n: Int) =
      s"""{"id": "$n","name": "MAC-$n", "address": "MAC-${n + 1}", "age": "${n + 1}", "company": "MAC-$n", "designation": "MAC-$n", "salary": "${n * 10000}" }"""
    val texts: Seq[String] = (1 to numberOfRows).map { x => stringed(x) }
    val rdd: RDD[String] = sparkSession.sparkContext.parallelize(texts)
    val dataFrame: DataFrame = sparkSession.read.json(rdd)
    dataFrame
  }
}
Example 2
Source File: HBaseBulkGetExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseBulkGet[String](hbaseContext, TableName.valueOf(tableName), 2,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {
          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 3
Source File: HBaseForeachPartitionExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseForeachPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseForeachPartitionExample {tableName} {columnFamily} are missing arguments")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseForeachPartitionExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseForeachPartition(hbaseContext,
        (it, connection) => {
          val m = connection.getBufferedMutator(TableName.valueOf(tableName))

          it.foreach(r => {
            val put = new Put(r._1)
            r._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
            m.mutate(put)
          })
          m.flush()
          m.close()
        })
    } finally {
      sc.stop()
    }
  }
}
Example 4
Source File: HBaseBulkDeleteExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeleteExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseBulkDelete(hbaseContext, TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
}
Example 5
Source File: HBaseBulkPutExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily} are missing arguments")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 6
Source File: HBaseBulkGetExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = hbaseContext.bulkGet[Array[Byte], String](
        TableName.valueOf(tableName),
        2,
        rdd,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {
          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 7
Source File: HBaseStreamingBulkPutExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseStreamingBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 4) {
      println("HBaseStreamingBulkPutExample " +
        "{host} {port} {tableName} {columnFamily} are missing arguments")
      return
    }

    val host = args(0)
    val port = args(1)
    val tableName = args(2)
    val columnFamily = args(3)

    val sparkConf = new SparkConf().setAppName("HBaseStreamingBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val ssc = new StreamingContext(sc, Seconds(1))

      val lines = ssc.socketTextStream(host, port.toInt)

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      hbaseContext.streamBulkPut[String](lines,
        TableName.valueOf(tableName),
        (putRecord) => {
          if (putRecord.length() > 0) {
            val put = new Put(Bytes.toBytes(putRecord))
            put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar"))
            put
          } else {
            null
          }
        })
      ssc.start()
      ssc.awaitTerminationOrTimeout(60000)
    } finally {
      sc.stop()
    }
  }
}
Example 8
Source File: HBaseBulkPutExampleFromFile.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutExampleFromFile {
  def main(args: Array[String]) {
    if (args.length < 3) {
      println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile} are missing arguments")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)
    val inputFile = args(2)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " +
      tableName + " " + columnFamily + " " + inputFile)
    val sc = new SparkContext(sparkConf)

    try {
      var rdd = sc.hadoopFile(
        inputFile,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]).map(v => {
        System.out.println("reading-" + v._2.toString)
        v._2.toString
      })

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      hbaseContext.bulkPut[String](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          System.out.println("hbase-" + putRecord)
          val put = new Put(Bytes.toBytes("Value- " + putRecord))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"),
            Bytes.toBytes(putRecord.length()))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 9
Source File: HBaseBulkDeleteExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeleteExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkDelete[Array[Byte]](rdd,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
}
Example 10
Source File: HBaseBulkPutExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily} are missing arguments")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 11
Source File: HBaseDistributedScanExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseDistributedScanExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseDistributedScanExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseDistributedScanExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val scan = new Scan()
      scan.setCaching(100)

      val getRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan)

      getRdd.foreach(v => println(Bytes.toString(v._1.get())))

      println("Length: " + getRdd.map(r => r._1.copyBytes()).collect().length)
    } finally {
      sc.stop()
    }
  }
}
Example 12
Source File: HBaseBulkPutTimestampExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutTimestampExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.out.println("HBaseBulkPutTimestampExample {tableName} {columnFamily} are missing arguments")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("6"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("7"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("8"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("9"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("10"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))))

      val conf = HBaseConfiguration.create()

      val timeStamp = System.currentTimeMillis()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2,
            timeStamp, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 13
Source File: ReadFilter.scala From hbase-rdd-examples with Apache License 2.0
package unicredit.example

import org.apache.hadoop.hbase.filter.PrefixFilter
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import unicredit.spark.hbase._

object ReadFilter extends App {
  val name = "Example of read from HBase table"

  lazy val sparkConf = new SparkConf().setAppName(name)
  lazy val sc = new SparkContext(sparkConf)
  implicit val config = HBaseConfig() // Assumes hbase-site.xml is on classpath

  val columns = Map(
    "cf1" -> Set("col1", "col2"),
    "cf2" -> Set("col3")
  )
  val filter = new PrefixFilter(Bytes.toBytes("abc"))

  sc.hbase[String]("test-table", columns, filter)
    .map({ case (k, v) =>
      val cf1 = v("cf1")
      val col1 = cf1("col1")
      val col2 = cf1("col2")
      val col3 = v("cf2")("col3")

      List(k, col1, col2, col3) mkString "\t"
    })
    .saveAsTextFile("test-output")
}
Example 14
Source File: HBaseMapPartitionExample.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseMapPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseMapPartitionExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseMapPartitionExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseMapPartitions[String](hbaseContext, (it, connection) => {
        val table = connection.getTable(TableName.valueOf(tableName))
        it.map { r =>
          //batching would be faster. This is just an example
          val result = table.get(new Get(r))

          val cellIt = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (cellIt.hasNext) {
            val cell = cellIt.next()
            // Clone qualifier and value; the raw backing arrays contain more than this cell
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        }
      })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 15
Source File: HBasePut.scala From gimel with Apache License 2.0
package com.paypal.gimel.hbase.utilities

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.{DataFrame, SparkSession}

import com.paypal.gimel.hbase.conf.{HbaseClientConfiguration, HbaseConfigs}
import com.paypal.gimel.logger.Logger

object HBasePut {
  def apply(sparkSession: SparkSession): HBasePut = new HBasePut(sparkSession)
}

class HBasePut(sparkSession: SparkSession) {
  val logger = Logger()
  lazy val hbaseUtilities = HBaseUtilities(sparkSession)

  def putRows(hbaseTable: String, dataFrame: DataFrame, rowKeyColumn: String,
              columns: Array[String], cfColsMap: Map[String, String]) {
    try {
      // Configure And Connect
      val conf = HBaseConfiguration.create()
      val cnxn = ConnectionFactory.createConnection(conf)
      // Create Connection to HBase table
      val tbl = cnxn.getTable(TableName.valueOf(hbaseTable))
      val rows = dataFrame.rdd.map { row =>
        (row.getAs(rowKeyColumn).toString,
          columns.map(eachCol => (cfColsMap.getOrElse(eachCol, ""), eachCol,
            row.getAs(eachCol).asInstanceOf[String]))
        )
      }.collect()
      // Performing put operation on each row of dataframe
      rows.foreach { row =>
        val putRow: Put = new Put(Bytes.toBytes(row._1.asInstanceOf[String]))
        row._2.foreach(x =>
          if (x._2 != rowKeyColumn) putRow.addColumn(Bytes.toBytes(x._1), Bytes.toBytes(x._2), Bytes.toBytes(x._3)))
        tbl.put(putRow)
      }
      tbl.close()
    } catch {
      case ex: Throwable =>
        ex.printStackTrace()
        throw ex
    }
  }
}
Example 16
Source File: HogConfig.scala From hogzilla with GNU General Public License v2.0
package org.hogzilla.util

import java.security.MessageDigest

import org.apache.hadoop.hbase.util.Bytes
import javax.xml.bind.DatatypeConverter
import math._
import com.typesafe.config.Config
import scala.collection.mutable.HashSet

object HogConfig {

  def get(config: Config, key: String, valueType: String, default: Any): Any = {
    if (config == null) return default

    try {
      val value = config.getString(key)

      if (value.isEmpty()) return default // Return default value

      println(f"Configuration: $key => $value")

      if (valueType.equals("Int"))
        value.toInt
      else if (valueType.equals("Double"))
        value.toDouble
      else if (valueType.equals("Long"))
        value.toLong
      else if (valueType.equals("Set(Int)")) {
        val patternSet = "Set\\(".r
        val patternSetEnd = "\\)".r

        if (value.equals("Set()")) return Set()

        return (patternSetEnd replaceAllIn ((patternSet replaceAllIn (value, "")), ""))
          .split(",").map({ x => x.toInt }).toSet
      } else if (valueType.equals("Set(String)")) {
        val patternSet = "Set\\(".r
        val patternSetEnd = "\\)".r

        if (value.equals("Set()")) return Set()

        return (patternSetEnd replaceAllIn ((patternSet replaceAllIn (value, "")), ""))
          .split(",").map({ x => println(x.toString.trim()); x.toString.trim() }).toSet
      } else
        default // Create type first
    } catch {
      case t: Throwable =>
        t.printStackTrace()
        println(f"Problem parsing $key . Check if it is ok. Using default value")
        return default
    }
  }

  def getInt(config: Config, key: String, default: Any): Int = {
    get(config, key, "Int", default).asInstanceOf[Int]
  }

  def getLong(config: Config, key: String, default: Any): Long = {
    get(config, key, "Long", default).asInstanceOf[Long]
  }

  def getDouble(config: Config, key: String, default: Any): Double = {
    // Cast to Double (the original cast to Long was a bug for a Double-returning method)
    get(config, key, "Double", default).asInstanceOf[Double]
  }

  def getSetInt(config: Config, key: String, default: Any): Set[Int] = {
    get(config, key, "Set(Int)", default).asInstanceOf[Set[Int]]
  }

  def getSetString(config: Config, key: String, default: Any): Set[String] = {
    get(config, key, "Set(String)", default).asInstanceOf[Set[String]]
  }
}
Example 17
Source File: HogGeograph.scala From hogzilla with GNU General Public License v2.0
package org.hogzilla.util

import java.security.MessageDigest

import org.apache.hadoop.hbase.util.Bytes
import javax.xml.bind.DatatypeConverter
import math._

object HogGeograph {

  val R = 6372.8 //radius in km

  def haversineDistance(lat1: Double, lon1: Double, lat2: Double, lon2: Double): Double = {
    val dLat = (lat2 - lat1).toRadians
    val dLon = (lon2 - lon1).toRadians

    val a = pow(sin(dLat / 2), 2) + pow(sin(dLon / 2), 2) * cos(lat1.toRadians) * cos(lat2.toRadians)
    val c = 2 * asin(sqrt(a))
    R * c
  }

  def haversineDistanceFromStrings(coords1: String, coords2: String): Double = {
    try {
      val coordsDouble1 = coords1.split(",").map({ x => x.toDouble })
      val coordsDouble2 = coords2.split(",").map({ x => x.toDouble })

      haversineDistance(coordsDouble1(0), coordsDouble1(1), coordsDouble2(0), coordsDouble2(1))
    } catch {
      case t: Throwable =>
        // t.printStackTrace()
        // Return a large distance
        999999999D
    }
  }
}
Example 18
Source File: HogEvent.scala From hogzilla with GNU General Public License v2.0
package org.hogzilla.event

import java.util.HashMap
import java.util.Map

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.hogzilla.hbase.HogHBaseRDD
import org.hogzilla.util.HogFlow
import java.net.InetAddress

class HogEvent(flow: HogFlow) {
  var sensorid: Int = 0
  var signature_id: Double = 0
  var priorityid: Int = 0
  var text: String = ""
  var data: Map[String, String] = new HashMap()
  var ports: String = ""
  var title: String = ""
  var username: String = ""
  var coords: String = ""

  def formatIPtoBytes(ip: String): Array[Byte] = {
    try {
      // Eca! Snorby doesn't support IPv6 yet. See https://github.com/Snorby/snorby/issues/65
      if (ip.contains(":"))
        InetAddress.getByName("255.255.6.6").getAddress
      else
        InetAddress.getByName(ip).getAddress
    } catch {
      case t: Throwable =>
        // Bogus address!
        InetAddress.getByName("255.255.1.1").getAddress
    }
  }

  def alert() {
    val put = new Put(Bytes.toBytes(flow.get("flow:id")))
    put.add(Bytes.toBytes("event"), Bytes.toBytes("note"), Bytes.toBytes(text))
    put.add(Bytes.toBytes("event"), Bytes.toBytes("lower_ip"), formatIPtoBytes(flow.lower_ip))
    put.add(Bytes.toBytes("event"), Bytes.toBytes("upper_ip"), formatIPtoBytes(flow.upper_ip))
    put.add(Bytes.toBytes("event"), Bytes.toBytes("lower_ip_str"), Bytes.toBytes(flow.lower_ip))
    put.add(Bytes.toBytes("event"), Bytes.toBytes("upper_ip_str"), Bytes.toBytes(flow.upper_ip))
    put.add(Bytes.toBytes("event"), Bytes.toBytes("signature_id"), Bytes.toBytes("%.0f".format(signature_id)))
    put.add(Bytes.toBytes("event"), Bytes.toBytes("time"), Bytes.toBytes(System.currentTimeMillis))
    put.add(Bytes.toBytes("event"), Bytes.toBytes("ports"), Bytes.toBytes(ports))
    put.add(Bytes.toBytes("event"), Bytes.toBytes("title"), Bytes.toBytes(title))

    if (!username.equals(""))
      put.add(Bytes.toBytes("event"), Bytes.toBytes("username"), Bytes.toBytes(username))
    if (!coords.equals(""))
      put.add(Bytes.toBytes("event"), Bytes.toBytes("coords"), Bytes.toBytes(coords))

    HogHBaseRDD.hogzilla_events.put(put)

    //println(f"ALERT: $text%100s\n\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
  }
}
Example 19
Source File: HogSignature.scala From hogzilla with GNU General Public License v2.0
package org.hogzilla.event

import org.hogzilla.hbase.HogHBaseRDD
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Put

case class HogSignature(signature_class: Int, signature_name: String, signature_priority: Int,
                        signature_revision: Int, signature_id: Double, signature_group_id: Int) {
  //Example: 3,"HZ: Suspicious DNS flow identified by K-Means clustering",2,1,826000001,826

  def saveHBase(): HogSignature = {
    val get = new Get(Bytes.toBytes("%.0f".format(signature_id)))

    if (!HogHBaseRDD.hogzilla_sensor.exists(get)) {
      val put = new Put(Bytes.toBytes("%.0f".format(signature_id)))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("id"), Bytes.toBytes("%.0f".format(signature_id)))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("class"), Bytes.toBytes(signature_class.toString()))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("name"), Bytes.toBytes(signature_name))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("priority"), Bytes.toBytes(signature_priority.toString()))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("revision"), Bytes.toBytes(signature_revision.toString()))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("group_id"), Bytes.toBytes(signature_group_id.toString()))
      HogHBaseRDD.hogzilla_signatures.put(put)
    }

    this
  }
}
Example 20
Source File: HogHBaseReputation.scala From hogzilla with GNU General Public License v2.0
package org.hogzilla.hbase

import scala.math.random
import java.lang.Math

import org.apache.spark._
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionModel, LinearRegressionWithSGD}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
import org.apache.hadoop.hbase.filter.BinaryComparator
import org.apache.hadoop.hbase.filter.FilterList
import org.apache.hadoop.hbase.filter.CompareFilter
import java.util.ArrayList

import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.filter.Filter
import scala.collection.mutable.HashSet
import org.apache.hadoop.hbase.client.Put

object HogHBaseReputation {

  // Ex: MX, whitelist
  def getReputationList(listName: String, listType: String): Set[String] = {
    val list = new HashSet[String]

    val filters: ArrayList[Filter] = new ArrayList();

    val colValFilter1 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list_type"),
      CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listType)))
    colValFilter1.setFilterIfMissing(false);

    val colValFilter2 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list"),
      CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listName)))
    colValFilter2.setFilterIfMissing(false);

    filters.add(colValFilter1);
    filters.add(colValFilter2);

    val filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL, filters);
    val scan = new Scan()
    scan.setFilter(filterList)

    val it = HogHBaseRDD.hogzilla_reputation.getScanner(scan).iterator()
    while (it.hasNext()) {
      list.add(Bytes.toString(it.next().getValue(Bytes.toBytes("rep"), Bytes.toBytes("ip"))))
    }

    list.toSet
  }

  def saveReputationList(listName: String, listType: String, ip: String) = {
    val put = new Put(Bytes.toBytes(ip + "-" + listName + "-" + listType))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), Bytes.toBytes(listType))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("list"), Bytes.toBytes(listName))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("ip"), Bytes.toBytes(ip))

    HogHBaseRDD.hogzilla_reputation.put(put)
  }
}
Example 21
Source File: HogHBaseCluster.scala From hogzilla with GNU General Public License v2.0
package org.hogzilla.hbase

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.Vector
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Delete
import org.hogzilla.cluster.HogClusterMember

object HogHBaseCluster {

  def formatClusterTitle(clusterCentroid: List[(Long, Double)], clusterIdx: Int): String = {
    val mainTitle = "Group " + clusterIdx.toString + " - " +
      clusterCentroid
        .filter({ case (port, rate) => rate > 4.999 })
        .map({ case (port, rate) => port.toString() + ":" + "%.0f".format(rate) + "%" }).mkString(", ")

    val onePercentList = clusterCentroid
      .filter({ case (port, rate) => .9999 < rate & rate < 5 })

    if (onePercentList.size > 0) {
      mainTitle + ", " + onePercentList.map({ case (port, rate) => port.toString() }).mkString("(", ", ", ")" + "> 1%")
    } else {
      mainTitle
    }
  }

  def deleteCluster(clusterIdx: Int) = {
    val del = new Delete(Bytes.toBytes(clusterIdx.toString))
    HogHBaseRDD.hogzilla_clusters.delete(del)
  }

  def deleteClusterMember(memberIP: String) = {
    val del = new Delete(Bytes.toBytes(memberIP))
    HogHBaseRDD.hogzilla_cluster_members.delete(del)
  }

  def saveCluster(clusterIdx: Int, clusterCentroid: List[(Long, Double)], clusterSize: Long,
                  members: Array[String]) = {
    val memberString = members.mkString(",")

    val put = new Put(Bytes.toBytes(clusterIdx.toString))
    put.add(Bytes.toBytes("info"), Bytes.toBytes("title"), Bytes.toBytes(formatClusterTitle(clusterCentroid, clusterIdx)))
    put.add(Bytes.toBytes("info"), Bytes.toBytes("size"), Bytes.toBytes(clusterSize.toString))
    put.add(Bytes.toBytes("info"), Bytes.toBytes("centroid"), Bytes.toBytes(clusterCentroid.mkString("[", ",", "]")))
    put.add(Bytes.toBytes("info"), Bytes.toBytes("members"), Bytes.toBytes(memberString))

    HogHBaseRDD.hogzilla_clusters.put(put)
  }

  def saveClusterMember(clusterMember: HogClusterMember) = {
    val put = new Put(Bytes.toBytes(clusterMember.memberIP.toString))
    put.add(Bytes.toBytes("info"), Bytes.toBytes("title"), Bytes.toBytes(clusterMember.formatTitle))
    put.add(Bytes.toBytes("cluster"), Bytes.toBytes("size"), Bytes.toBytes(clusterMember.clusterSize.toString))
    put.add(Bytes.toBytes("cluster"), Bytes.toBytes("centroid"), Bytes.toBytes(clusterMember.centroid.mkString("[", ",", "]")))
    put.add(Bytes.toBytes("cluster"), Bytes.toBytes("idx"), Bytes.toBytes(clusterMember.clusterIdx.toString))
    put.add(Bytes.toBytes("cluster"), Bytes.toBytes("description"), Bytes.toBytes(formatClusterTitle(clusterMember.centroid, clusterMember.clusterIdx)))
    put.add(Bytes.toBytes("member"), Bytes.toBytes("ports"), Bytes.toBytes("TCP: " + clusterMember.ports.mkString("", " ", "")))
    put.add(Bytes.toBytes("member"), Bytes.toBytes("frequencies"), Bytes.toBytes("TCP: " +
      clusterMember.frequency_vector
        .filter({ case (port, freq) => clusterMember.ports.contains(port) })
        .map({ case (port, freq) => port.toString + "=" + "%.0f".format(freq) + "%" })
        .mkString("", " ", "")
    ))
    put.add(Bytes.toBytes("member"), Bytes.toBytes("ip"), Bytes.toBytes(clusterMember.memberIP))
    put.add(Bytes.toBytes("member"), Bytes.toBytes("distance"), Bytes.toBytes("%.2f".format(clusterMember.distance)))

    HogHBaseRDD.hogzilla_cluster_members.put(put)
  }
}
Example 22
Source File: BytesUtilsSuite.scala From Spark-SQL-on-HBase with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Logging
import org.apache.spark.sql.hbase.types.HBaseBytesType
import org.apache.spark.sql.hbase.util.BinaryBytesUtils
import org.apache.spark.sql.types._
import org.scalatest.{BeforeAndAfterAll, FunSuite}

class BytesUtilsSuite extends FunSuite with BeforeAndAfterAll with Logging {
  test("Bytes Ordering Test") {
    val s = Seq(-257, -256, -255, -129, -128, -127, -64, -16, -4, -1,
      0, 1, 4, 16, 64, 127, 128, 129, 255, 256, 257)
    val result = s.map(i => (i, BinaryBytesUtils.create(IntegerType).toBytes(i)))
      .sortWith((f, s) =>
        HBaseBytesType.ordering.gt(
          f._2.asInstanceOf[HBaseBytesType.InternalType],
          s._2.asInstanceOf[HBaseBytesType.InternalType]))
    assert(result.map(a => a._1) == s.sorted.reverse)
  }

  def compare(a: Array[Byte], b: Array[Byte]): Int = {
    val length = Math.min(a.length, b.length)
    var result: Int = 0
    for (i <- 0 to length - 1) {
      val diff: Int = (a(i) & 0xff).asInstanceOf[Byte] - (b(i) & 0xff).asInstanceOf[Byte]
      if (diff != 0) {
        result = diff
      }
    }
    result
  }

  test("Bytes Utility Test") {
    assert(BinaryBytesUtils.toBoolean(BinaryBytesUtils.create(BooleanType)
      .toBytes(input = true), 0) === true)
    assert(BinaryBytesUtils.toBoolean(BinaryBytesUtils.create(BooleanType)
      .toBytes(input = false), 0) === false)

    assert(BinaryBytesUtils.toDouble(BinaryBytesUtils.create(DoubleType).toBytes(12.34d), 0) === 12.34d)
    assert(BinaryBytesUtils.toDouble(BinaryBytesUtils.create(DoubleType).toBytes(-12.34d), 0) === -12.34d)

    assert(BinaryBytesUtils.toFloat(BinaryBytesUtils.create(FloatType).toBytes(12.34f), 0) === 12.34f)
    assert(BinaryBytesUtils.toFloat(BinaryBytesUtils.create(FloatType).toBytes(-12.34f), 0) === -12.34f)

    assert(BinaryBytesUtils.toInt(BinaryBytesUtils.create(IntegerType).toBytes(12), 0) === 12)
    assert(BinaryBytesUtils.toInt(BinaryBytesUtils.create(IntegerType).toBytes(-12), 0) === -12)

    assert(BinaryBytesUtils.toLong(BinaryBytesUtils.create(LongType).toBytes(1234l), 0) === 1234l)
    assert(BinaryBytesUtils.toLong(BinaryBytesUtils.create(LongType).toBytes(-1234l), 0) === -1234l)

    assert(BinaryBytesUtils.toShort(BinaryBytesUtils.create(ShortType)
      .toBytes(12.asInstanceOf[Short]), 0) === 12)
    assert(BinaryBytesUtils.toShort(BinaryBytesUtils.create(ShortType)
      .toBytes(-12.asInstanceOf[Short]), 0) === -12)

    assert(BinaryBytesUtils.toUTF8String(BinaryBytesUtils.create(StringType).toBytes("abc"), 0, 3)
      === UTF8String("abc"))
    assert(BinaryBytesUtils.toUTF8String(BinaryBytesUtils.create(StringType).toBytes(""), 0, 0) === UTF8String(""))

    assert(BinaryBytesUtils.toByte(BinaryBytesUtils.create(ByteType)
      .toBytes(5.asInstanceOf[Byte]), 0) === 5)
    assert(BinaryBytesUtils.toByte(BinaryBytesUtils.create(ByteType)
      .toBytes(-5.asInstanceOf[Byte]), 0) === -5)

    assert(compare(BinaryBytesUtils.create(IntegerType).toBytes(128),
      BinaryBytesUtils.create(IntegerType).toBytes(-128)) > 0)
  }

  test("byte array plus one") {
    var byteArray = Array[Byte](0x01.toByte, 127.toByte)
    assert(Bytes.compareTo(BinaryBytesUtils.addOne(byteArray), Array[Byte](0x01.toByte, 0x80.toByte)) == 0)

    byteArray = Array[Byte](0xff.toByte, 0xff.toByte)
    assert(BinaryBytesUtils.addOne(byteArray) == null)

    byteArray = Array[Byte](0x02.toByte, 0xff.toByte)
    assert(Bytes.compareTo(BinaryBytesUtils.addOne(byteArray), Array[Byte](0x03.toByte, 0x00.toByte)) == 0)
  }

  test("float comparison") {
    val f1 = BinaryBytesUtils.create(FloatType).toBytes(-1.23f)
    val f2 = BinaryBytesUtils.create(FloatType).toBytes(100f)
    assert(Bytes.compareTo(f1, f2) < 0)
  }
}
Example 23
Source File: HBasePartitioner.scala From Spark-SQL-on-HBase with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner
import org.apache.spark.util.CollectionsUtils

object HBasePartitioner {
  implicit object HBaseRawOrdering extends Ordering[HBaseRawType] {
    def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b)
  }
}

class HBasePartitioner(var splitKeys: Array[HBaseRawType]) extends Partitioner {
  import HBasePartitioner.HBaseRawOrdering

  type t = HBaseRawType

  lazy private val len = splitKeys.length

  // For pre-split table splitKeys(0) = bytes[0], to remove it,
  // otherwise partition 0 always be empty and
  // we will miss the last region's date when bulk load
  lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail

  def numPartitions = if (len == 0) 1 else len

  @transient private lazy val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t]

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[t]
    var partition = 0
    if (len <= 128 && len > 0) {
      // If we have less than 128 partitions naive search
      val ordering = implicitly[Ordering[t]]
      while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) {
        partition += 1
      }
    } else {
      // Determine which binary search method to use only once.
      partition = binarySearch(realSplitKeys, k)
      // binarySearch either returns the match location or -[insertion point]-1
      if (partition < 0) {
        partition = -partition - 1
      }
      if (partition > realSplitKeys.length) {
        partition = realSplitKeys.length
      }
    }
    partition
  }

  override def equals(other: Any): Boolean = other match {
    case r: HBasePartitioner =>
      r.splitKeys.sameElements(splitKeys)
    case _ =>
      false
  }

  override def hashCode(): Int = {
    val prime = 31
    var result = 1
    var i = 0
    while (i < splitKeys.length) {
      result = prime * result + splitKeys(i).hashCode
      i += 1
    }
    result = prime * result
    result
  }
}
Example 24
Source File: HBasePartitioner.scala From Heracles with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner
import org.apache.spark.util.CollectionsUtils

object HBasePartitioner {
  implicit object HBaseRawOrdering extends Ordering[HBaseRawType] {
    def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b)
  }
}

class HBasePartitioner(val splitKeys: Array[HBaseRawType]) extends Partitioner {
  import HBasePartitioner.HBaseRawOrdering

  type t = HBaseRawType

  lazy private val len = splitKeys.length

  // For pre-split table splitKeys(0) = bytes[0], to remove it,
  // otherwise partition 0 always be empty and
  // we will miss the last region's date when bulk load
  lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail

  override def numPartitions = if (len == 0) 1 else len

  @transient private lazy val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t]

  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[t]
    var partition = 0
    if (len <= 128 && len > 0) {
      // If we have less than 128 partitions naive search
      val ordering = implicitly[Ordering[t]]
      while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) {
        partition += 1
      }
    } else {
      // Determine which binary search method to use only once.
      partition = binarySearch(realSplitKeys, k)
      // binarySearch either returns the match location or -[insertion point]-1
      if (partition < 0) {
        partition = -partition - 1
      }
      if (partition > realSplitKeys.length) {
        partition = realSplitKeys.length
      }
    }
    partition
  }

  override def equals(other: Any): Boolean = other match {
    case r: HBasePartitioner =>
      r.splitKeys.sameElements(splitKeys)
    case _ =>
      false
  }

  override def hashCode(): Int = {
    val prime = 31
    var result = 1
    var i = 0
    while (i < splitKeys.length) {
      result = prime * result + splitKeys(i).hashCode
      i += 1
    }
    result = prime * result
    result
  }
}
Example 25
Source File: ByteArrayComparable.scala From SparkOnHBase with Apache License 2.0
package org.apache.hadoop.hbase.spark

import org.apache.hadoop.hbase.util.Bytes

class ByteArrayComparable(val bytes: Array[Byte], val offset: Int = 0, var length: Int = -1)
  extends Comparable[ByteArrayComparable] {

  if (length == -1) {
    length = bytes.length
  }

  override def compareTo(o: ByteArrayComparable): Int = {
    Bytes.compareTo(bytes, offset, length, o.bytes, o.offset, o.length)
  }

  override def hashCode(): Int = {
    Bytes.hashCode(bytes, offset, length)
  }

  override def equals(obj: Any): Boolean = {
    obj match {
      case b: ByteArrayComparable =>
        Bytes.equals(bytes, offset, length, b.bytes, b.offset, b.length)
      case _ =>
        false
    }
  }
}
Example 26
Source File: HbaseReaderHelper.scala From stream-reactor with Apache License 2.0
package com.datamountaineer.streamreactor.connect.hbase.writers

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import com.datamountaineer.streamreactor.connect.hbase.HbaseHelper
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}

import scala.collection.JavaConverters._

object HbaseReaderHelper {
  def createConnection: Connection = {
    ConnectionFactory.createConnection(HBaseConfiguration.create())
  }

  def getAllRecords(tableName: String, columnFamily: String)(implicit connection: Connection): List[HbaseRowData] = {
    HbaseHelper.withTable(TableName.valueOf(tableName)) { tbl =>
      val scan = new Scan()
      scan.addFamily(columnFamily.fromString())
      val scanner = tbl.getScanner(scan)
      scanner.asScala.map { rs =>
        val cells = rs.rawCells().map { cell =>
          Bytes.toString(CellUtil.cloneQualifier(cell)) -> CellUtil.cloneValue(cell)
        }.toMap
        HbaseRowData(rs.getRow, cells)
      }.toList
    }
  }
}

case class HbaseRowData(key: Array[Byte], cells: Map[String, Array[Byte]])
Example 27
Source File: HBaseServiceLayer.scala From Taxi360 with Apache License 2.0
package com.cloudera.sa.taxi360.server.hbase

import javax.ws.rs._
import javax.ws.rs.core.MediaType

import com.cloudera.sa.taxi360.model.NyTaxiYellowTrip
import com.cloudera.sa.taxi360.streaming.ingestion.hbase.TaxiTripHBaseHelper
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

@Path("rest")
class HBaseServiceLayer {

  @GET
  @Path("hello")
  @Produces(Array(MediaType.TEXT_PLAIN))
  def hello(): String = {
    "Hello World"
  }

  @GET
  @Path("vender/{venderId}/timeline")
  @Produces(Array(MediaType.APPLICATION_JSON))
  def getTripTimeLine(@PathParam("venderId") venderId: String,
                      @QueryParam("startTime") startTime: String = Long.MinValue.toString,
                      @QueryParam("endTime") endTime: String = Long.MaxValue.toString): Array[NyTaxiYellowTrip] = {

    val table = HBaseGlobalValues.connection.getTable(TableName.valueOf(HBaseGlobalValues.appEventTableName))

    val st = if (startTime == null) {
      Long.MinValue.toString
    } else {
      startTime
    }
    val et = if (endTime == null) {
      Long.MaxValue.toString
    } else {
      endTime
    }

    val scan = new Scan()
    val startRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, st.toLong, HBaseGlobalValues.numberOfSalts)
    println("startRowKey:" + Bytes.toString(startRowKey))
    scan.setStartRow(startRowKey)
    val endRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, et.toLong, HBaseGlobalValues.numberOfSalts)
    println("endRowKey:" + Bytes.toString(endRowKey))
    scan.setStopRow(endRowKey)

    val scannerIt = table.getScanner(scan).iterator()

    val tripList = new mutable.MutableList[NyTaxiYellowTrip]

    while (scannerIt.hasNext) {
      val result = scannerIt.next()
      tripList += TaxiTripHBaseHelper.convertToTaxiTrip(result)
      println("Found a trip:" + TaxiTripHBaseHelper.convertToTaxiTrip(result))
    }

    println("tripList.size:" + tripList.size)

    tripList.toArray
  }
}
Example 28
Source File: CreateSaltedTable.scala From Taxi360 with Apache License 2.0
package com.cloudera.sa.taxi360.setup.hbase

import java.io.File

import org.apache.commons.lang.StringUtils
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.compress.Compression
import org.apache.hadoop.hbase.regionserver.{BloomType, ConstantSizeRegionSplitPolicy}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

object CreateSaltedTable {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("<tableName> <columnFamily> <regionCount> <numOfSalts> <hbaseConfigFolder>")
      // Exit early instead of falling through to args(0) with no arguments
      return
    }
    val tableName = args(0)
    val columnFamilyName = args(1)
    val regionCount = args(2).toInt
    val numOfSalts = args(3).toInt
    val hbaseConfigFolder = args(4)

    val conf = HBaseConfiguration.create()

    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val connection = ConnectionFactory.createConnection(conf)

    val admin = connection.getAdmin

    val tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName))

    val columnDescriptor = new HColumnDescriptor(columnFamilyName)

    columnDescriptor.setCompressionType(Compression.Algorithm.SNAPPY)
    columnDescriptor.setBlocksize(64 * 1024)
    columnDescriptor.setBloomFilterType(BloomType.ROW)

    tableDescriptor.addFamily(columnDescriptor)

    tableDescriptor.setMaxFileSize(Long.MaxValue)
    tableDescriptor.setRegionSplitPolicyClassName(classOf[ConstantSizeRegionSplitPolicy].getName)

    val splitKeys = new mutable.MutableList[Array[Byte]]
    for (i <- 0 to regionCount) {
      val regionSplitStr = StringUtils.leftPad((i * (numOfSalts / regionCount)).toString, 4, "0")
      splitKeys += Bytes.toBytes(regionSplitStr)
    }
    admin.createTable(tableDescriptor, splitKeys.toArray)
  }
}
Example 29
Source File: Test.scala From shc with Apache License 2.0
package org.apache.spark.sql

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.apache.spark.sql.types.BinaryType

object Test {
  def main(args: Array[String]) {
    val a: Array[Byte] = Array.fill(10)(Byte.MinValue)
    val b = Bytes.toBytes("row003")
    System.arraycopy(b, 0, a, 0, b.length)
    val c = Bytes.toBytes(Int.MinValue)
    System.arraycopy(c, 0, a, b.length, c.length)
    val len = a.indexOf(HBaseTableCatalog.delimiter, 0)
    val s1 = Bytes.toString(a, 0, 6)
    val s2 = Bytes.toString(a, 0, len)

    val l = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(l, 0, Double.MinValue)
    val m = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(m, 0, -20.0)
    val n = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(n, 0, 0.0)
    val o = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(o, 0, 20.0)
    val p = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(p, 0, Double.MaxValue)

    val c1 = BinaryType.ordering.compare(l, m)
    val c2 = BinaryType.ordering.compare(m, n)
    val c3 = BinaryType.ordering.compare(n, o)
    val c4 = BinaryType.ordering.compare(o, p)

    val p1 = Array.fill(10)(0: Byte)
    Bytes.putBytes(p1, 0, Bytes.toBytes("row010"), 0, 6)
    val p2 = Array.fill(10)(-1: Byte)
    Bytes.putBytes(p2, 0, Bytes.toBytes("row010"), 0, 6)
    val p3 = Array.fill(10)(Byte.MaxValue)
    Bytes.putBytes(p3, 0, Bytes.toBytes("row010"), 0, 6)
    Bytes.putInt(p3, 6, 10)
    val p4 = Bytes.compareTo(p1, p3)
    val p5 = Bytes.compareTo(p2, p3)

    val z = Array.fill(4)(Byte.MinValue)
    Bytes.putInt(z, 0, -1)
    val z1 = Array.fill(4)(Byte.MinValue)
    Bytes.putInt(z1, 0, -2147483648)
    val z2 = Bytes.compareTo(z, z1)

    val t = Array.fill(4)(-1: Byte)
    println(Bytes.toInt(t))

    val s = Bytes.toBytes(1.4.asInstanceOf[Float])
    println(Bytes.toInt(s))
    println(Bytes.toFloat(s))
    val w = Bytes.toBytes(-1.4.asInstanceOf[Float])
    println(Bytes.toInt(w))
    println(Bytes.toFloat(w))

    val buffer1 = Bytes.toBytes(-1.0f)
    val b1 = Bytes.toInt(buffer1)
    var buffer = Array.fill(4)(-1: Byte)
    var buffer2 = Bytes.toBytes(-1.0f)
    var buffer3 = java.lang.Float.floatToIntBits(-1.0f)
    val b3 = Bytes.toBytes(buffer3)
    val out = Bytes.toInt(buffer1) ^ Integer.MIN_VALUE
    buffer2 = Bytes.toBytes(out)
    var i: Int = java.lang.Float.floatToIntBits(-1.0f)
    i = (i ^ ((i >> Integer.SIZE - 1) | Integer.MIN_VALUE)) + 1
    Bytes.putInt(buffer, 0, i)

    val mn = Bytes.toBytes(-0.0f)
    println(Bytes.toFloat(mn))
    println(Float.MinPositiveValue)
    println(s"a")
  }
}
Example 30
package org.apache.spark.sql

import java.io.File

import com.google.common.io.Files
import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName, HBaseTestingUtility}
import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf
import org.apache.spark.sql.types.UTF8String
import org.apache.spark.{SparkContext, SparkConf, Logging}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

import scala.collection.JavaConverters._

class SHC extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging {
  implicit class StringToColumn(val sc: StringContext) {
    def $(args: Any*): ColumnName = {
      new ColumnName(sc.s(args: _*))
    }
  }

  private[spark] var htu = HBaseTestingUtility.createLocalHTU()
  private[spark] def tableName = "table1"

  private[spark] def columnFamilies: Array[String] = Array.tabulate(9) { x => s"cf$x" }
  var table: Table = null
  val conf = new SparkConf
  conf.set(SparkHBaseConf.testConf, "true")
  SparkHBaseConf.conf = htu.getConfiguration
  // private[spark] var columnFamilyStr = Bytes.toString(columnFamily)

  def catalog = s"""{
                    |"table":{"namespace":"default", "name":"table1"},
                    |"rowkey":"key",
                    |"columns":{
                    |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
                    |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
                    |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
                    |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
                    |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
                    |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
                    |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
                    |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
                    |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
                    |}
                    |}""".stripMargin

  override def beforeAll() {
    val tempDir: File = Files.createTempDir
    tempDir.deleteOnExit
    htu.cleanupTestDir
    htu.startMiniZKCluster
    htu.startMiniHBaseCluster(1, 4)
    logInfo(" - minicluster started")
    println(" - minicluster started")
  }

  override def afterAll() {
    try {
      table.close()
      println("shutdown")
      htu.deleteTable(TableName.valueOf(tableName))
      logInfo("shuting down minicluster")
      htu.shutdownMiniHBaseCluster
      htu.shutdownMiniZKCluster
      logInfo(" - minicluster shut down")
      htu.cleanupTestDir
    } catch {
      case _ => logError("teardown error")
    }
  }

  def createTable(name: String, cfs: Array[String]) {
    val tName = Bytes.toBytes(name)
    val bcfs = cfs.map(Bytes.toBytes(_))
    try {
      htu.deleteTable(TableName.valueOf(tName))
    } catch {
      case _ => logInfo(" - no table " + name + " found")
    }
    htu.createMultiRegionTable(TableName.valueOf(tName), bcfs)
  }

  def createTable(name: Array[Byte], cfs: Array[Array[Byte]]) {
    try {
      htu.deleteTable(TableName.valueOf(name))
    } catch {
      case _ => logInfo(" - no table " + Bytes.toString(name) + " found")
    }
    htu.createMultiRegionTable(TableName.valueOf(name), cfs)
  }
}
Example 31
Source File: Utils.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.hbase import java.util import java.util.Comparator import org.apache.avro.generic.GenericRecord import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.catalyst.expressions.MutableRow import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.SparkSqlSerializer import org.apache.spark.sql.types._ import scala.collection.mutable.ArrayBuffer import scala.math.Ordering object Utils { def setRowCol( row: MutableRow, field: (Field, Int), src: HBaseType, offset: Int, length: Int): Unit = { val index = field._2 val f = field._1 if (f.sedes.isDefined) { // If we already have sedes defined , use it. val m = f.sedes.get.deserialize(src, offset, length) row.update(index, m) } else if (f.exeSchema.isDefined) { // println("avro schema is defined to do deserialization") // If we have avro schema defined, use it to get record, and then covert them to catalyst data type val m = AvroSedes.deserialize(src, f.exeSchema.get) // println(m) val n = f.avroToCatalyst.map(_(m)) row.update(index, n.get) } else { // Fall back to atomic type f.dt match { case BooleanType => row.setBoolean(index, toBoolean(src, offset)) case ByteType => row.setByte(index, src(offset)) case DoubleType => row.setDouble(index, Bytes.toDouble(src, offset)) case FloatType => row.setFloat(index, Bytes.toFloat(src, offset)) case IntegerType => row.setInt(index, Bytes.toInt(src, offset)) case LongType => row.setLong(index, Bytes.toLong(src, offset)) case ShortType => row.setShort(index, Bytes.toShort(src, offset)) case StringType => row.update(index, toUTF8String(src, offset, length)) case BinaryType => val newArray = new Array[Byte](length) System.arraycopy(src, offset, newArray, 0, length) row.update(index, newArray) case _ => row.update(index, SparkSqlSerializer.deserialize[Any](src)) //TODO } } } // convert input to data type def toBytes(input: Any, field: Field): Array[Byte] = { if (field.sedes.isDefined) { field.sedes.get.serialize(input) } else if (field.schema.isDefined) { // Here we assume the top level type is structType val record = field.catalystToAvro(input) AvroSedes.serialize(record, field.schema.get) } else { input match { case data: Boolean => Bytes.toBytes(data) case data: Byte => Array(data) case data: Array[Byte] => data case data: Double => Bytes.toBytes(data) case data: Float => Bytes.toBytes(data) case data: Int => Bytes.toBytes(data) case data: Long => Bytes.toBytes(data) case data: Short => Bytes.toBytes(data) case data: UTF8String => data.getBytes case data: String => Bytes.toBytes(data) //Bytes.toBytes(input.asInstanceOf[String])//input.asInstanceOf[UTF8String].getBytes case _ => throw new Exception(s"unsupported data type ${field.dt}") //TODO } } } def toBoolean(input: HBaseType, offset: Int): Boolean = { input(offset) != 0 } def toUTF8String(input: HBaseType, offset: Int, length: Int): UTF8String = { UTF8String(input.slice(offset, offset + length)) } }
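The atomic-type branches in setRowCol and toBytes above are the two directions of the same Bytes round trip; a small sketch of that symmetry, using only the Bytes API and no Spark row types (values are illustrative):

import org.apache.hadoop.hbase.util.Bytes

// Encode with Bytes.toBytes, decode with the matching Bytes.toXxx reader.
val i = Bytes.toBytes(42)
val d = Bytes.toBytes(42.5d)
val s = Bytes.toBytes("forty-two")

assert(Bytes.toInt(i) == 42)
assert(Bytes.toDouble(d) == 42.5d)
assert(Bytes.toString(s) == "forty-two")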
Example 32
Source File: Sedes.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.hbase import java.io.ByteArrayInputStream import org.apache.avro.Schema import org.apache.avro.Schema.Type._ import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord} import org.apache.avro.io._ import org.apache.commons.io.output.ByteArrayOutputStream import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.types._ trait Sedes { def serialize(value: Any): Array[Byte] def deserialize(bytes: Array[Byte], start: Int, end: Int): Any } class DoubleSedes extends Sedes { override def serialize(value: Any): Array[Byte] = Bytes.toBytes(value.asInstanceOf[Double]) override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = { Bytes.toDouble(bytes, start) } }
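A hypothetical IntSedes written against the same trait shows the intended symmetry between serialize and deserialize (this class is not part of the shc sources):

import org.apache.hadoop.hbase.util.Bytes

class IntSedes extends Sedes {
  override def serialize(value: Any): Array[Byte] =
    Bytes.toBytes(value.asInstanceOf[Int])

  // An Int always occupies Bytes.SIZEOF_INT bytes, so only the start offset matters.
  override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any =
    Bytes.toInt(bytes, start)
}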
Example 33
Source File: HBaseTestSuite.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import java.io.File import scala.collection.JavaConverters._ import com.google.common.io.Files import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{TableName, HBaseTestingUtility} import org.apache.spark.sql.execution.datasources.hbase.Logging import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} class HBaseTestSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { private[spark] var htu = HBaseTestingUtility.createLocalHTU() private[spark] var tableName: Array[Byte] = Bytes.toBytes("t1") private[spark] var columnFamily: Array[Byte] = Bytes.toBytes("cf0") private[spark] var columnFamilies: Array[Array[Byte]] = Array(Bytes.toBytes("cf0"), Bytes.toBytes("cf1"), Bytes.toBytes("cf2"), Bytes.toBytes("cf3"), Bytes.toBytes("cf4")) var table: Table = null // private[spark] var columnFamilyStr = Bytes.toString(columnFamily) override def beforeAll() { val tempDir: File = Files.createTempDir tempDir.deleteOnExit htu.cleanupTestDir htu.startMiniZKCluster htu.startMiniHBaseCluster(1, 4) logInfo(" - minicluster started") println(" - minicluster started") try { htu.deleteTable(TableName.valueOf(tableName)) //htu.createTable(TableName.valueOf(tableName), columnFamily, 2, Bytes.toBytes("abc"), Bytes.toBytes("xyz"), 2) } catch { case _ : Throwable => logInfo(" - no table " + Bytes.toString(tableName) + " found") } setupTable() } override def afterAll() { try { table.close() println("shutdown") htu.deleteTable(TableName.valueOf(tableName)) logInfo("shuting down minicluster") htu.shutdownMiniHBaseCluster htu.shutdownMiniZKCluster logInfo(" - minicluster shut down") htu.cleanupTestDir } catch { case _ : Throwable => logError("teardown error") } } def setupTable() { val config = htu.getConfiguration htu.createMultiRegionTable(TableName.valueOf(tableName), columnFamilies) println("create htable t1") val connection = ConnectionFactory.createConnection(config) val r = connection.getRegionLocator(TableName.valueOf("t1")) table = connection.getTable(TableName.valueOf("t1")) val regionLocations = r.getAllRegionLocations.asScala.toSeq println(s"$regionLocations size: ${regionLocations.size}") (0 until 100).foreach { x => var put = new Put(Bytes.toBytes(s"row$x")) (0 until 5).foreach { y => put.addColumn(columnFamilies(y), Bytes.toBytes(s"c$y"), Bytes.toBytes(s"value $x $y")) } table.put(put) } } }
Example 35
package org.apache.spark.sql import org.apache.spark.sql.execution.datasources.hbase.Logging import java.io.File import com.google.common.io.Files import org.apache.hadoop.hbase.client.Table import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName} import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf import org.apache.spark.{SparkContext, SparkConf} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} class SHC extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { implicit class StringToColumn(val sc: StringContext) { def $(args: Any*): ColumnName = { new ColumnName(sc.s(args: _*)) } } var spark: SparkSession = null var sc: SparkContext = null var sqlContext: SQLContext = null var df: DataFrame = null private[spark] var htu = new HBaseTestingUtility private[spark] def tableName = "table1" private[spark] def columnFamilies: Array[String] = Array.tabulate(9){ x=> s"cf$x"} var table: Table = null val conf = new SparkConf conf.set(SparkHBaseConf.testConf, "true") // private[spark] var columnFamilyStr = Bytes.toString(columnFamily) def defineCatalog(tName: String) = s"""{ |"table":{"namespace":"default", "name":"$tName"}, |"rowkey":"key", |"columns":{ |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, |"col2":{"cf":"cf2", "col":"col2", "type":"double"}, |"col3":{"cf":"cf3", "col":"col3", "type":"float"}, |"col4":{"cf":"cf4", "col":"col4", "type":"int"}, |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"}, |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"}, |"col7":{"cf":"cf7", "col":"col7", "type":"string"}, |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"} |} |}""".stripMargin @deprecated(since = "04.12.2017(dd/mm/year)", message = "use `defineCatalog` instead") def catalog = defineCatalog(tableName) override def beforeAll() { val tempDir: File = Files.createTempDir tempDir.deleteOnExit htu.startMiniCluster SparkHBaseConf.conf = htu.getConfiguration logInfo(" - minicluster started") println(" - minicluster started") spark = SparkSession.builder() .master("local") .appName("HBaseTest") .config(conf) .getOrCreate() sqlContext = spark.sqlContext sc = spark.sparkContext } override def afterAll() { htu.shutdownMiniCluster() spark.stop() } def createTable(name: String, cfs: Array[String]) { val tName = Bytes.toBytes(name) val bcfs = cfs.map(Bytes.toBytes(_)) try { htu.deleteTable(TableName.valueOf(tName)) } catch { case _ : Throwable => logInfo(" - no table " + name + " found") } htu.createMultiRegionTable(TableName.valueOf(tName), bcfs) } def createTable(name: Array[Byte], cfs: Array[Array[Byte]]) { try { htu.deleteTable(TableName.valueOf(name)) } catch { case _ : Throwable => logInfo(" - no table " + Bytes.toString(name) + " found") } htu.createMultiRegionTable(TableName.valueOf(name), cfs) } }
Example 36
Source File: AvroRecordRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._ import com.datamountaineer.streamreactor.connect.hbase.avro.AvroRecordFieldExtractorMapFn import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.hbase.util.Bytes import org.apache.kafka.connect.sink.SinkRecord import org.mockito.MockitoSugar import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class AvroRecordRowKeyBuilderTest extends AnyWordSpec with Matchers with MockitoSugar { val schema: Schema = new Schema.Parser().parse(PersonAvroSchema.schema) "AvroRecordRowKeyBuilder" should { "extract the values from the avro record and create the key" in { val keys = Seq("firstName", "lastName", "age") val rowKeyBuilder = new AvroRecordRowKeyBuilderBytes(AvroRecordFieldExtractorMapFn(schema, keys), keys) val sinkRecord = mock[SinkRecord] val firstName = "Jack" val lastName = "Smith" val age = 29 val record = new GenericRecord { val values: Map[String, AnyRef] = Map("firstName" -> firstName, "lastName" -> lastName, "age" -> Int.box(age)) override def get(key: String): AnyRef = values(key) override def put(key: String, v: scala.Any): Unit = sys.error("not supported") override def get(i: Int): AnyRef = sys.error("not supported") override def put(i: Int, v: scala.Any): Unit = sys.error("not supported") override def getSchema: Schema = sys.error("not supported") } val expectedValue = Bytes.add( Array( firstName.fromString(), rowKeyBuilder.delimBytes, lastName.fromString(), rowKeyBuilder.delimBytes, age.fromInt())) rowKeyBuilder.build(sinkRecord, record) shouldBe expectedValue } } }
Example 37
Source File: StructFieldsRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._ import org.apache.hadoop.hbase.util.Bytes import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.SinkRecord import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StructFieldsRowKeyBuilderTest extends AnyWordSpec with Matchers { "StructFieldsRowKeyBuilder" should { "raise an exception if the field is not present in the struct" in { intercept[IllegalArgumentException] { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema).put("firstName", "Alex").put("age", 30) val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1) //val field = Field("threshold", "threshold", false) StructFieldsRowKeyBuilderBytes(List("threshold")).build(sinkRecord, null) } } "create the row key based on one single field in the struct" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema).put("firstName", "Alex").put("age", 30) //val field = Field("firstName", "firstName", true) val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1) StructFieldsRowKeyBuilderBytes(List("firstName")).build(sinkRecord, null) shouldBe "Alex".fromString } "create the row key based on more than one field in the struct" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema).put("firstName", "Alex").put("age", 30) //val field = Field("firstName", "firstName", true) //val field2 = Field("age", "age", true) val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1) StructFieldsRowKeyBuilderBytes(List("firstName", "age")).build(sinkRecord, null) shouldBe Bytes.add("Alex".fromString(), "\n".fromString(), 30.fromInt()) } } }
Example 38
Source File: ColumnFamilyQualifierMapKeyWrapper.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import org.apache.hadoop.hbase.util.Bytes class ColumnFamilyQualifierMapKeyWrapper(val columnFamily:Array[Byte], val columnFamilyOffSet:Int, val columnFamilyLength:Int, val qualifier:Array[Byte], val qualifierOffSet:Int, val qualifierLength:Int) extends Serializable{ override def equals(other:Any): Boolean = { val otherWrapper = other.asInstanceOf[ColumnFamilyQualifierMapKeyWrapper] Bytes.compareTo(columnFamily, columnFamilyOffSet, columnFamilyLength, otherWrapper.columnFamily, otherWrapper.columnFamilyOffSet, otherWrapper.columnFamilyLength) == 0 && Bytes.compareTo(qualifier, qualifierOffSet, qualifierLength, otherWrapper.qualifier, otherWrapper.qualifierOffSet, otherWrapper.qualifierLength) == 0 } override def hashCode():Int = { Bytes.hashCode(columnFamily, columnFamilyOffSet, columnFamilyLength) + Bytes.hashCode(qualifier, qualifierOffSet, qualifierLength) } def cloneColumnFamily():Array[Byte] = { val resultArray = new Array[Byte](columnFamilyLength) System.arraycopy(columnFamily, columnFamilyOffSet, resultArray, 0, columnFamilyLength) resultArray } def cloneQualifier():Array[Byte] = { val resultArray = new Array[Byte](qualifierLength) System.arraycopy(qualifier, qualifierOffSet, resultArray, 0, qualifierLength) resultArray } }
Example 39
Source File: GenericRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._ import org.apache.hadoop.hbase.util.Bytes import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class GenericRowKeyBuilderTest extends AnyWordSpec with Matchers { "GenericRowKeyBuilder" should { "use the topic, partition and offset to make the key" in { val topic = "sometopic" val partition = 2 val offset = 1243L val sinkRecord = new SinkRecord(topic, partition, Schema.INT32_SCHEMA, 345, Schema.STRING_SCHEMA, "", offset) val keyBuilder = new GenericRowKeyBuilderBytes() val expected = Bytes.add(Array(topic.fromString(), keyBuilder.delimiterBytes, partition.fromString(), keyBuilder.delimiterBytes, offset.fromString())) keyBuilder.build(sinkRecord, Nil) shouldBe expected } } }
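The fromString/fromInt helpers above come from the connector's BytesHelper; the same topic|partition|offset layout can be sketched with plain Bytes calls (the "|" delimiter below is illustrative, the builder uses its own delimiterBytes):

import org.apache.hadoop.hbase.util.Bytes

val delimiter = Bytes.toBytes("|")
val rowKey = Bytes.add(Array(
  Bytes.toBytes("sometopic"), delimiter,
  Bytes.toBytes("2"), delimiter,
  Bytes.toBytes("1243")))

assert(Bytes.toString(rowKey) == "sometopic|2|1243")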
Example 40
Source File: AvroRecordFieldExtractorMapFnTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase.avro import java.nio.file.Paths import org.apache.avro.Schema import org.apache.hadoop.hbase.util.Bytes import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class AvroRecordFieldExtractorMapFnTest extends AnyWordSpec with Matchers { val schema: Schema = new Schema.Parser().parse(Paths.get(getClass.getResource("/person.avsc").toURI).toFile) "AvroRecordFieldExtractorMapFn" should { "raise an exception if the given field does not exist in the schema" in { intercept[IllegalArgumentException] { AvroRecordFieldExtractorMapFn(schema, Seq("wrongField")) } } "raise an exception if the given field is not a primitive" in { intercept[IllegalArgumentException] { AvroRecordFieldExtractorMapFn(schema, Seq("address")) } } "create the mappings for all the given fields" in { val mappings = AvroRecordFieldExtractorMapFn(schema, Seq("firstName", "age")) val fnFirstName = mappings("firstName") val firstName = "Beaky" fnFirstName(firstName) shouldBe Bytes.toBytes(firstName) val fnAge = mappings("age") val age = 31 fnAge(age) shouldBe Bytes.toBytes(age) intercept[ClassCastException] { fnAge(12.4) } } } }
Example 41
Source File: HBaseCatalogSuite.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import org.apache.hadoop.hbase.spark.datasources.{DataTypeParserWrapper, DoubleSerDes, HBaseTableCatalog} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.types._ import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} class HBaseCatalogSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { val map = s"""MAP<int, struct<varchar:string>>""" val array = s"""array<struct<tinYint:tinyint>>""" val arrayMap = s"""MAp<int, ARRAY<double>>""" val catalog = s"""{ |"table":{"namespace":"default", "name":"htable"}, |"rowkey":"key1:key2", |"columns":{ |"col1":{"cf":"rowkey", "col":"key1", "type":"string"}, |"col2":{"cf":"rowkey", "col":"key2", "type":"double"}, |"col3":{"cf":"cf1", "col":"col2", "type":"binary"}, |"col4":{"cf":"cf1", "col":"col3", "type":"timestamp"}, |"col5":{"cf":"cf1", "col":"col4", "type":"double", "serdes":"${classOf[DoubleSerDes].getName}"}, |"col6":{"cf":"cf1", "col":"col5", "type":"$map"}, |"col7":{"cf":"cf1", "col":"col6", "type":"$array"}, |"col8":{"cf":"cf1", "col":"col7", "type":"$arrayMap"}, |"col9":{"cf":"cf1", "col":"col8", "type":"date"}, |"col10":{"cf":"cf1", "col":"col9", "type":"timestamp"} |} |}""".stripMargin val parameters = Map(HBaseTableCatalog.tableCatalog->catalog) val t = HBaseTableCatalog(parameters) def checkDataType(dataTypeString: String, expectedDataType: DataType): Unit = { test(s"parse ${dataTypeString.replace("\n", "")}") { assert(DataTypeParserWrapper.parse(dataTypeString) === expectedDataType) } } test("basic") { assert(t.getField("col1").isRowKey == true) assert(t.getPrimaryKey == "key1") assert(t.getField("col3").dt == BinaryType) assert(t.getField("col4").dt == TimestampType) assert(t.getField("col5").dt == DoubleType) assert(t.getField("col5").serdes != None) assert(t.getField("col4").serdes == None) assert(t.getField("col1").isRowKey) assert(t.getField("col2").isRowKey) assert(!t.getField("col3").isRowKey) assert(t.getField("col2").length == Bytes.SIZEOF_DOUBLE) assert(t.getField("col1").length == -1) assert(t.getField("col8").length == -1) assert(t.getField("col9").dt == DateType) assert(t.getField("col10").dt == TimestampType) } checkDataType( map, t.getField("col6").dt ) checkDataType( array, t.getField("col7").dt ) checkDataType( arrayMap, t.getField("col8").dt ) test("convert") { val m = Map("hbase.columns.mapping" -> "KEY_FIELD STRING :key, A_FIELD STRING c:a, B_FIELD DOUBLE c:b, C_FIELD BINARY c:c,", "hbase.table" -> "t1") val map = HBaseTableCatalog.convert(m) val json = map.get(HBaseTableCatalog.tableCatalog).get val parameters = Map(HBaseTableCatalog.tableCatalog->json) val t = HBaseTableCatalog(parameters) assert(t.getField("KEY_FIELD").isRowKey) assert(DataTypeParserWrapper.parse("STRING") === t.getField("A_FIELD").dt) assert(!t.getField("A_FIELD").isRowKey) assert(DataTypeParserWrapper.parse("DOUBLE") === t.getField("B_FIELD").dt) assert(DataTypeParserWrapper.parse("BINARY") === t.getField("C_FIELD").dt) } test("compatibility") { val m = Map("hbase.columns.mapping" -> "KEY_FIELD STRING :key, A_FIELD STRING c:a, B_FIELD DOUBLE c:b, C_FIELD BINARY c:c,", "hbase.table" -> "t1") val t = HBaseTableCatalog(m) assert(t.getField("KEY_FIELD").isRowKey) assert(DataTypeParserWrapper.parse("STRING") === t.getField("A_FIELD").dt) assert(!t.getField("A_FIELD").isRowKey) assert(DataTypeParserWrapper.parse("DOUBLE") === t.getField("B_FIELD").dt) assert(DataTypeParserWrapper.parse("BINARY") === t.getField("C_FIELD").dt) } }
Example 42
Source File: ByteArrayComparable.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import org.apache.yetus.audience.InterfaceAudience; import org.apache.hadoop.hbase.util.Bytes @InterfaceAudience.Public class ByteArrayComparable(val bytes:Array[Byte], val offset:Int = 0, var length:Int = -1) extends Comparable[ByteArrayComparable] { if (length == -1) { length = bytes.length } override def compareTo(o: ByteArrayComparable): Int = { Bytes.compareTo(bytes, offset, length, o.bytes, o.offset, o.length) } override def hashCode(): Int = { Bytes.hashCode(bytes, offset, length) } override def equals (obj: Any): Boolean = { obj match { case b: ByteArrayComparable => Bytes.equals(bytes, offset, length, b.bytes, b.offset, b.length) case _ => false } } }
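A quick usage sketch: the offset/length fields let two comparables share one backing array without copying (the values below are illustrative):

import org.apache.hadoop.hbase.util.Bytes

val backing = Bytes.toBytes("rowArowB")
val a = new ByteArrayComparable(backing, 0, 4)  // view over "rowA"
val b = new ByteArrayComparable(backing, 4, 4)  // view over "rowB"

assert(a.compareTo(b) < 0)
assert(a.equals(new ByteArrayComparable(Bytes.toBytes("rowA"))))
assert(a.hashCode == new ByteArrayComparable(Bytes.toBytes("rowA")).hashCode)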
Example 43
Source File: ColumnFamilyQualifierMapKeyWrapper.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import org.apache.yetus.audience.InterfaceAudience; import org.apache.hadoop.hbase.util.Bytes @InterfaceAudience.Public class ColumnFamilyQualifierMapKeyWrapper(val columnFamily:Array[Byte], val columnFamilyOffSet:Int, val columnFamilyLength:Int, val qualifier:Array[Byte], val qualifierOffSet:Int, val qualifierLength:Int) extends Serializable{ override def equals(other:Any): Boolean = { val otherWrapper = other.asInstanceOf[ColumnFamilyQualifierMapKeyWrapper] Bytes.compareTo(columnFamily, columnFamilyOffSet, columnFamilyLength, otherWrapper.columnFamily, otherWrapper.columnFamilyOffSet, otherWrapper.columnFamilyLength) == 0 && Bytes.compareTo(qualifier, qualifierOffSet, qualifierLength, otherWrapper.qualifier, otherWrapper.qualifierOffSet, otherWrapper.qualifierLength) == 0 } override def hashCode():Int = { Bytes.hashCode(columnFamily, columnFamilyOffSet, columnFamilyLength) + Bytes.hashCode(qualifier, qualifierOffSet, qualifierLength) } def cloneColumnFamily():Array[Byte] = { val resultArray = new Array[Byte](columnFamilyLength) System.arraycopy(columnFamily, columnFamilyOffSet, resultArray, 0, columnFamilyLength) resultArray } def cloneQualifier():Array[Byte] = { val resultArray = new Array[Byte](qualifierLength) System.arraycopy(qualifier, qualifierOffSet, resultArray, 0, qualifierLength) resultArray } }
Example 44
Source File: ByteArrayWrapper.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import java.io.Serializable import org.apache.yetus.audience.InterfaceAudience; import org.apache.hadoop.hbase.util.Bytes @InterfaceAudience.Public class ByteArrayWrapper (var value:Array[Byte]) extends Comparable[ByteArrayWrapper] with Serializable { override def compareTo(valueOther: ByteArrayWrapper): Int = { Bytes.compareTo(value,valueOther.value) } override def equals(o2: Any): Boolean = { o2 match { case wrapper: ByteArrayWrapper => Bytes.equals(value, wrapper.value) case _ => false } } override def hashCode():Int = { Bytes.hashCode(value) } }
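The wrapper exists because raw Array[Byte] keys use reference equality and identity hash codes; a short sketch of why that matters when byte arrays are used as map keys:

import org.apache.hadoop.hbase.util.Bytes
import scala.collection.mutable

// Two identical encodings are still different array instances...
assert(Bytes.toBytes("cf1") != Bytes.toBytes("cf1"))  // reference inequality
// ...but wrap them and they behave as one logical key.
val counts = mutable.Map.empty[ByteArrayWrapper, Int]
counts(new ByteArrayWrapper(Bytes.toBytes("cf1"))) = 1
counts(new ByteArrayWrapper(Bytes.toBytes("cf1"))) = 2
assert(counts.size == 1 && counts(new ByteArrayWrapper(Bytes.toBytes("cf1"))) == 2)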
Example 45
Source File: BulkLoadPartitioner.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import java.util import java.util.Comparator import org.apache.yetus.audience.InterfaceAudience; import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner @InterfaceAudience.Public class BulkLoadPartitioner(startKeys:Array[Array[Byte]]) extends Partitioner { // when table not exist, startKeys = Byte[0][] override def numPartitions: Int = if (startKeys.length == 0) 1 else startKeys.length override def getPartition(key: Any): Int = { val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] { override def compare(o1: Array[Byte], o2: Array[Byte]): Int = { Bytes.compareTo(o1, o2) } } val rowKey:Array[Byte] = key match { case qualifier: KeyFamilyQualifier => qualifier.rowKey case wrapper: ByteArrayWrapper => wrapper.value case _ => key.asInstanceOf[Array[Byte]] } var partition = util.Arrays.binarySearch(startKeys, rowKey, comparator) if (partition < 0) partition = partition * -1 + -2 if (partition < 0) partition = 0 partition } }
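A sketch of how getPartition maps row keys onto regions, assuming three regions whose start keys are the empty key, "row050" and "row100" (the key values are illustrative):

import org.apache.hadoop.hbase.util.Bytes

val startKeys: Array[Array[Byte]] =
  Array(Array.empty[Byte], Bytes.toBytes("row050"), Bytes.toBytes("row100"))
val partitioner = new BulkLoadPartitioner(startKeys)

assert(partitioner.numPartitions == 3)
assert(partitioner.getPartition(Bytes.toBytes("row010")) == 0)  // before first split
assert(partitioner.getPartition(Bytes.toBytes("row050")) == 1)  // exact start key
assert(partitioner.getPartition(Bytes.toBytes("row075")) == 1)  // between splits
assert(partitioner.getPartition(Bytes.toBytes("row999")) == 2)  // after last split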
Example 46
Source File: package.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import org.apache.hadoop.hbase.util.Bytes import scala.math.Ordering // TODO: add @InterfaceAudience.Private if https://issues.scala-lang.org/browse/SI-3600 is resolved package object hbase { type HBaseType = Array[Byte] def bytesMin = new Array[Byte](0) def bytesMax = null val ByteMax = -1.asInstanceOf[Byte] val ByteMin = 0.asInstanceOf[Byte] val ord: Ordering[HBaseType] = new Ordering[HBaseType] { def compare(x: Array[Byte], y: Array[Byte]): Int = { return Bytes.compareTo(x, y) } } //Do not use BinaryType.ordering implicit val order: Ordering[HBaseType] = ord }
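Because the package object publishes `order` as an implicit Ordering[HBaseType], plain Array[Byte] row keys can be sorted directly; a small sketch:

import org.apache.hadoop.hbase.spark.hbase._
import org.apache.hadoop.hbase.util.Bytes

val keys: Seq[HBaseType] =
  Seq(Bytes.toBytes("row2"), Bytes.toBytes("row10"), Bytes.toBytes("row1"))

// Lexicographic byte order, not numeric order: "row1" < "row10" < "row2".
val sorted = keys.sorted
assert(sorted.map(Bytes.toString) == Seq("row1", "row10", "row2"))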
Example 47
Source File: Utils.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.datasources import java.sql.{Date, Timestamp} import org.apache.hadoop.hbase.spark.AvroSerdes import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.yetus.audience.InterfaceAudience; @InterfaceAudience.Private object Utils { def hbaseFieldToScalaType( f: Field, src: Array[Byte], offset: Int, length: Int): Any = { if (f.exeSchema.isDefined) { // If we have avro schema defined, use it to get record, and then convert them to catalyst data type val m = AvroSerdes.deserialize(src, f.exeSchema.get) val n = f.avroToCatalyst.map(_(m)) n.get } else { // Fall back to atomic type f.dt match { case BooleanType => src(offset) != 0 case ByteType => src(offset) case ShortType => Bytes.toShort(src, offset) case IntegerType => Bytes.toInt(src, offset) case LongType => Bytes.toLong(src, offset) case FloatType => Bytes.toFloat(src, offset) case DoubleType => Bytes.toDouble(src, offset) case DateType => new Date(Bytes.toLong(src, offset)) case TimestampType => new Timestamp(Bytes.toLong(src, offset)) case StringType => UTF8String.fromBytes(src, offset, length) case BinaryType => val newArray = new Array[Byte](length) System.arraycopy(src, offset, newArray, 0, length) newArray // TODO: SparkSqlSerializer.deserialize[Any](src) case _ => throw new Exception(s"unsupported data type ${f.dt}") } } } // convert input to data type def toBytes(input: Any, field: Field): Array[Byte] = { if (field.schema.isDefined) { // Here we assume the top level type is structType val record = field.catalystToAvro(input) AvroSerdes.serialize(record, field.schema.get) } else { field.dt match { case BooleanType => Bytes.toBytes(input.asInstanceOf[Boolean]) case ByteType => Array(input.asInstanceOf[Number].byteValue) case ShortType => Bytes.toBytes(input.asInstanceOf[Number].shortValue) case IntegerType => Bytes.toBytes(input.asInstanceOf[Number].intValue) case LongType => Bytes.toBytes(input.asInstanceOf[Number].longValue) case FloatType => Bytes.toBytes(input.asInstanceOf[Number].floatValue) case DoubleType => Bytes.toBytes(input.asInstanceOf[Number].doubleValue) case DateType | TimestampType => Bytes.toBytes(input.asInstanceOf[java.util.Date].getTime) case StringType => Bytes.toBytes(input.toString) case BinaryType => input.asInstanceOf[Array[Byte]] case _ => throw new Exception(s"unsupported data type ${field.dt}") } } } }
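The DateType and TimestampType branches above store both types as epoch milliseconds; the round trip, isolated to the Bytes calls involved:

import java.sql.Timestamp
import org.apache.hadoop.hbase.util.Bytes

val ts = new Timestamp(1500000000000L)
val encoded = Bytes.toBytes(ts.getTime)             // what toBytes() writes
val decoded = new Timestamp(Bytes.toLong(encoded))  // what hbaseFieldToScalaType reads back

assert(decoded == ts)
assert(encoded.length == Bytes.SIZEOF_LONG)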
Example 48
Source File: SerDes.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.datasources import org.apache.hadoop.hbase.util.Bytes import org.apache.yetus.audience.InterfaceAudience // TODO: This is not really used in code. @InterfaceAudience.Public trait SerDes { def serialize(value: Any): Array[Byte] def deserialize(bytes: Array[Byte], start: Int, end: Int): Any } // TODO: This is not really used in code. @InterfaceAudience.Private class DoubleSerDes extends SerDes { override def serialize(value: Any): Array[Byte] = Bytes.toBytes(value.asInstanceOf[Double]) override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = { Bytes.toDouble(bytes, start) } }
Example 49
Source File: KeyFamilyQualifier.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import java.io.Serializable import org.apache.yetus.audience.InterfaceAudience; import org.apache.hadoop.hbase.util.Bytes @InterfaceAudience.Public class KeyFamilyQualifier(val rowKey:Array[Byte], val family:Array[Byte], val qualifier:Array[Byte]) extends Comparable[KeyFamilyQualifier] with Serializable { override def compareTo(o: KeyFamilyQualifier): Int = { var result = Bytes.compareTo(rowKey, o.rowKey) if (result == 0) { result = Bytes.compareTo(family, o.family) if (result == 0) result = Bytes.compareTo(qualifier, o.qualifier) } result } override def toString: String = { Bytes.toString(rowKey) + ":" + Bytes.toString(family) + ":" + Bytes.toString(qualifier) } }
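A usage sketch of the ordering: keys compare by row key first, then family, then qualifier, which is the order HFile writers expect cells in during a bulk load:

import org.apache.hadoop.hbase.util.Bytes

val a = new KeyFamilyQualifier(Bytes.toBytes("row1"), Bytes.toBytes("cf"), Bytes.toBytes("a"))
val b = new KeyFamilyQualifier(Bytes.toBytes("row1"), Bytes.toBytes("cf"), Bytes.toBytes("b"))
val c = new KeyFamilyQualifier(Bytes.toBytes("row2"), Bytes.toBytes("cf"), Bytes.toBytes("a"))

assert(a.compareTo(b) < 0)  // same row and family, qualifier decides
assert(b.compareTo(c) < 0)  // row key decides first
println(a)                  // prints "row1:cf:a"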
Example 50
Source File: IndexEdgeDeserializable.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.serde.indexedge.wide import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core._ import org.apache.s2graph.core.schema.{Label, LabelMeta, ServiceColumn} import org.apache.s2graph.core.storage.serde.StorageDeserializable._ import org.apache.s2graph.core.storage._ import org.apache.s2graph.core.storage.serde.Deserializable import org.apache.s2graph.core.types._ class IndexEdgeDeserializable(graph: S2GraphLike, bytesToLongFunc: (Array[Byte], Int) => Long = bytesToLong) extends Deserializable[S2EdgeLike] { type QualifierRaw = (Array[(LabelMeta, InnerValLike)], VertexId, Byte, Boolean, Int) type ValueRaw = (Array[(LabelMeta, InnerValLike)], Int) val builder = graph.elementBuilder override def fromKeyValues[T: CanSKeyValue](_kvs: Seq[T], cacheElementOpt: Option[S2EdgeLike]): Option[S2EdgeLike] = { try { assert(_kvs.size == 1) // val kvs = _kvs.map { kv => implicitly[CanSKeyValue[T]].toSKeyValue(kv) } val kv = implicitly[CanSKeyValue[T]].toSKeyValue(_kvs.head) val version = kv.timestamp var pos = 0 val (srcVertexId, srcIdLen) = SourceVertexId.fromBytes(kv.row, pos, kv.row.length, HBaseType.DEFAULT_VERSION) pos += srcIdLen val labelWithDir = LabelWithDirection(Bytes.toInt(kv.row, pos, 4)) pos += 4 val (labelIdxSeq, isInverted) = bytesToLabelIndexSeqWithIsInverted(kv.row, pos) pos += 1 if (isInverted) None else { val label = Label.findById(labelWithDir.labelId) val schemaVer = label.schemaVersion val srcVertex = builder.newVertex(srcVertexId, version) //TODO: var tsVal = version if (kv.qualifier.isEmpty) { val degreeVal = bytesToLongFunc(kv.value, 0) val tgtVertexId = VertexId(ServiceColumn.Default, InnerVal.withStr("0", schemaVer)) val tgtVertex = builder.newVertex(tgtVertexId, version) val edge = builder.newEdge(srcVertex, tgtVertex, label, labelWithDir.dir, GraphUtil.defaultOpByte, version, S2Edge.EmptyState) edge.propertyInner(LabelMeta.timestamp.name, version, version) edge.propertyInner(LabelMeta.degree.name, degreeVal, version) edge.tgtVertex = builder.newVertex(tgtVertexId, version) edge.setOp(GraphUtil.defaultOpByte) edge.setTsInnerValOpt(Option(InnerVal.withLong(tsVal, schemaVer))) Option(edge) } else { pos = 0 val (idxPropsRaw, endAt) = bytesToProps(kv.qualifier, pos, schemaVer) pos = endAt val (tgtVertexIdRaw, tgtVertexIdLen) = if (endAt == kv.qualifier.length) { (HBaseType.defaultTgtVertexId, 0) } else { TargetVertexId.fromBytes(kv.qualifier, endAt, kv.qualifier.length, schemaVer) } pos += tgtVertexIdLen val op = if (kv.qualifier.length == pos) GraphUtil.defaultOpByte else kv.qualifier(kv.qualifier.length-1) val tgtVertex = builder.newVertex(tgtVertexIdRaw, version) val edge = builder.newEdge(srcVertex, tgtVertex, label, labelWithDir.dir, GraphUtil.defaultOpByte, version, S2Edge.EmptyState) val index = label.indicesMap.getOrElse(labelIdxSeq, throw new RuntimeException(s"invalid index seq: ${label.id.get}, ${labelIdxSeq}")) if (edge.checkProperty(LabelMeta.to.name)) { val vId = edge.property(LabelMeta.to.name).asInstanceOf[S2Property[_]].innerValWithTs val tgtVertex = builder.newVertex(TargetVertexId(ServiceColumn.Default, vId.innerVal), version) edge.setTgtVertex(tgtVertex) } edge.propertyInner(LabelMeta.timestamp.name, tsVal, version) edge.setOp(op) edge.setTsInnerValOpt(Option(InnerVal.withLong(tsVal, schemaVer))) Option(edge) } } } catch { case e: Exception => None } } }
Example 51
Source File: HBasePartitioner.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import java.io.{IOException, ObjectInputStream, ObjectOutputStream} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.serializer.JavaSerializer import org.apache.spark.util.{CollectionsUtils, Utils} import org.apache.spark.{Partitioner, SparkEnv} object HBasePartitioner { implicit object HBaseRawOrdering extends Ordering[HBaseRawType] { def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b) } } class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner { import HBasePartitioner.HBaseRawOrdering type t = HBaseRawType lazy private val len = splitKeys.length // For a pre-split table splitKeys(0) is bytes[0]; remove it, // otherwise partition 0 would always be empty and // we would miss the last region's data when bulk loading lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail def numPartitions = if (len == 0) 1 else len @transient private val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t] def getPartition(key: Any): Int = { val k = key.asInstanceOf[t] var partition = 0 if (len <= 128 && len > 0) { // If we have fewer than 128 partitions, use a naive linear search val ordering = implicitly[Ordering[t]] while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) { partition += 1 } } else { // Determine which binary search method to use only once. partition = binarySearch(realSplitKeys, k) // binarySearch either returns the match location or -[insertion point]-1 if (partition < 0) { partition = -partition - 1 } if (partition > realSplitKeys.length) { partition = realSplitKeys.length } } partition } override def equals(other: Any): Boolean = other match { case r: HBasePartitioner => r.splitKeys.sameElements(splitKeys) case _ => false } override def hashCode(): Int = { val prime = 31 var result = 1 var i = 0 while (i < splitKeys.length) { result = prime * result + splitKeys(i).hashCode i += 1 } result = prime * result result } }
Example 52
Source File: HBaseServiceLayer.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.server.hbase import javax.ws.rs._ import javax.ws.rs.core.MediaType import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTrip import com.hadooparchitecturebook.taxi360.streaming.ingestion.hbase.TaxiTripHBaseHelper import org.apache.hadoop.hbase.{HBaseConfiguration, TableName} import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan} import org.apache.hadoop.hbase.util.Bytes import scala.collection.mutable @Path("rest") class HBaseServiceLayer { @GET @Path("hello") @Produces(Array(MediaType.TEXT_PLAIN)) def hello(): String = { "Hello World" } @GET @Path("vender/{venderId}/timeline") @Produces(Array(MediaType.APPLICATION_JSON)) def getTripTimeLine (@PathParam("venderId") venderId:String, @QueryParam("startTime") startTime:String = Long.MinValue.toString, @QueryParam("endTime") endTime:String = Long.MaxValue.toString): Array[NyTaxiYellowTrip] = { val table = HBaseGlobalValues.connection.getTable(TableName.valueOf(HBaseGlobalValues.appEventTableName)) val st = if (startTime == null) { Long.MinValue.toString } else { startTime } val et = if (endTime == null) { Long.MaxValue.toString } else { endTime } val scan = new Scan() val startRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, st.toLong, HBaseGlobalValues.numberOfSalts) println("startRowKey:" + Bytes.toString(startRowKey)) scan.setStartRow(startRowKey) val endRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, et.toLong, HBaseGlobalValues.numberOfSalts) println("endRowKey:" + Bytes.toString(endRowKey)) scan.setStopRow(endRowKey) val scannerIt = table.getScanner(scan).iterator() val tripList = new mutable.MutableList[NyTaxiYellowTrip] while(scannerIt.hasNext) { val result = scannerIt.next() tripList += TaxiTripHBaseHelper.convertToTaxiTrip(result) println("Found a trip:" + TaxiTripHBaseHelper.convertToTaxiTrip(result)) } println("tripList.size:" + tripList.size) tripList.toArray } }
Example 53
Source File: CreateSaltedTable.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.setup.hbase import java.io.File import org.apache.commons.lang.StringUtils import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName} import org.apache.hadoop.hbase.client.ConnectionFactory import org.apache.hadoop.hbase.io.compress.Compression import org.apache.hadoop.hbase.regionserver.{BloomType, ConstantSizeRegionSplitPolicy} import org.apache.hadoop.hbase.util.Bytes import scala.collection.mutable object CreateSaltedTable { def main(args:Array[String]): Unit = { if (args.length == 0) { println("<tableName> <columnFamily> <regionCount> <numOfSalts> <hbaseConfigFolder>") } val tableName = args(0) val columnFamilyName = args(1) val regionCount = args(2).toInt val numOfSalts = args(3).toInt val hbaseConfigFolder = args(4) val conf = HBaseConfiguration.create() conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL) val connection = ConnectionFactory.createConnection(conf) val admin = connection.getAdmin val tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName)) val columnDescriptor = new HColumnDescriptor(columnFamilyName) columnDescriptor.setCompressionType(Compression.Algorithm.SNAPPY) columnDescriptor.setBlocksize(64 * 1024) columnDescriptor.setBloomFilterType(BloomType.ROW) tableDescriptor.addFamily(columnDescriptor) tableDescriptor.setMaxFileSize(Long.MaxValue) tableDescriptor.setRegionSplitPolicyClassName(classOf[ConstantSizeRegionSplitPolicy].getName) val splitKeys = new mutable.MutableList[Array[Byte]] for (i <- 0 to regionCount) { val regionSplitStr = StringUtils.leftPad((i*(numOfSalts/regionCount)).toString, 4, "0") splitKeys += Bytes.toBytes(regionSplitStr) } admin.createTable(tableDescriptor, splitKeys.toArray) } }
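The table above is pre-split on zero-padded salt strings; writers then have to prefix each natural key with a matching salt. A hypothetical helper for that step (the project's real scheme lives in TaxiTripHBaseHelper, so the hash choice here is an assumption):

import org.apache.commons.lang.StringUtils
import org.apache.hadoop.hbase.util.Bytes

// Hypothetical: derive the salt from a hash of the natural key, zero-padded to 4 chars
// so it sorts the same way as the split keys created above.
def saltedRowKey(naturalKey: String, numOfSalts: Int): Array[Byte] = {
  val salt = StringUtils.leftPad(((naturalKey.hashCode & Int.MaxValue) % numOfSalts).toString, 4, "0")
  Bytes.add(Bytes.toBytes(salt), Bytes.toBytes(naturalKey))
}

println(Bytes.toString(saltedRowKey("vendor-42", 10000)))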
Example 54
Source File: BytesUtilV1.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.counter.core.v1 import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.counter.core.TimedQualifier.IntervalUnit import org.apache.s2graph.counter.core.{TimedQualifier, ExactQualifier, ExactKeyTrait, BytesUtil} import org.apache.s2graph.counter.models.Counter.ItemType import org.apache.s2graph.counter.util.Hashes import scala.collection.mutable.ArrayBuffer object BytesUtilV1 extends BytesUtil { // ExactKey: [hash(2b)][policy(4b)][item(variable)] val BUCKET_BYTE_SIZE = Bytes.SIZEOF_SHORT val POLICY_ID_SIZE = Bytes.SIZEOF_INT val INTERVAL_SIZE = Bytes.SIZEOF_BYTE val TIMESTAMP_SIZE = Bytes.SIZEOF_LONG val TIMED_QUALIFIER_SIZE = INTERVAL_SIZE + TIMESTAMP_SIZE override def getRowKeyPrefix(id: Int): Array[Byte] = { Bytes.toBytes(id) } override def toBytes(key: ExactKeyTrait): Array[Byte] = { val buff = new ArrayBuffer[Byte] // hash key (2 bytes) buff ++= Bytes.toBytes(Hashes.murmur3(key.itemKey)).take(BUCKET_BYTE_SIZE) buff ++= getRowKeyPrefix(key.policyId) buff ++= { key.itemType match { case ItemType.INT => Bytes.toBytes(key.itemKey.toInt) case ItemType.LONG => Bytes.toBytes(key.itemKey.toLong) case ItemType.STRING | ItemType.BLOB => Bytes.toBytes(key.itemKey) } } buff.toArray } override def toBytes(eq: ExactQualifier): Array[Byte] = { toBytes(eq.tq) ++ eq.dimension.getBytes } override def toBytes(tq: TimedQualifier): Array[Byte] = { Bytes.toBytes(tq.q.toString) ++ Bytes.toBytes(tq.ts) } override def toExactQualifier(bytes: Array[Byte]): ExactQualifier = { // qualifier layout: interval, ts, dimension, in that order val tq = toTimedQualifier(bytes) val dimension = Bytes.toString(bytes, TIMED_QUALIFIER_SIZE, bytes.length - TIMED_QUALIFIER_SIZE) ExactQualifier(tq, dimension) } override def toTimedQualifier(bytes: Array[Byte]): TimedQualifier = { val interval = Bytes.toString(bytes, 0, INTERVAL_SIZE) val ts = Bytes.toLong(bytes, INTERVAL_SIZE) TimedQualifier(IntervalUnit.withName(interval), ts) } }
Example 55
Source File: Hashes.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.counter.util import org.apache.hadoop.hbase.util.Bytes import scala.util.hashing.MurmurHash3 object Hashes { def sha1(s: String): String = { val md = java.security.MessageDigest.getInstance("SHA-1") Bytes.toHex(md.digest(s.getBytes("UTF-8"))) } private def positiveHash(h: Int): Int = { if (h < 0) -1 * (h + 1) else h } def murmur3(s: String): Int = { val hash = MurmurHash3.stringHash(s) positiveHash(hash) } }
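Both helpers are pure functions over the input string, so they are convenient for deriving stable bucket prefixes; a short usage sketch (the bucket count is illustrative):

import org.apache.s2graph.counter.util.Hashes

println(Hashes.sha1("item-123"))         // 40-character hex digest (via Bytes.toHex)
println(Hashes.murmur3("item-123"))      // non-negative Int
println(Hashes.murmur3("item-123") % 64) // e.g. one of 64 buckets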
Example 56
Source File: DistributedScanner.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.counter.helper import java.util import java.util.Comparator import com.google.common.primitives.SignedBytes import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.util.Bytes object DistributedScanner { val BUCKET_BYTE_SIZE = Bytes.SIZEOF_BYTE def getRealRowKey(result: Result): Array[Byte] = { result.getRow.drop(BUCKET_BYTE_SIZE) } } class DistributedScanner(table: Table, scan: Scan) extends AbstractClientScanner { import DistributedScanner._ private val BYTE_MAX = BigInt(256) private[helper] val scanners = { for { i <- 0 until BYTE_MAX.pow(BUCKET_BYTE_SIZE).toInt } yield { val bucketBytes: Array[Byte] = Bytes.toBytes(i).takeRight(BUCKET_BYTE_SIZE) val newScan = new Scan(scan).setStartRow(bucketBytes ++ scan.getStartRow).setStopRow(bucketBytes ++ scan.getStopRow) table.getScanner(newScan) } } val resultCache = new util.TreeMap[Result, java.util.Iterator[Result]](new Comparator[Result] { val comparator = SignedBytes.lexicographicalComparator() override def compare(o1: Result, o2: Result): Int = { comparator.compare(getRealRowKey(o1), getRealRowKey(o2)) } }) lazy val initialized = { val iterators = scanners.map(_.iterator()).filter(_.hasNext) iterators.foreach { it => resultCache.put(it.next(), it) } iterators.nonEmpty } override def next(): Result = { if (initialized) { Option(resultCache.pollFirstEntry()).map { entry => val it = entry.getValue if (it.hasNext) { // fill cache resultCache.put(it.next(), it) } entry.getKey }.orNull } else { null } } override def close(): Unit = { for { scanner <- scanners } { scanner.close() } } override def renewLease(): Boolean = true }
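A sketch of the fan-out the scanner performs: one sub-scan per possible one-byte bucket prefix, with the prefix prepended to the caller's start/stop rows and stripped again by getRealRowKey:

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.counter.helper.DistributedScanner

val bucketPrefixes: Seq[Array[Byte]] =
  (0 until 256).map(i => Bytes.toBytes(i).takeRight(DistributedScanner.BUCKET_BYTE_SIZE))

assert(bucketPrefixes.size == 256)
assert(bucketPrefixes.forall(_.length == 1))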
Example 57
Source File: BulkLoadPartitioner.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.loader.spark import java.util import java.util.Comparator import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner class BulkLoadPartitioner(startKeys:Array[Array[Byte]]) extends Partitioner { override def numPartitions: Int = startKeys.length override def getPartition(key: Any): Int = { val rowKey:Array[Byte] = key match { case qualifier: KeyFamilyQualifier => qualifier.rowKey case _ => key.asInstanceOf[Array[Byte]] } val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] { override def compare(o1: Array[Byte], o2: Array[Byte]): Int = { Bytes.compareTo(o1, o2) } } val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator) if (partition < 0) partition * -1 + -2 else partition } }
Example 58
Source File: KeyFamilyQualifier.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.loader.spark import java.io.Serializable import org.apache.hadoop.hbase.util.Bytes class KeyFamilyQualifier(val rowKey:Array[Byte], val family:Array[Byte], val qualifier:Array[Byte]) extends Comparable[KeyFamilyQualifier] with Serializable { override def compareTo(o: KeyFamilyQualifier): Int = { var result = Bytes.compareTo(rowKey, o.rowKey) if (result == 0) { result = Bytes.compareTo(family, o.family) if (result == 0) result = Bytes.compareTo(qualifier, o.qualifier) } result } override def toString: String = { Bytes.toString(rowKey) + ":" + Bytes.toString(family) + ":" + Bytes.toString(qualifier) } }
Example 59
Source File: StorageSerializable.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.serde import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core.schema.{ColumnMeta, LabelMeta} import org.apache.s2graph.core.storage.SKeyValue import org.apache.s2graph.core.types.{InnerValLike, InnerValLikeWithTs} object StorageSerializable { def propsToBytes(props: Seq[(LabelMeta, InnerValLike)]): Array[Byte] = { val len = props.length assert(len < Byte.MaxValue) var bytes = Array.fill(1)(len.toByte) for ((_, v) <- props) bytes = Bytes.add(bytes, v.bytes) bytes } def vertexPropsToBytes(props: Seq[(ColumnMeta, Array[Byte])]): Array[Byte] = { val len = props.length assert(len < Byte.MaxValue) var bytes = Array.fill(1)(len.toByte) for ((k, v) <- props) bytes = Bytes.add(bytes, Bytes.toBytes(k.seq.toInt), v) bytes } def propsToKeyValues(props: Seq[(LabelMeta, InnerValLike)]): Array[Byte] = { val len = props.length assert(len < Byte.MaxValue) var bytes = Array.fill(1)(len.toByte) for ((k, v) <- props) bytes = Bytes.add(bytes, Array.fill(1)(k.seq), v.bytes) bytes } def propsToKeyValuesWithTs(props: Seq[(LabelMeta, InnerValLikeWithTs)]): Array[Byte] = { val len = props.length assert(len < Byte.MaxValue) var bytes = Array.fill(1)(len.toByte) for ((k, v) <- props) bytes = Bytes.add(bytes, Array.fill(1)(k.seq), v.bytes) bytes } def labelOrderSeqWithIsInverted(labelOrderSeq: Byte, isInverted: Boolean): Array[Byte] = { assert(labelOrderSeq < (1 << 6)) val byte = labelOrderSeq << 1 | (if (isInverted) 1 else 0) Array.fill(1)(byte.toByte) } def intToBytes(value: Int): Array[Byte] = Bytes.toBytes(value) def longToBytes(value: Long): Array[Byte] = Bytes.toBytes(value) } trait StorageSerializable[E] { val cf = Serializable.edgeCf def table: Array[Byte] def ts: Long def toRowKey: Array[Byte] def toQualifier: Array[Byte] def toValue: Array[Byte] def toKeyValues: Seq[SKeyValue] = { val row = toRowKey val qualifier = toQualifier val value = toValue val kv = SKeyValue(table, row, cf, qualifier, value, ts) // logger.debug(s"[SER]: ${kv.toLogString}}") Seq(kv) } }
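The propsToBytes helper above builds a one-byte element count followed by the concatenated value bytes, leaving it to the reader to know how to delimit each value. The same layout with plain byte arrays (the values are illustrative):

import org.apache.hadoop.hbase.util.Bytes

val values = Seq(Bytes.toBytes(7), Bytes.toBytes(9L))
var encoded: Array[Byte] = Array(values.length.toByte)  // [count: 1 byte][values...]
for (v <- values) encoded = Bytes.add(encoded, v)

assert(encoded.length == 1 + Bytes.SIZEOF_INT + Bytes.SIZEOF_LONG)
assert(Bytes.toInt(encoded, 1) == 7)
assert(Bytes.toLong(encoded, 1 + Bytes.SIZEOF_INT) == 9L)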
Example 60
Source File: VertexSerializable.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.serde.vertex.tall import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core.{S2Vertex, S2VertexLike} import org.apache.s2graph.core.storage.SKeyValue import org.apache.s2graph.core.storage.serde.Serializable import org.apache.s2graph.core.storage.serde.StorageSerializable._ import scala.collection.JavaConverters._ case class VertexSerializable(vertex: S2VertexLike, intToBytes: Int => Array[Byte] = intToBytes) extends Serializable[S2VertexLike] { override val table = vertex.hbaseTableName.getBytes override val ts = vertex.ts override val cf = Serializable.vertexCf override def toRowKey: Array[Byte] = vertex.id.bytes override def toQualifier: Array[Byte] = Array.empty[Byte] override def toValue: Array[Byte] = { val props = (vertex.props.asScala ++ vertex.defaultProps.asScala).toSeq.map { case (_, v) => v.columnMeta -> v.innerVal.bytes } vertexPropsToBytes(props) } override def toKeyValues: Seq[SKeyValue] = { // val row = toRowKey // val qualifier = toQualifier // val value = toValue // Seq( // SKeyValue(vertex.hbaseTableName.getBytes, row, cf, qualifier, value, vertex.ts) // ) (vertex.props.asScala ++ vertex.defaultProps.asScala).toSeq.map { case (_, v) => val row = Bytes.add(vertex.id.bytes, Array.fill(1)(v.columnMeta.seq)) val qualifier = Array.empty[Byte] val value = v.innerVal.bytes SKeyValue(vertex.hbaseTableName.getBytes, row, cf, qualifier, value, vertex.ts) } } }
Example 61
Source File: IndexEdgeSerializable.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.serde.indexedge.tall import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core.schema.LabelMeta import org.apache.s2graph.core.types.VertexId import org.apache.s2graph.core.{GraphUtil, IndexEdge} import org.apache.s2graph.core.storage.serde.StorageSerializable._ import org.apache.s2graph.core.storage.serde.Serializable class IndexEdgeSerializable(indexEdge: IndexEdge, longToBytes: Long => Array[Byte] = longToBytes) extends Serializable[IndexEdge] { override def ts = indexEdge.version override def table = indexEdge.label.hbaseTableName.getBytes("UTF-8") def idxPropsMap = indexEdge.orders.toMap def idxPropsBytes = propsToBytes(indexEdge.orders) override def toRowKey: Array[Byte] = { val srcIdBytes = VertexId.toSourceVertexId(indexEdge.srcVertex.id).bytes val labelWithDirBytes = indexEdge.labelWithDir.bytes val labelIndexSeqWithIsInvertedBytes = labelOrderSeqWithIsInverted(indexEdge.labelIndexSeq, isInverted = false) val row = Bytes.add(srcIdBytes, labelWithDirBytes, labelIndexSeqWithIsInvertedBytes) // logger.error(s"${row.toList}\n${srcIdBytes.toList}\n${labelWithDirBytes.toList}\n${labelIndexSeqWithIsInvertedBytes.toList}") if (indexEdge.degreeEdge) row else { val qualifier = idxPropsMap.get(LabelMeta.to) match { case None => Bytes.add(idxPropsBytes, VertexId.toTargetVertexId(indexEdge.tgtVertex.id).bytes) case Some(vId) => idxPropsBytes } val opByte = if (indexEdge.op == GraphUtil.operations("incrementCount")) indexEdge.op else GraphUtil.defaultOpByte Bytes.add(row, qualifier, Array.fill(1)(opByte)) } } override def toQualifier: Array[Byte] = Array.empty[Byte] override def toValue: Array[Byte] = if (indexEdge.degreeEdge) longToBytes(indexEdge.property(LabelMeta.degree).innerVal.toString().toLong) else if (indexEdge.op == GraphUtil.operations("incrementCount")) longToBytes(indexEdge.property(LabelMeta.count).innerVal.toString().toLong) else propsToKeyValues(indexEdge.metas.toSeq) }
Example 62
Source File: BytesUtilsSuite.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.spark.Logging import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.types._ import org.apache.spark.sql.hbase.types.HBaseBytesType import org.apache.spark.sql.hbase.util.BytesUtils import org.scalatest.{BeforeAndAfterAll, FunSuite} class BytesUtilsSuite extends FunSuite with BeforeAndAfterAll with Logging { test("Bytes Ordering Test") { val s = Seq(-257, -256, -255, -129, -128, -127, -64, -16, -4, -1, 0, 1, 4, 16, 64, 127, 128, 129, 255, 256, 257) val result = s.map(i => (i, BytesUtils.create(IntegerType).toBytes(i))) .sortWith((f, s) => HBaseBytesType.ordering.gt( f._2.asInstanceOf[HBaseBytesType.InternalType], s._2.asInstanceOf[HBaseBytesType.InternalType])) assert(result.map(a => a._1) == s.sorted.reverse) } def compare(a: Array[Byte], b: Array[Byte]): Int = { val length = Math.min(a.length, b.length) var result: Int = 0 for (i <- 0 to length - 1) { val diff: Int = (a(i) & 0xff).asInstanceOf[Byte] - (b(i) & 0xff).asInstanceOf[Byte] if (diff != 0) { result = diff } } result } test("Bytes Utility Test") { assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType) .toBytes(input = true), 0) === true) assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType) .toBytes(input = false), 0) === false) assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(12.34d), 0) === 12.34d) assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(-12.34d), 0) === -12.34d) assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(12.34f), 0) === 12.34f) assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(-12.34f), 0) === -12.34f) assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(12), 0) === 12) assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(-12), 0) === -12) assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(1234l), 0) === 1234l) assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(-1234l), 0) === -1234l) assert(BytesUtils.toShort(BytesUtils.create(ShortType) .toBytes(12.asInstanceOf[Short]), 0) === 12) assert(BytesUtils.toShort(BytesUtils.create(ShortType) .toBytes(-12.asInstanceOf[Short]), 0) === -12) assert(BytesUtils.toUTF8String(BytesUtils.create(StringType).toBytes("abc"), 0, 3) === UTF8String("abc")) assert(BytesUtils.toUTF8String(BytesUtils.create(StringType).toBytes(""), 0, 0) === UTF8String("")) assert(BytesUtils.toByte(BytesUtils.create(ByteType) .toBytes(5.asInstanceOf[Byte]), 0) === 5) assert(BytesUtils.toByte(BytesUtils.create(ByteType) .toBytes(-5.asInstanceOf[Byte]), 0) === -5) assert(compare(BytesUtils.create(IntegerType).toBytes(128), BytesUtils.create(IntegerType).toBytes(-128)) > 0) } test("byte array plus one") { var byteArray = Array[Byte](0x01.toByte, 127.toByte) assert(Bytes.compareTo(BytesUtils.addOne(byteArray), Array[Byte](0x01.toByte, 0x80.toByte)) == 0) byteArray = Array[Byte](0xff.toByte, 0xff.toByte) assert(BytesUtils.addOne(byteArray) == null) byteArray = Array[Byte](0x02.toByte, 0xff.toByte) assert(Bytes.compareTo(BytesUtils.addOne(byteArray), Array[Byte](0x03.toByte, 0x00.toByte)) == 0) } test("float comparison") { val f1 = BytesUtils.create(FloatType).toBytes(-1.23f) val f2 = BytesUtils.create(FloatType).toBytes(100f) assert(Bytes.compareTo(f1, f2) < 0) } }
Example 63
Source File: IndexEdgeSerializable.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.serde.indexedge.wide import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core.schema.LabelMeta import org.apache.s2graph.core.types.VertexId import org.apache.s2graph.core.{GraphUtil, IndexEdge} import org.apache.s2graph.core.storage.serde.StorageSerializable._ import org.apache.s2graph.core.storage.serde.Serializable class IndexEdgeSerializable(indexEdge: IndexEdge, longToBytes: Long => Array[Byte] = longToBytes) extends Serializable[IndexEdge] { override def ts = indexEdge.version override def table = indexEdge.label.hbaseTableName.getBytes("UTF-8") def idxPropsMap = indexEdge.orders.toMap def idxPropsBytes = propsToBytes(indexEdge.orders) override def toRowKey: Array[Byte] = { val srcIdBytes = VertexId.toSourceVertexId(indexEdge.srcVertex.id).bytes val labelWithDirBytes = indexEdge.labelWithDir.bytes val labelIndexSeqWithIsInvertedBytes = labelOrderSeqWithIsInverted(indexEdge.labelIndexSeq, isInverted = false) Bytes.add(srcIdBytes, labelWithDirBytes, labelIndexSeqWithIsInvertedBytes) } override def toQualifier: Array[Byte] = { val tgtIdBytes = VertexId.toTargetVertexId(indexEdge.tgtVertex.id).bytes if (indexEdge.degreeEdge) Array.empty[Byte] else { if (indexEdge.op == GraphUtil.operations("incrementCount")) { Bytes.add(idxPropsBytes, tgtIdBytes, Array.fill(1)(indexEdge.op)) } else { idxPropsMap.get(LabelMeta.to) match { case None => Bytes.add(idxPropsBytes, tgtIdBytes) case Some(vId) => idxPropsBytes } } } } override def toValue: Array[Byte] = if (indexEdge.degreeEdge) longToBytes(indexEdge.property(LabelMeta.degree).innerVal.toString().toLong) else if (indexEdge.op == GraphUtil.operations("incrementCount")) longToBytes(indexEdge.property(LabelMeta.count).innerVal.toString().toLong) else propsToKeyValues(indexEdge.metas.toSeq) }
Example 64
Source File: SnapshotEdgeDeserializable.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.serde.snapshotedge.tall import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core.schema.{Label, LabelMeta, ServiceColumn} import org.apache.s2graph.core.storage.serde.StorageDeserializable._ import org.apache.s2graph.core.storage.CanSKeyValue import org.apache.s2graph.core.types._ import org.apache.s2graph.core._ import org.apache.s2graph.core.storage.serde.Deserializable import org.apache.s2graph.core.utils.logger class SnapshotEdgeDeserializable(graph: S2GraphLike) extends Deserializable[SnapshotEdge] { val builder = graph.elementBuilder def statusCodeWithOp(byte: Byte): (Byte, Byte) = { val statusCode = byte >> 4 val op = byte & ((1 << 4) - 1) (statusCode.toByte, op.toByte) } override def fromKeyValues[T: CanSKeyValue](_kvs: Seq[T], cacheElementOpt: Option[SnapshotEdge]): Option[SnapshotEdge] = { try { val kvs = _kvs.map { kv => implicitly[CanSKeyValue[T]].toSKeyValue(kv) } assert(kvs.size == 1) val kv = kvs.head val version = kv.timestamp var pos = 0 val (srcVertexId, srcIdLen) = SourceVertexId.fromBytes(kv.row, pos, kv.row.length, HBaseType.DEFAULT_VERSION) pos += srcIdLen val isTallSchema = pos + 5 != kv.row.length var tgtVertexId = TargetVertexId(ServiceColumn.Default, srcVertexId.innerId) if (isTallSchema) { val (tgtId, tgtBytesLen) = InnerVal.fromBytes(kv.row, pos, kv.row.length, HBaseType.DEFAULT_VERSION) tgtVertexId = TargetVertexId(ServiceColumn.Default, tgtId) pos += tgtBytesLen } val labelWithDir = LabelWithDirection(Bytes.toInt(kv.row, pos, 4)) pos += 4 val (labelIdxSeq, isInverted) = bytesToLabelIndexSeqWithIsInverted(kv.row, pos) pos += 1 if (!isInverted) None else { val label = Label.findById(labelWithDir.labelId) val schemaVer = label.schemaVersion // val srcVertexId = SourceVertexId(ServiceColumn.Default, srcIdAndTgtId.srcInnerId) // val tgtVertexId = SourceVertexId(ServiceColumn.Default, tgtId.tgtInnerId) var pos = 0 val (statusCode, op) = statusCodeWithOp(kv.value(pos)) pos += 1 val (props, endAt) = bytesToKeyValuesWithTs(kv.value, pos, schemaVer, label) val kvsMap = props.toMap val tsInnerVal = kvsMap(LabelMeta.timestamp).innerVal val ts = tsInnerVal.toString.toLong pos = endAt val _pendingEdgeOpt = if (pos == kv.value.length) None else { val (pendingEdgeStatusCode, pendingEdgeOp) = statusCodeWithOp(kv.value(pos)) pos += 1 // val versionNum = Bytes.toLong(kv.value, pos, 8) // pos += 8 val (pendingEdgeProps, endAt) = bytesToKeyValuesWithTs(kv.value, pos, schemaVer, label) pos = endAt val lockTs = Option(Bytes.toLong(kv.value, pos, 8)) val pendingEdge = builder.newEdge( builder.newVertex(srcVertexId, version), builder.newVertex(tgtVertexId, version), label, labelWithDir.dir, pendingEdgeOp, version, pendingEdgeProps.toMap, statusCode = pendingEdgeStatusCode, lockTs = lockTs, tsInnerValOpt = Option(tsInnerVal)) Option(pendingEdge) } val snapshotEdge = builder.newSnapshotEdge( builder.newVertex(srcVertexId, ts), builder.newVertex(tgtVertexId, ts), label, labelWithDir.dir, op, version, props.toMap, statusCode = statusCode, pendingEdgeOpt = _pendingEdgeOpt, lockTs = None, tsInnerValOpt = Option(tsInnerVal)) Option(snapshotEdge) } } catch { case e: Exception => logger.error("#" * 100, e) None } } }
Example 65
Source File: SnapshotEdgeSerializable.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.serde.snapshotedge.tall import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core.{S2Edge, SnapshotEdge} import org.apache.s2graph.core.schema.LabelIndex import org.apache.s2graph.core.storage.serde._ import org.apache.s2graph.core.storage.serde.StorageSerializable._ import org.apache.s2graph.core.types.SourceAndTargetVertexIdPair class SnapshotEdgeSerializable(snapshotEdge: SnapshotEdge) extends Serializable[SnapshotEdge] { override def ts = snapshotEdge.version override def table = snapshotEdge.label.hbaseTableName.getBytes("UTF-8") def statusCodeWithOp(statusCode: Byte, op: Byte): Array[Byte] = { val byte = (((statusCode << 4) | op).toByte) Array.fill(1)(byte.toByte) } def valueBytes() = Bytes.add(statusCodeWithOp(snapshotEdge.statusCode, snapshotEdge.op), snapshotEdge.propsToKeyValuesWithTs) override def toRowKey: Array[Byte] = { val srcIdAndTgtIdBytes = SourceAndTargetVertexIdPair(snapshotEdge.srcVertex.innerId, snapshotEdge.tgtVertex.innerId).bytes val labelWithDirBytes = snapshotEdge.labelWithDir.bytes val labelIndexSeqWithIsInvertedBytes = labelOrderSeqWithIsInverted(LabelIndex.DefaultSeq, isInverted = true) Bytes.add(srcIdAndTgtIdBytes, labelWithDirBytes, labelIndexSeqWithIsInvertedBytes) } override def toQualifier: Array[Byte] = Array.empty[Byte] override def toValue: Array[Byte] = snapshotEdge.pendingEdgeOpt match { case None => valueBytes() case Some(pendingEdge) => val opBytes = statusCodeWithOp(pendingEdge.getStatusCode(), pendingEdge.getOp()) val versionBytes = Array.empty[Byte] val propsBytes = S2Edge.serializePropsWithTs(pendingEdge) val lockBytes = Bytes.toBytes(pendingEdge.getLockTs().get) Bytes.add(Bytes.add(valueBytes(), opBytes, versionBytes), Bytes.add(propsBytes, lockBytes)) } }
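The serializer above and the matching deserializer pack a status code and an operation code into a single byte: the high nibble carries the status, the low nibble the op. A stand-alone round-trip sketch; masking with 0x0F on unpack is a defensive addition, since the listings' plain signed shift only stays non-negative while status codes remain below 8.

object StatusOpByteSketch {
  // Pack status into the high nibble and op into the low nibble of one byte.
  def pack(statusCode: Byte, op: Byte): Byte =
    (((statusCode & 0x0F) << 4) | (op & 0x0F)).toByte

  // Reverse the packing; the masks keep both values in 0..15 even for negative bytes.
  def unpack(b: Byte): (Byte, Byte) =
    (((b >> 4) & 0x0F).toByte, (b & 0x0F).toByte)

  def main(args: Array[String]): Unit = {
    for (status <- 0 to 15; op <- 0 to 15) {
      val (s, o) = unpack(pack(status.toByte, op.toByte))
      assert(s == status && o == op)
    }
    println("pack/unpack round-trips")
  }
}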
Example 66
Source File: SnapshotEdgeDeserializable.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.serde.snapshotedge.wide import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core.schema.{Label, LabelMeta} import org.apache.s2graph.core.storage.serde.StorageDeserializable._ import org.apache.s2graph.core.storage.CanSKeyValue import org.apache.s2graph.core.types.{HBaseType, LabelWithDirection, SourceVertexId, TargetVertexId} import org.apache.s2graph.core._ import org.apache.s2graph.core.storage.serde.Deserializable class SnapshotEdgeDeserializable(graph: S2GraphLike) extends Deserializable[SnapshotEdge] { val builder = graph.elementBuilder def statusCodeWithOp(byte: Byte): (Byte, Byte) = { val statusCode = byte >> 4 val op = byte & ((1 << 4) - 1) (statusCode.toByte, op.toByte) } override def fromKeyValues[T: CanSKeyValue](_kvs: Seq[T], cacheElementOpt: Option[SnapshotEdge]): Option[SnapshotEdge] = { try { val kvs = _kvs.map { kv => implicitly[CanSKeyValue[T]].toSKeyValue(kv) } assert(kvs.size == 1) val kv = kvs.head val version = kv.timestamp var pos = 0 val (srcVertexId, srcIdLen) = SourceVertexId.fromBytes(kv.row, pos, kv.row.length, HBaseType.DEFAULT_VERSION) pos += srcIdLen val labelWithDir = LabelWithDirection(Bytes.toInt(kv.row, pos, 4)) pos += 4 val (labelIdxSeq, isInverted) = bytesToLabelIndexSeqWithIsInverted(kv.row, pos) pos += 1 if (!isInverted) None else { val label = Label.findById(labelWithDir.labelId) val schemaVer = label.schemaVersion val srcVertex = builder.newVertex(srcVertexId, version) val (tgtVertexId, _) = TargetVertexId.fromBytes(kv.qualifier, 0, kv.qualifier.length, schemaVer) var pos = 0 val (statusCode, op) = statusCodeWithOp(kv.value(pos)) pos += 1 val (props, endAt) = bytesToKeyValuesWithTs(kv.value, pos, schemaVer, label) val kvsMap = props.toMap val tsInnerVal = kvsMap(LabelMeta.timestamp).innerVal val ts = tsInnerVal.toString.toLong pos = endAt val _pendingEdgeOpt = if (pos == kv.value.length) None else { val (pendingEdgeStatusCode, pendingEdgeOp) = statusCodeWithOp(kv.value(pos)) pos += 1 // val versionNum = Bytes.toLong(kv.value, pos, 8) // pos += 8 val (pendingEdgeProps, endAt) = bytesToKeyValuesWithTs(kv.value, pos, schemaVer, label) pos = endAt val lockTs = Option(Bytes.toLong(kv.value, pos, 8)) val pendingEdge = builder.newEdge( builder.newVertex(srcVertexId, version), builder.newVertex(tgtVertexId, version), label, labelWithDir.dir, pendingEdgeOp, version, pendingEdgeProps.toMap, statusCode = pendingEdgeStatusCode, lockTs = lockTs, tsInnerValOpt = Option(tsInnerVal)) Option(pendingEdge) } val snapshotEdge = builder.newSnapshotEdge( builder.newVertex(srcVertexId, ts), builder.newVertex(tgtVertexId, ts), label, labelWithDir.dir, op, version, props.toMap, statusCode = statusCode, pendingEdgeOpt = _pendingEdgeOpt, lockTs = None, tsInnerValOpt = Option(tsInnerVal)) Option(snapshotEdge) } } catch { case e: Exception => None } } }
Example 67
Source File: SnapshotEdgeSerializable.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.serde.snapshotedge.wide import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core.{S2Edge, SnapshotEdge} import org.apache.s2graph.core.schema.LabelIndex import org.apache.s2graph.core.storage.serde.Serializable import org.apache.s2graph.core.storage.serde.StorageSerializable._ import org.apache.s2graph.core.types.VertexId class SnapshotEdgeSerializable(snapshotEdge: SnapshotEdge) extends Serializable[SnapshotEdge] { override def ts = snapshotEdge.version override def table = snapshotEdge.label.hbaseTableName.getBytes("UTF-8") def statusCodeWithOp(statusCode: Byte, op: Byte): Array[Byte] = { val byte = (((statusCode << 4) | op).toByte) Array.fill(1)(byte.toByte) } def valueBytes() = Bytes.add(statusCodeWithOp(snapshotEdge.statusCode, snapshotEdge.op), snapshotEdge.propsToKeyValuesWithTs) override def toRowKey: Array[Byte] = { val srcIdBytes = VertexId.toSourceVertexId(snapshotEdge.srcVertex.id).bytes val labelWithDirBytes = snapshotEdge.labelWithDir.bytes val labelIndexSeqWithIsInvertedBytes = labelOrderSeqWithIsInverted(LabelIndex.DefaultSeq, isInverted = true) Bytes.add(srcIdBytes, labelWithDirBytes, labelIndexSeqWithIsInvertedBytes) } override def toQualifier: Array[Byte] = VertexId.toTargetVertexId(snapshotEdge.tgtVertex.id).bytes override def toValue: Array[Byte] = snapshotEdge.pendingEdgeOpt match { case None => valueBytes() case Some(pendingEdge) => val opBytes = statusCodeWithOp(pendingEdge.getStatusCode(), pendingEdge.getOp()) val versionBytes = Array.empty[Byte] val propsBytes = S2Edge.serializePropsWithTs(pendingEdge) val lockBytes = Bytes.toBytes(pendingEdge.getLockTs().get) Bytes.add(Bytes.add(valueBytes(), opBytes, versionBytes), Bytes.add(propsBytes, lockBytes)) } }
Example 68
Source File: SKeyValue.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage import java.nio.charset.StandardCharsets import org.apache.hadoop.hbase.util.Bytes import org.hbase.async.KeyValue object SKeyValue { // val SnapshotEdgeCf = "s".getBytes(StandardCharsets.UTF_8) val EdgeCf = "e".getBytes(StandardCharsets.UTF_8) val VertexCf = "v".getBytes(StandardCharsets.UTF_8) val Put = 1 val Delete = 2 val Increment = 3 val Default = Put } case class SKeyValue(table: Array[Byte], row: Array[Byte], cf: Array[Byte], qualifier: Array[Byte], value: Array[Byte], timestamp: Long, operation: Int = SKeyValue.Default, durability: Boolean = true) { def toLogString = { Map("table" -> Bytes.toString(table), "row" -> row.toList, "cf" -> Bytes.toString(cf), "qualifier" -> qualifier.toList, "value" -> value.toList, "timestamp" -> timestamp, "operation" -> operation, "durability" -> durability).toString() } override def toString(): String = toLogString def toKeyValue: KeyValue = new KeyValue(row, cf, qualifier, timestamp, value) } trait CanSKeyValue[T] { def toSKeyValue(from: T): SKeyValue } object CanSKeyValue { def instance[T](f: T => SKeyValue): CanSKeyValue[T] = new CanSKeyValue[T] { override def toSKeyValue(from: T): SKeyValue = f.apply(from) } // For asyncbase KeyValues implicit val asyncKeyValue = instance[KeyValue] { kv => SKeyValue(Array.empty[Byte], kv.key(), kv.family(), kv.qualifier(), kv.value(), kv.timestamp()) } implicit val hbaseKeyValue = instance[org.apache.hadoop.hbase.KeyValue] { kv => SKeyValue(Array.empty[Byte], kv.getRow, kv.getFamily, kv.getQualifier, kv.getValue, kv.getTimestamp) } // For asyncbase KeyValues implicit val sKeyValue = instance[SKeyValue](identity) // For hbase KeyValues }
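CanSKeyValue is a small typeclass: each storage backend supplies an implicit conversion from its native key-value type into SKeyValue, and generic code pulls the instance in through a context bound, exactly as the deserializers do with implicitly[CanSKeyValue[T]].toSKeyValue(kv). A hedged sketch of that call pattern; it assumes the s2graph classes above are on the classpath, and normalize is a made-up helper name.

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.storage.{CanSKeyValue, SKeyValue}

object CanSKeyValueSketch {
  // Works for any T with a CanSKeyValue instance in implicit scope:
  // org.hbase.async.KeyValue, org.apache.hadoop.hbase.KeyValue, or SKeyValue itself.
  def normalize[T: CanSKeyValue](kvs: Seq[T]): Seq[SKeyValue] =
    kvs.map(kv => implicitly[CanSKeyValue[T]].toSKeyValue(kv))

  def main(args: Array[String]): Unit = {
    val kv = SKeyValue(Array.empty[Byte], Bytes.toBytes("row1"), SKeyValue.VertexCf,
      Bytes.toBytes("q"), Bytes.toBytes(1L), System.currentTimeMillis())
    // The identity instance for SKeyValue makes this a no-op conversion.
    normalize(Seq(kv)).foreach(v => println(v.toLogString))
  }
}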
Example 69
Source File: AsynchbaseEdgeFetcher.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.hbase import java.util import com.stumbleupon.async.Deferred import com.typesafe.config.Config import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core._ import org.apache.s2graph.core.schema.Label import org.apache.s2graph.core.storage.serde.Serializable import org.apache.s2graph.core.storage.{CanSKeyValue, StorageIO, StorageSerDe} import org.apache.s2graph.core.types.{HBaseType, VertexId} import org.apache.s2graph.core.utils.{CanDefer, DeferCache, Extensions, logger} import org.hbase.async._ import scala.concurrent.{ExecutionContext, Future} class AsynchbaseEdgeFetcher(val graph: S2GraphLike, val config: Config, val client: HBaseClient, val serDe: StorageSerDe, val io: StorageIO) extends EdgeFetcher { import AsynchbaseStorage._ import CanDefer._ import Extensions.DeferOps import scala.collection.JavaConverters._ val edge = graph.elementBuilder.toRequestEdge(queryRequest, parentEdges) val request = buildRequest(client, serDe, queryRequest, edge) val (intervalMaxBytes, intervalMinBytes) = queryParam.buildInterval(Option(edge)) val requestCacheKey = Bytes.add(toCacheKeyBytes(request), intervalMaxBytes, intervalMinBytes) if (cacheTTL <= 0) fetchInner(request) else { val cacheKeyBytes = Bytes.add(queryRequest.query.queryOption.cacheKeyBytes, requestCacheKey) // val cacheKeyBytes = toCacheKeyBytes(request) val cacheKey = queryParam.toCacheKey(cacheKeyBytes) futureCache.getOrElseUpdate(cacheKey, cacheTTL)(fetchInner(request)) } } }
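The listing above elides the enclosing method declaration, which is why the val statements appear to sit at class scope; what they actually illustrate is a cache-keyed fetch path in which Bytes.add splices the query's cache-key bytes together with per-request interval bytes. A self-contained sketch of that composite-key idea; the cache and key names here are illustrative and are not the s2graph DeferCache API.

import org.apache.hadoop.hbase.util.Bytes
import scala.collection.mutable

object CompositeCacheKeySketch {
  // Byte arrays use reference equality, so wrap them before using them as map keys.
  final case class CacheKey(bytes: Array[Byte]) {
    override def hashCode(): Int = Bytes.hashCode(bytes)
    override def equals(o: Any): Boolean = o match {
      case CacheKey(other) => Bytes.equals(bytes, other)
      case _ => false
    }
  }

  def main(args: Array[String]): Unit = {
    val queryKeyBytes    = Bytes.toBytes("query-option-hash")   // stand-in for queryOption.cacheKeyBytes
    val requestKeyBytes  = Bytes.toBytes("serialized-request")  // stand-in for toCacheKeyBytes(request)
    val intervalMaxBytes = Bytes.toBytes(100L)
    val intervalMinBytes = Bytes.toBytes(0L)

    val requestCacheKey = Bytes.add(requestKeyBytes, intervalMaxBytes, intervalMinBytes)
    val cacheKey = CacheKey(Bytes.add(queryKeyBytes, requestCacheKey))

    val cache = mutable.Map.empty[CacheKey, String]
    cache.getOrElseUpdate(cacheKey, { println("cache miss, fetching"); "result" })
    cache.getOrElseUpdate(cacheKey, { println("should not fetch again"); "result" })
    println(cache.size)  // 1
  }
}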
Example 70
Source File: RocksVertexFetcher.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.rocks import com.typesafe.config.Config import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core._ import org.apache.s2graph.core.schema.ServiceColumn import org.apache.s2graph.core.storage.rocks.RocksStorage.{qualifier, table} import org.apache.s2graph.core.storage.{SKeyValue, StorageIO, StorageSerDe} import org.apache.s2graph.core.types.HBaseType import org.rocksdb.RocksDB import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} class RocksVertexFetcher(val graph: S2GraphLike, val config: Config, val db: RocksDB, val vdb: RocksDB, val serDe: StorageSerDe, val io: StorageIO) extends VertexFetcher { private def fetchKeyValues(queryRequest: QueryRequest, vertex: S2VertexLike)(implicit ec: ExecutionContext): Future[Seq[SKeyValue]] = { val rpc = RocksStorage.buildRequest(queryRequest, vertex) RocksStorage.fetchKeyValues(vdb, db, rpc) } override def fetchVertices(vertexQueryParam: VertexQueryParam)(implicit ec: ExecutionContext): Future[Seq[S2VertexLike]] = { def fromResult(kvs: Seq[SKeyValue], version: String): Seq[S2VertexLike] = { if (kvs.isEmpty) Nil else serDe.vertexDeserializer(version).fromKeyValues(kvs, None).toSeq.filter(vertexQueryParam.where.get.filter) } val vertices = vertexQueryParam.vertexIds.map(vId => graph.elementBuilder.newVertex(vId)) val futures = vertices.map { vertex => val queryParam = QueryParam.Empty val q = Query.toQuery(Seq(vertex), Seq(queryParam)) val queryRequest = QueryRequest(q, stepIdx = -1, vertex, queryParam) fetchKeyValues(queryRequest, vertex).map { kvs => fromResult(kvs, vertex.serviceColumn.schemaVersion) } recoverWith { case ex: Throwable => Future.successful(Nil) } } Future.sequence(futures).map(_.flatten) } override def fetchVerticesAll()(implicit ec: ExecutionContext) = { import scala.collection.mutable val vertices = new ArrayBuffer[S2VertexLike]() ServiceColumn.findAll().groupBy(_.service.hTableName).toSeq.foreach { case (hTableName, columns) => val distinctColumns = columns.toSet val iter = vdb.newIterator() val buffer = mutable.ListBuffer.empty[SKeyValue] var oldVertexIdBytes = Array.empty[Byte] var minusPos = 0 try { iter.seekToFirst() while (iter.isValid) { val row = iter.key() if (!Bytes.equals(oldVertexIdBytes, 0, oldVertexIdBytes.length - minusPos, row, 0, row.length - 1)) { if (buffer.nonEmpty) serDe.vertexDeserializer(schemaVer = HBaseType.DEFAULT_VERSION).fromKeyValues(buffer, None) .filter(v => distinctColumns(v.serviceColumn)) .foreach { vertex => vertices += vertex } oldVertexIdBytes = row minusPos = 1 buffer.clear() } val kv = SKeyValue(table, iter.key(), SKeyValue.VertexCf, qualifier, iter.value(), System.currentTimeMillis()) buffer += kv iter.next() } if (buffer.nonEmpty) serDe.vertexDeserializer(schemaVer = HBaseType.DEFAULT_VERSION).fromKeyValues(buffer, None) .filter(v => distinctColumns(v.serviceColumn)) .foreach { vertex => vertices += vertex } } finally { iter.close() } } Future.successful(vertices) } }
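fetchVerticesAll above groups consecutive RocksDB rows into one vertex by comparing the previous row's id prefix against the current row with the offset/length overload of Bytes.equals. A tiny sketch of that prefix comparison; the key layout below is a stand-in, as real s2graph vertex keys are encoded differently.

import org.apache.hadoop.hbase.util.Bytes

object PrefixGroupingSketch {
  // True when `row` starts with the same bytes as `prev`, up to prev.length - dropFromPrev.
  def samePrefix(prev: Array[Byte], row: Array[Byte], dropFromPrev: Int): Boolean =
    prev.nonEmpty &&
      Bytes.equals(prev, 0, prev.length - dropFromPrev, row, 0, prev.length - dropFromPrev)

  def main(args: Array[String]): Unit = {
    val v1q1 = Bytes.add(Bytes.toBytes("vertex-1"), Bytes.toBytes("p1"))
    val v1q2 = Bytes.add(Bytes.toBytes("vertex-1"), Bytes.toBytes("p2"))
    val v2q1 = Bytes.add(Bytes.toBytes("vertex-2"), Bytes.toBytes("p1"))

    println(samePrefix(v1q1, v1q2, dropFromPrev = 2))  // true: same vertex id, keep buffering
    println(samePrefix(v1q2, v2q1, dropFromPrev = 2))  // false: new vertex, flush the buffer
  }
}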
Example 71
Source File: LabelWithDirection.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.types import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core.GraphUtil object LabelWithDirection { import HBaseType._ def apply(compositeInt: Int): LabelWithDirection = { // logger.debug(s"CompositeInt: $compositeInt") val dir = compositeInt & ((1 << bitsForDir) - 1) val labelId = compositeInt >> bitsForDir LabelWithDirection(labelId, dir) } def labelOrderSeqWithIsInverted(labelOrderSeq: Byte, isInverted: Boolean): Array[Byte] = { assert(labelOrderSeq < (1 << 6)) val byte = labelOrderSeq << 1 | (if (isInverted) 1 else 0) Array.fill(1)(byte.toByte) } def bytesToLabelIndexSeqWithIsInverted(bytes: Array[Byte], offset: Int): (Byte, Boolean) = { val byte = bytes(offset) val isInverted = if ((byte & 1) != 0) true else false val labelOrderSeq = byte >> 1 (labelOrderSeq.toByte, isInverted) } } case class LabelWithDirection(labelId: Int, dir: Int) extends HBaseSerializable { import HBaseType._ assert(dir < (1 << bitsForDir)) assert(labelId < (Int.MaxValue >> bitsForDir)) lazy val labelBits = labelId << bitsForDir lazy val compositeInt = labelBits | dir def bytes = { Bytes.toBytes(compositeInt) } lazy val dirToggled = LabelWithDirection(labelId, GraphUtil.toggleDir(dir)) def updateDir(newDir: Int) = LabelWithDirection(labelId, newDir) def isDirected = dir == 0 || dir == 1 override def hashCode(): Int = compositeInt override def equals(other: Any): Boolean = { other match { case o: LabelWithDirection => hashCode == o.hashCode() case _ => false } } }
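LabelWithDirection packs a label id and a direction into one int (direction in the low bits, label id shifted above it) and serializes it with Bytes.toBytes(Int); the companion apply(compositeInt) reverses the split. A round-trip sketch that assumes 2 direction bits for bitsForDir, since the actual constant lives in HBaseType.

import org.apache.hadoop.hbase.util.Bytes

object LabelWithDirectionSketch {
  val bitsForDir = 2  // assumption: stands in for HBaseType.bitsForDir

  def pack(labelId: Int, dir: Int): Array[Byte] =
    Bytes.toBytes((labelId << bitsForDir) | dir)

  def unpack(bytes: Array[Byte]): (Int, Int) = {
    val composite = Bytes.toInt(bytes)
    val dir = composite & ((1 << bitsForDir) - 1)
    val labelId = composite >> bitsForDir
    (labelId, dir)
  }

  def main(args: Array[String]): Unit = {
    val encoded = pack(labelId = 42, dir = 1)
    println(encoded.length)   // 4 bytes, as Bytes.toBytes(Int) always produces
    println(unpack(encoded))  // (42,1)
  }
}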
Example 72
Source File: S2GraphCellReader.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.serde.reader import org.apache.hadoop.hbase.Cell import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core.storage.SKeyValue import org.apache.s2graph.core.types.HBaseType import org.apache.s2graph.core.{GraphElement, S2Graph} import org.apache.s2graph.s2jobs.serde.GraphElementReadable class S2GraphCellReader(elementType: String) extends GraphElementReadable[Seq[Cell]]{ override def read(s2: S2Graph)(cells: Seq[Cell]): Seq[GraphElement] = { val schemaVer = HBaseType.DEFAULT_VERSION val kvs = cells.map { cell => new SKeyValue(Array.empty[Byte], cell.getRow, cell.getFamily, cell.getQualifier, cell.getValue, cell.getTimestamp, SKeyValue.Default) } elementType.toLowerCase match { case "vertex" | "v" => s2.defaultStorage.serDe.vertexDeserializer(schemaVer) .fromKeyValues(kvs, None).map(_.asInstanceOf[GraphElement]).toSeq case "indexedge" | "ie" => kvs.flatMap { kv => s2.defaultStorage.serDe.indexEdgeDeserializer(schemaVer) .fromKeyValues(Seq(kv), None).map(_.asInstanceOf[GraphElement]) } case "snapshotedge" | "se" => kvs.flatMap { kv => s2.defaultStorage.serDe.snapshotEdgeDeserializer(schemaVer) .fromKeyValues(Seq(kv), None).map(_.asInstanceOf[GraphElement]) } case _ => throw new IllegalArgumentException(s"$elementType is not supported.") } } }
Example 73
Source File: BulkLoadPartitioner.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.spark import java.util import java.util.Comparator import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner class BulkLoadPartitioner(startKeys:Array[Array[Byte]]) extends Partitioner { override def numPartitions: Int = startKeys.length override def getPartition(key: Any): Int = { val rowKey:Array[Byte] = key match { case qualifier: KeyFamilyQualifier => qualifier.rowKey case _ => key.asInstanceOf[Array[Byte]] } val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] { override def compare(o1: Array[Byte], o2: Array[Byte]): Int = { Bytes.compareTo(o1, o2) } } val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator) if (partition < 0) partition * -1 + -2 else partition } }
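getPartition above leans on java.util.Arrays.binarySearch over the sorted region start keys: an exact match maps directly to its index, and a miss returns -(insertionPoint) - 1, which the partition * -1 + -2 arithmetic turns back into "the region whose start key precedes the row". A worked, self-contained sketch of that lookup.

import java.util
import java.util.Comparator
import org.apache.hadoop.hbase.util.Bytes

object StartKeyLookupSketch {
  val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] {
    override def compare(o1: Array[Byte], o2: Array[Byte]): Int = Bytes.compareTo(o1, o2)
  }

  def partitionFor(startKeys: Array[Array[Byte]], rowKey: Array[Byte]): Int = {
    val idx = util.Arrays.binarySearch(startKeys, rowKey, comparator)
    if (idx < 0) idx * -1 + -2 else idx  // -(insertionPoint) - 1  becomes  insertionPoint - 1
  }

  def main(args: Array[String]): Unit = {
    // Three regions starting at "", "g", "p" (sorted, as HBase reports them).
    val startKeys = Array(Array.empty[Byte], Bytes.toBytes("g"), Bytes.toBytes("p"))
    println(partitionFor(startKeys, Bytes.toBytes("apple")))   // 0
    println(partitionFor(startKeys, Bytes.toBytes("g")))       // 1 (exact match)
    println(partitionFor(startKeys, Bytes.toBytes("monkey")))  // 1
    println(partitionFor(startKeys, Bytes.toBytes("zebra")))   // 2
  }
}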
Example 74
Source File: HBaseBulkDeleteExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Delete import org.apache.spark.SparkConf object HBaseBulkDeleteExample { def main(args: Array[String]) { if (args.length < 1) { println("HBaseBulkDeletesExample {tableName} ") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName) val sc = new SparkContext(sparkConf) try { //[Array[Byte]] val rdd = sc.parallelize(Array( Bytes.toBytes("1"), Bytes.toBytes("2"), Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5") )) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.bulkDelete[Array[Byte]](rdd, TableName.valueOf(tableName), putRecord => new Delete(putRecord), 4) } finally { sc.stop() } } }
Example 75
Source File: ByteArrayWrapper.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import java.io.Serializable import org.apache.hadoop.hbase.util.Bytes class ByteArrayWrapper (var value:Array[Byte]) extends Comparable[ByteArrayWrapper] with Serializable { override def compareTo(valueOther: ByteArrayWrapper): Int = { Bytes.compareTo(value,valueOther.value) } override def equals(o2: Any): Boolean = { o2 match { case wrapper: ByteArrayWrapper => Bytes.equals(value, wrapper.value) case _ => false } } override def hashCode():Int = { Bytes.hashCode(value) } }
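The wrapper exists because Array[Byte] compares by reference on the JVM, so two equal-content keys hash to different buckets and never collide in a shuffle or a map; delegating to Bytes.equals and Bytes.hashCode restores value semantics. A short sketch of the difference; the wrapper below mirrors the class above rather than importing it, to stay self-contained.

import org.apache.hadoop.hbase.util.Bytes

object ByteArrayKeySketch {
  class Wrapped(val value: Array[Byte]) {
    override def equals(o: Any): Boolean = o match {
      case w: Wrapped => Bytes.equals(value, w.value)
      case _          => false
    }
    override def hashCode(): Int = Bytes.hashCode(value)
  }

  def main(args: Array[String]): Unit = {
    val a = Bytes.toBytes("row-1")
    val b = Bytes.toBytes("row-1")

    println(a == b)                                              // false: arrays compare by reference
    println(Map(a -> 1).contains(b))                             // false for the same reason
    println(new Wrapped(a) == new Wrapped(b))                    // true: content comparison
    println(Map(new Wrapped(a) -> 1).contains(new Wrapped(b)))   // true
  }
}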
Example 76
Source File: BulkLoadPartitioner.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import java.util import java.util.Comparator import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner class BulkLoadPartitioner(startKeys:Array[Array[Byte]]) extends Partitioner { override def numPartitions: Int = startKeys.length override def getPartition(key: Any): Int = { val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] { override def compare(o1: Array[Byte], o2: Array[Byte]): Int = { Bytes.compareTo(o1, o2) } } val rowKey:Array[Byte] = key match { case qualifier: KeyFamilyQualifier => qualifier.rowKey case wrapper: ByteArrayWrapper => wrapper.value case _ => key.asInstanceOf[Array[Byte]] } val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator) if (partition < 0) partition * -1 + -2 else partition } }
Example 77
Source File: KeyFamilyQualifier.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import java.io.Serializable import org.apache.hadoop.hbase.util.Bytes class KeyFamilyQualifier(val rowKey:Array[Byte], val family:Array[Byte], val qualifier:Array[Byte]) extends Comparable[KeyFamilyQualifier] with Serializable { override def compareTo(o: KeyFamilyQualifier): Int = { var result = Bytes.compareTo(rowKey, o.rowKey) if (result == 0) { result = Bytes.compareTo(family, o.family) if (result == 0) result = Bytes.compareTo(qualifier, o.qualifier) } result } override def toString: String = { Bytes.toString(rowKey) + ":" + Bytes.toString(family) + ":" + Bytes.toString(qualifier) } }
Example 78
Source File: HBaseMapPartitionExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd import org.apache.hadoop.hbase.client.Get import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.{SparkContext, SparkConf} object HBaseMapPartitionExample { def main(args: Array[String]) { if (args.length < 1) { println("HBaseBulkGetExample {tableName}") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName) val sc = new SparkContext(sparkConf) try { //[(Array[Byte])] val rdd = sc.parallelize(Array( Bytes.toBytes("1"), Bytes.toBytes("2"), Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5"), Bytes.toBytes("6"), Bytes.toBytes("7"))) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) val getRdd = rdd.hbaseMapPartitions[String](hbaseContext, (it, connection) => { val table = connection.getTable(TableName.valueOf(tableName)) it.map{r => //batching would be faster. This is just an example val result = table.get(new Get(r)) val it = result.listCells().iterator() val b = new StringBuilder b.append(Bytes.toString(result.getRow) + ":") while (it.hasNext) { val cell = it.next() val q = Bytes.toString(cell.getQualifierArray) if (q.equals("counter")) { b.append("(" + q + "," + Bytes.toLong(cell.getValueArray) + ")") } else { b.append("(" + q + "," + Bytes.toString(cell.getValueArray) + ")") } } b.toString() } }) getRdd.collect().foreach(v => println(v)) } finally { sc.stop() } } }
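One caution about the per-cell decoding in this example: cell.getQualifierArray and cell.getValueArray return the cell's entire backing byte array and ignore the cell's offset and length, so the printed qualifier and value can come out garbled on real data. The bulk-get examples elsewhere on this page use CellUtil.cloneQualifier and CellUtil.cloneValue, which copy exactly the relevant slice; a safer version of the decode loop would look like this sketch.

import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.util.Bytes

object SafeCellDecodeSketch {
  // Renders one Result the way the example intends, but via CellUtil copies.
  def render(result: Result): String = {
    val b = new StringBuilder
    b.append(Bytes.toString(result.getRow) + ":")
    val it = result.listCells().iterator()
    while (it.hasNext) {
      val cell = it.next()
      val q = Bytes.toString(CellUtil.cloneQualifier(cell))
      if (q == "counter") b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
      else b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
    }
    b.toString()
  }
}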
Example 79
Source File: HBaseBulkGetExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd import org.apache.hadoop.hbase.client.{Result, Get} import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration} import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.spark.{SparkContext, SparkConf} object HBaseBulkGetExample { def main(args: Array[String]) { if (args.length < 1) { println("HBaseBulkGetExample {tableName}") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName) val sc = new SparkContext(sparkConf) try { //[(Array[Byte])] val rdd = sc.parallelize(Array( Bytes.toBytes("1"), Bytes.toBytes("2"), Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5"), Bytes.toBytes("6"), Bytes.toBytes("7"))) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) val getRdd = rdd.hbaseBulkGet[String](hbaseContext, TableName.valueOf(tableName), 2, record => { System.out.println("making Get") new Get(record) }, (result: Result) => { val it = result.listCells().iterator() val b = new StringBuilder b.append(Bytes.toString(result.getRow) + ":") while (it.hasNext) { val cell = it.next() val q = Bytes.toString(CellUtil.cloneQualifier(cell)) if (q.equals("counter")) { b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")") } else { b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")") } } b.toString() }) getRdd.collect().foreach(v => println(v)) } finally { sc.stop() } } }
Example 80
Source File: HBaseForeachPartitionExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.{SparkContext, SparkConf} object HBaseForeachPartitionExample { def main(args: Array[String]) { if (args.length < 2) { println("HBaseBulkPutExample {tableName} {columnFamily}") return } val tableName = args(0) val columnFamily = args(1) val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " + tableName + " " + columnFamily) val sc = new SparkContext(sparkConf) try { //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])] val rdd = sc.parallelize(Array( (Bytes.toBytes("1"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), (Bytes.toBytes("2"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), (Bytes.toBytes("3"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), (Bytes.toBytes("4"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), (Bytes.toBytes("5"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))) )) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) rdd.hbaseForeachPartition(hbaseContext, (it, connection) => { val m = connection.getBufferedMutator(TableName.valueOf(tableName)) it.foreach(r => { val put = new Put(r._1) r._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) m.mutate(put) }) m.flush() m.close() }) } finally { sc.stop() } } }
Example 81
Source File: HBaseBulkDeleteExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd import org.apache.hadoop.hbase.client.Delete import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.{SparkContext, SparkConf} object HBaseBulkDeleteExample { def main(args: Array[String]) { if (args.length < 1) { println("HBaseBulkDeletesExample {tableName} ") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName) val sc = new SparkContext(sparkConf) try { //[Array[Byte]] val rdd = sc.parallelize(Array( Bytes.toBytes("1"), Bytes.toBytes("2"), Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5") )) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) rdd.hbaseBulkDelete(hbaseContext, TableName.valueOf(tableName), putRecord => new Delete(putRecord), 4) } finally { sc.stop() } } }
Example 82
Source File: HBaseBulkPutExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{HBaseConfiguration, TableName} import org.apache.spark.{SparkConf, SparkContext} object HBaseBulkPutExample { def main(args: Array[String]) { if (args.length < 2) { println("HBaseBulkPutExample {tableName} {columnFamily}") return } val tableName = args(0) val columnFamily = args(1) val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " + tableName + " " + columnFamily) val sc = new SparkContext(sparkConf) try { //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])] val rdd = sc.parallelize(Array( (Bytes.toBytes("1"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), (Bytes.toBytes("2"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), (Bytes.toBytes("3"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), (Bytes.toBytes("4"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), (Bytes.toBytes("5"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))) )) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) rdd.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName), (putRecord) => { val put = new Put(putRecord._1) putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) put }) } finally { sc.stop() } } }
Example 83
Source File: HBaseBulkGetExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Get import org.apache.hadoop.hbase.client.Result import org.apache.spark.SparkConf object HBaseBulkGetExample { def main(args: Array[String]) { if (args.length < 1) { println("HBaseBulkGetExample {tableName}") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName) val sc = new SparkContext(sparkConf) try { //[(Array[Byte])] val rdd = sc.parallelize(Array( Bytes.toBytes("1"), Bytes.toBytes("2"), Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5"), Bytes.toBytes("6"), Bytes.toBytes("7"))) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) val getRdd = hbaseContext.bulkGet[Array[Byte], String]( TableName.valueOf(tableName), 2, rdd, record => { System.out.println("making Get") new Get(record) }, (result: Result) => { val it = result.listCells().iterator() val b = new StringBuilder b.append(Bytes.toString(result.getRow) + ":") while (it.hasNext) { val cell = it.next() val q = Bytes.toString(CellUtil.cloneQualifier(cell)) if (q.equals("counter")) { b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")") } else { b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")") } } b.toString() }) getRdd.collect().foreach(v => println(v)) } finally { sc.stop() } } }
Example 84
Source File: HBaseBulkPutExampleFromFile.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.mapred.TextInputFormat import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.spark.SparkConf object HBaseBulkPutExampleFromFile { def main(args: Array[String]) { if (args.length < 3) { println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile}") return } val tableName = args(0) val columnFamily = args(1) val inputFile = args(2) val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " + tableName + " " + columnFamily + " " + inputFile) val sc = new SparkContext(sparkConf) try { var rdd = sc.hadoopFile( inputFile, classOf[TextInputFormat], classOf[LongWritable], classOf[Text]).map(v => { System.out.println("reading-" + v._2.toString) v._2.toString }) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.bulkPut[String](rdd, TableName.valueOf(tableName), (putRecord) => { System.out.println("hbase-" + putRecord) val put = new Put(Bytes.toBytes("Value- " + putRecord)) put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"), Bytes.toBytes(putRecord.length())) put }); } finally { sc.stop() } } }
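Note that this example writes Bytes.toBytes(putRecord.length()), a 4-byte big-endian int, into column c:1, so reading it back needs Bytes.toInt rather than Bytes.toString. A small read-back sketch; the row-key and column names follow the example above, while the connection setup uses the standard HBase 1.x client API and is an assumption about the target cluster version.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes

object ReadBackLengthSketch {
  def main(args: Array[String]): Unit = {
    val tableName = args(0)
    val line = args(1)  // one of the lines previously bulk-loaded

    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    try {
      val table = connection.getTable(TableName.valueOf(tableName))
      val result = table.get(new Get(Bytes.toBytes("Value- " + line)))
      val raw = result.getValue(Bytes.toBytes("c"), Bytes.toBytes("1"))
      if (raw != null) println(s"stored length = ${Bytes.toInt(raw)}")
      else println("row not found")
    } finally {
      connection.close()
    }
  }
}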
Example 85
Source File: KeyFamilyQualifier.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.spark import java.io.Serializable import org.apache.hadoop.hbase.util.Bytes class KeyFamilyQualifier(val rowKey:Array[Byte], val family:Array[Byte], val qualifier:Array[Byte]) extends Comparable[KeyFamilyQualifier] with Serializable { override def compareTo(o: KeyFamilyQualifier): Int = { var result = Bytes.compareTo(rowKey, o.rowKey) if (result == 0) { result = Bytes.compareTo(family, o.family) if (result == 0) result = Bytes.compareTo(qualifier, o.qualifier) } result } override def toString: String = { Bytes.toString(rowKey) + ":" + Bytes.toString(family) + ":" + Bytes.toString(qualifier) } }
Example 86
Source File: HBaseBulkPutExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Put import org.apache.spark.SparkConf object HBaseBulkPutExample { def main(args: Array[String]) { if (args.length < 2) { println("HBaseBulkPutExample {tableName} {columnFamily}") return } val tableName = args(0) val columnFamily = args(1) val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " + tableName + " " + columnFamily) val sc = new SparkContext(sparkConf) try { //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])] val rdd = sc.parallelize(Array( (Bytes.toBytes("1"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), (Bytes.toBytes("2"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), (Bytes.toBytes("3"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), (Bytes.toBytes("4"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), (Bytes.toBytes("5"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))) )) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd, TableName.valueOf(tableName), (putRecord) => { val put = new Put(putRecord._1) putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) put }); } finally { sc.stop() } } }
Example 87
Source File: HBaseDistributedScanExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Scan import org.apache.spark.SparkConf object HBaseDistributedScanExample { def main(args: Array[String]) { if (args.length < 1) { println("GenerateGraphs {tableName}") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseDistributedScanExample " + tableName ) val sc = new SparkContext(sparkConf) try { val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) val scan = new Scan() scan.setCaching(100) val getRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan) getRdd.foreach(v => println(Bytes.toString(v._1.get()))) println("Length: " + getRdd.map(r => r._1.copyBytes()).collect().length); //.collect().foreach(v => println(Bytes.toString(v._1.get()))) } finally { sc.stop() } } }
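hbaseContext.hbaseRDD(tableName, scan) yields an RDD of (ImmutableBytesWritable, Result) pairs, which the example only uses to print row keys. A short sketch of a more typical follow-up transformation that copies each row out into serializable values; further column decoding is omitted because it depends on the table's schema.

import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD

object ScanDecodeSketch {
  // Turns the raw scan output into plain (rowKey, cellCount) pairs.
  def toRowSummaries(scanned: RDD[(ImmutableBytesWritable, Result)]): RDD[(String, Int)] =
    scanned.map { case (key, result) =>
      (Bytes.toString(key.copyBytes()), result.rawCells().length)
    }
}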
Example 88
Source File: HBaseBulkPutTimestampExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Put import org.apache.spark.SparkConf object HBaseBulkPutTimestampExample { def main(args: Array[String]) { if (args.length < 2) { System.out.println("HBaseBulkPutTimestampExample {tableName} {columnFamily}") return } val tableName = args(0) val columnFamily = args(1) val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " + tableName + " " + columnFamily) val sc = new SparkContext(sparkConf) try { val rdd = sc.parallelize(Array( (Bytes.toBytes("6"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), (Bytes.toBytes("7"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), (Bytes.toBytes("8"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), (Bytes.toBytes("9"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), (Bytes.toBytes("10"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))))) val conf = HBaseConfiguration.create() val timeStamp = System.currentTimeMillis() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd, TableName.valueOf(tableName), (putRecord) => { val put = new Put(putRecord._1) putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, timeStamp, putValue._3)) put }) } finally { sc.stop() } } }
Example 89
Source File: HBaseSimpleRDD.scala From spark-hbase-connector with Apache License 2.0 | 5 votes |
package it.nerdammer.spark.hbase import it.nerdammer.spark.hbase.conversion.FieldReader import org.apache.hadoop.hbase.CellUtil import org.apache.hadoop.hbase.client.Result import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.rdd.{NewHadoopRDD, RDD} import org.apache.spark.{Partition, TaskContext} import scala.reflect.ClassTag class HBaseSimpleRDD[R: ClassTag](hadoopHBase: NewHadoopRDD[ImmutableBytesWritable, Result], builder: HBaseReaderBuilder[R], saltingLength: Int = 0) (implicit mapper: FieldReader[R], saltingProvider: SaltingProviderFactory[String]) extends RDD[R](hadoopHBase) { override def getPartitions: Array[Partition] = firstParent[(ImmutableBytesWritable, Result)].partitions override def compute(split: Partition, context: TaskContext) = { // val cleanConversion = sc.clean ---> next version firstParent[(ImmutableBytesWritable, Result)].iterator(split, context) .map(e => conversion(e._1, e._2)) } def conversion(key: ImmutableBytesWritable, row: Result) = { val columnNames = HBaseUtils.chosenColumns(builder.columns, mapper.columns) val columnNamesFC = HBaseUtils.columnsWithFamily(builder.columnFamily, columnNames) val columns = columnNamesFC .map(t => (Bytes.toBytes(t._1), Bytes.toBytes(t._2))) .map(t => if(row.containsColumn(t._1, t._2)) Some(CellUtil.cloneValue(row.getColumnLatestCell(t._1, t._2)).array) else None) .toList mapper.map(Some(key.get.drop(saltingLength)) :: columns) } }
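The key.get.drop(saltingLength) call in conversion strips a salt prefix that the connector prepends to row keys so sequential writes spread across regions. A generic sketch of the salt/unsalt round trip; the modulo-bucket salt below is purely illustrative, and the connector's own SaltingProvider may compute its prefix differently.

import org.apache.hadoop.hbase.util.Bytes

object SaltingSketch {
  val buckets = 16

  // Prefix each logical key with a one-character bucket id derived from its hash.
  def salt(rowKey: Array[Byte]): Array[Byte] = {
    val bucket = (Bytes.hashCode(rowKey) & Int.MaxValue) % buckets
    Bytes.add(Bytes.toBytes(f"$bucket%x"), rowKey)  // 1-char hex salt, so saltingLength = 1
  }

  def unsalt(salted: Array[Byte], saltingLength: Int = 1): Array[Byte] =
    salted.drop(saltingLength)

  def main(args: Array[String]): Unit = {
    val key = Bytes.toBytes("user-42")
    val salted = salt(key)
    println(Bytes.toString(salted))             // e.g. "3user-42"
    println(Bytes.equals(unsalt(salted), key))  // true
  }
}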
Example 90
Source File: HBaseSinkSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.external.hbase import akka.actor.ActorSystem import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.external.hbase.HBaseSink.{HBaseWriter, HBaseWriterFactory} import org.apache.gearpump.streaming.MockUtil import org.apache.gearpump.streaming.task.TaskContext import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase.TableName import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.util.Bytes import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.mock.MockitoSugar import org.scalatest.prop.PropertyChecks import org.scalatest.{Matchers, PropSpec} class HBaseSinkSpec extends PropSpec with PropertyChecks with Matchers with MockitoSugar { property("HBaseSink should invoke HBaseWriter for writing message to HBase") { val hbaseWriter = mock[HBaseWriter] val hbaseWriterFactory = mock[HBaseWriterFactory] implicit val system: ActorSystem = MockUtil.system val userConfig = UserConfig.empty val tableName = "hbase" when(hbaseWriterFactory.getHBaseWriter(userConfig, tableName)) .thenReturn(hbaseWriter) val hbaseSink = new HBaseSink(userConfig, tableName, hbaseWriterFactory) hbaseSink.open(MockUtil.mockTaskContext) forAll(Gen.alphaStr) { (value: String) => val message = Message(value) hbaseSink.write(message) verify(hbaseWriter, atLeastOnce()).put(value) } hbaseSink.close() verify(hbaseWriter).close() } property("HBaseWriter should insert a row successfully") { val table = mock[Table] val config = mock[Configuration] val connection = mock[Connection] val taskContext = mock[TaskContext] val map = Map[String, String]("HBASESINK" -> "hbasesink", "TABLE_NAME" -> "hbase.table.name", "COLUMN_FAMILY" -> "hbase.table.column.family", "COLUMN_NAME" -> "hbase.table.column.name", "HBASE_USER" -> "hbase.user", "GEARPUMP_KERBEROS_PRINCIPAL" -> "gearpump.kerberos.principal", "GEARPUMP_KEYTAB_FILE" -> "gearpump.keytab.file" ) val userConfig = new UserConfig(map) val tableName = "hbase" val row = "row" val group = "group" val name = "name" val value = "3.0" when(connection.getTable(TableName.valueOf(tableName))).thenReturn(table) val put = new Put(Bytes.toBytes(row)) put.addColumn(Bytes.toBytes(group), Bytes.toBytes(name), Bytes.toBytes(value)) val hbaseWriter = new HBaseWriter(connection, tableName) hbaseWriter.insert(Bytes.toBytes(row), Bytes.toBytes(group), Bytes.toBytes(name), Bytes.toBytes(value)) verify(table).put(MockUtil.argMatch[Put](_.getRow sameElements Bytes.toBytes(row))) } }
Example 91
Source File: HBaseUtil.scala From sprue with Apache License 2.0 | 5 votes |
package com.cloudera.sprue import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.util.Bytes object HBaseUtil { val columnFamily: String = "cf1" def insertIncomingDataIntoHBase(patient: Patient): Put = { if (patient.getPatientId == null) { return null } else { val put = new Put(Bytes.toBytes(patient.getPatientId)) if (patient.getPatientId != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("patientId"), Bytes.toBytes(patient.getPatientId)) if (patient.getLocation != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("location"), Bytes.toBytes(patient.getLocation)) if ((patient.getEvaluationDate : java.lang.Long) != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("evaluationDate"), Bytes.toBytes(patient.getEvaluationDate)) if ((patient.getTemperature : java.lang.Float) != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("temperature"), Bytes.toBytes(patient.getTemperature)) if ((patient.getWbc : java.lang.Integer) != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("wbc"), Bytes.toBytes(patient.getWbc)) if ((patient.getHeartRate : java.lang.Integer) != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("heartRate"), Bytes.toBytes(patient.getHeartRate)) if ((patient.getRespiratoryRate : java.lang.Integer) != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("respiratoryRate"), Bytes.toBytes(patient.getRespiratoryRate)) if ((patient.getSbp : java.lang.Integer) != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("sbp"), Bytes.toBytes(patient.getSbp)) if ((patient.getHypotension : java.lang.Integer) != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("hypotension"), Bytes.toBytes(patient.getHypotension)) if (patient.getInfectionFlag != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("infectionFlag"), Bytes.toBytes(patient.getInfectionFlag)) if ((patient.getOrganFailCount : java.lang.Integer) != null) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("organFailCount"), Bytes.toBytes(patient.getOrganFailCount)) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("evalFinished"), Bytes.toBytes("N")) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("recordUpdatedTime"), Bytes.toBytes(System.currentTimeMillis)) } } def insertEvaluatedDataIntoHBase(patient: Patient): Put = { if (patient.getPatientId == null) { return null } else { val put = new Put(Bytes.toBytes(patient.getPatientId)) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("evalFinished"), Bytes.toBytes("Y")) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("sirsCounter"), Bytes.toBytes(patient.getSirsCounter)) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("sirsFlag"), Bytes.toBytes(patient.getSirsFlag)) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("sepsisFlag"), Bytes.toBytes(patient.getSepsisFlag)) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("severeSepsisFlag"), Bytes.toBytes(patient.getSevereSepsisFlag)) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("septicShockFlag"), Bytes.toBytes(patient.getSepticShockFlag)) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("organDysfunctionFlag"), Bytes.toBytes(patient.getOrganDysfunctionSyndrome)) put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("systemEvalTime"), Bytes.toBytes(System.currentTimeMillis)) put } } }
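The long chains of if (... != null) put.add(...) above can be collapsed with a small helper that only adds a column when an encoded value is present. This is a refactoring sketch against the same pre-1.0 Put.add(family, qualifier, value) API the listing uses, not a drop-in replacement for the class; the column names are examples.

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes

object PutHelperSketch {
  val columnFamily: Array[Byte] = Bytes.toBytes("cf1")

  // Adds the column only when the caller could produce a value.
  def addIfPresent(put: Put, qualifier: String, value: Option[Array[Byte]]): Put = {
    value.foreach(v => put.add(columnFamily, Bytes.toBytes(qualifier), v))
    put
  }

  def main(args: Array[String]): Unit = {
    val put = new Put(Bytes.toBytes("patient-1"))
    addIfPresent(put, "location", Some(Bytes.toBytes("ICU-3")))
    addIfPresent(put, "temperature", Option(38.2f).map(v => Bytes.toBytes(v)))
    addIfPresent(put, "wbc", None)  // simply skipped
    println(put.size())             // 2 columns queued
  }
}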
Example 92
Source File: L6-14HBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.mapreduce.TableOutputFormat import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.io.Text import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object HBaseSinkApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: HBaseSinkApp <appname> <hbaseMaster> <tableName> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .foreachRDD(rdd => { val hbaseConf = HBaseConfiguration.create() hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName) hbaseConf.set("hbase.master", hbaseMaster) val jobConf = new Configuration(hbaseConf) jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName) rdd.map(rec => { val put = new Put(rec._1.getBytes) put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) (rec._1, put) }).saveAsNewAPIHadoopDataset(jobConf) }) ssc.start() ssc.awaitTermination() } }
Example 93
Source File: L6-16SparkHBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.TableName import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object SparkHBaseBulkPutApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: SparkHBaseBulkPutApp <appname> <tableName> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, tableName, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) val hbaseConf = HBaseConfiguration.create() val hContext = new HBaseContext(ssc.sparkContext, hbaseConf) val windowed = HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) hContext.streamBulkPut[(String, Float)](windowed, TableName.valueOf(tableName), rec => { val put = new Put(rec._1.getBytes) put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) put }) ssc.start() ssc.awaitTermination() } }
Example 94
Source File: HbRddReaders.scala From hbrdd with Apache License 2.0 | 5 votes |
package top.spoofer.hbrdd.unit import org.apache.hadoop.hbase.util.Bytes import org.json4s._ trait HbRddReaders { implicit val hbBooleanReader = new HbRddFormatsReader[Boolean] { def formatsRead(readData: Array[Byte]): Boolean = Bytes.toBoolean(readData) } implicit val hbByteArrayReader = new HbRddFormatsReader[Array[Byte]] { def formatsRead(readData: Array[Byte]): Array[Byte] = readData } implicit val hbShortReader = new HbRddFormatsReader[Short] { def formatsRead(readData: Array[Byte]): Short = Bytes.toShort(readData) } implicit val hbIntReader = new HbRddFormatsReader[Int] { def formatsRead(readData: Array[Byte]): Int = Bytes.toInt(readData) } implicit val hbFloatReader = new HbRddFormatsReader[Float] { def formatsRead(readData: Array[Byte]): Float = Bytes.toFloat(readData) } implicit val hbDoubleReader = new HbRddFormatsReader[Double] { def formatsRead(readData: Array[Byte]): Double = Bytes.toDouble(readData) } implicit val hbLongReader = new HbRddFormatsReader[Long] { def formatsRead(readData: Array[Byte]): Long = Bytes.toLong(readData) } implicit val hbStringReader = new HbRddFormatsReader[String] { def formatsRead(readData: Array[Byte]): String = Bytes.toString(readData) } implicit val hbJsonReader = new HbRddFormatsReader[JValue] { import org.json4s.jackson.JsonMethods._ def formatsRead(readData: Array[Byte]): JValue = parse(Bytes.toString(readData)) } }
Example 95
Source File: HbRddWriters.scala From hbrdd with Apache License 2.0 | 5 votes |
package top.spoofer.hbrdd.unit import org.apache.hadoop.hbase.util.Bytes import org.json4s._ trait HbRddWriters { implicit val hbBooleanWriter = new HbRddFormatsWriter[Boolean] { def formatsWrite(writeData: Boolean): Array[Byte] = Bytes.toBytes(writeData) } implicit val hbArrayWriter = new HbRddFormatsWriter[Array[Byte]] { def formatsWrite(writeData: Array[Byte]): Array[Byte] = writeData } implicit val hbShortWriter = new HbRddFormatsWriter[Short] { def formatsWrite(writeData: Short): Array[Byte] = Bytes.toBytes(writeData) } implicit val hbIntWriter = new HbRddFormatsWriter[Int] { def formatsWrite(writeData: Int): Array[Byte] = Bytes.toBytes(writeData) } implicit val hbFloatWriter = new HbRddFormatsWriter[Float] { def formatsWrite(writeData: Float): Array[Byte] = Bytes.toBytes(writeData) } implicit val hbDoubleWrite = new HbRddFormatsWriter[Double] { def formatsWrite(writeData: Double): Array[Byte] = Bytes.toBytes(writeData) } implicit val hbLongWrite = new HbRddFormatsWriter[Long] { def formatsWrite(writeData: Long): Array[Byte] = Bytes.toBytes(writeData) } implicit val hbStringWrite = new HbRddFormatsWriter[String] { def formatsWrite(writeData: String): Array[Byte] = Bytes.toBytes(writeData) } implicit val hbJsonWrite = new HbRddFormatsWriter[JValue] { import org.json4s.jackson.JsonMethods._ def formatsWrite(writeData: JValue): Array[Byte] = Bytes.toBytes(compact(writeData)) } }
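The reader and writer typeclasses in the last two examples are thin, symmetric wrappers over Bytes: whatever HbRddFormatsWriter encodes, the matching HbRddFormatsReader must decode with the corresponding Bytes.toX call. A plain round-trip sketch of that symmetry using Bytes directly, with no hbrdd classes required.

import org.apache.hadoop.hbase.util.Bytes

object BytesRoundTripSketch {
  def main(args: Array[String]): Unit = {
    // Each pair mirrors one writer/reader instance from the examples above.
    assert(Bytes.toInt(Bytes.toBytes(42)) == 42)
    assert(Bytes.toLong(Bytes.toBytes(42L)) == 42L)
    assert(Bytes.toDouble(Bytes.toBytes(3.14d)) == 3.14d)
    assert(Bytes.toBoolean(Bytes.toBytes(true)))
    assert(Bytes.toString(Bytes.toBytes("hello")) == "hello")

    // Mixing encodings is the classic failure mode: an int written as a string
    // cannot be read back with Bytes.toInt, and vice versa.
    println(Bytes.toStringBinary(Bytes.toBytes(42)))    // \x00\x00\x00*
    println(Bytes.toStringBinary(Bytes.toBytes("42")))  // 42
  }
}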
Example 96
Source File: HbasePredicate.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hbase

import io.eels._
import io.eels.schema.{DataType, StructType}
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
import org.apache.hadoop.hbase.filter._
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.JavaConversions._

// These are simply marker predicates used for pattern matching
case class ContainsPredicate(name: String, value: Any) extends NamedPredicate(name) {
  override def eval(row: Row): Boolean = true
}

case class RegexPredicate(name: String, value: Any) extends NamedPredicate(name) {
  override def eval(row: Row): Boolean = true
}

case class StartsWithPredicate(name: String, value: Any) extends NamedPredicate(name) {
  override def eval(row: Row): Boolean = true
}

case class NotEqualsPredicate(name: String, value: Any) extends NamedPredicate(name) {
  override def eval(row: Row): Boolean = row.get(name) != value
}

object HbasePredicate {
  private val ByteComparableClazz = classOf[BinaryComparator]
  private val StringComparableClazz = classOf[SubstringComparator]
  private val RegexStringComparableClazz = classOf[RegexStringComparator]
  private val BinaryPrefixComparableClazz = classOf[BinaryPrefixComparator]

  def apply(pred: Predicate)(implicit schema: StructType, serializer: HbaseSerializer): FilterList = pred match {
    case EqualsPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.EQUAL, value, ByteComparableClazz))
    case NotEqualsPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.NOT_EQUAL, value, ByteComparableClazz))
    case ContainsPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.EQUAL, value, StringComparableClazz))
    case StartsWithPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.EQUAL, value, BinaryPrefixComparableClazz))
    case RegexPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.EQUAL, value, RegexStringComparableClazz))
    case GtPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.GREATER, value, ByteComparableClazz))
    case GtePredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.GREATER_OR_EQUAL, value, ByteComparableClazz))
    case LtPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.LESS, value, ByteComparableClazz))
    case LtePredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.LESS_OR_EQUAL, value, ByteComparableClazz))
    case AndPredicate(predicates: Seq[Predicate]) => new FilterList(FilterList.Operator.MUST_PASS_ALL, predicates.map(apply).flatMap(_.getFilters))
    case OrPredicate(predicates: Seq[Predicate]) => new FilterList(FilterList.Operator.MUST_PASS_ONE, predicates.map(apply).flatMap(_.getFilters))
    case _@predicateType => sys.error(s"Predicate type '${predicateType.getClass.getSimpleName}' is not supported!")
  }

  def hbaseFiler[T](name: String, compareOp: CompareOp, value: Any, comparableClass: Class[T])
                   (implicit schema: StructType, serializer: HbaseSerializer): Filter = {
    val field = schema.fields.find(_.name == name)
      .getOrElse(sys.error(s"Field '$name' in the predicate is not defined in the EEL schema"))
    if (field.key) {
      new RowFilter(compareOp, hbaseComparator(comparableClass, name, field.dataType, value))
    } else {
      new SingleColumnValueFilter(
        Bytes.toBytes(field.columnFamily.getOrElse(sys.error(s"No Column Family defined for field '${field.name}'"))),
        Bytes.toBytes(name),
        compareOp,
        hbaseComparator(comparableClass, name, field.dataType, value))
    }
  }

  def hbaseComparator[T](comparableClass: Class[T], name: String, dataType: DataType, value: Any)
                        (implicit schema: StructType, serializer: HbaseSerializer): ByteArrayComparable = (comparableClass, value) match {
    case (ByteComparableClazz, _) => new BinaryComparator(serializer.toBytes(value, name, dataType))
    case (RegexStringComparableClazz, stringValue: String) => new RegexStringComparator(stringValue)
    case (StringComparableClazz, stringValue: String) => new SubstringComparator(stringValue)
    case (BinaryPrefixComparableClazz, _) => new BinaryPrefixComparator(serializer.toBytes(value, name, dataType))
  }

  // Shorthand predicate names
  def or(left: Predicate, right: Predicate) = OrPredicate(Seq(left, right))
  def and(left: Predicate, right: Predicate) = AndPredicate(Seq(left, right))
  def equals(name: String, value: Any) = EqualsPredicate(name, value)
  def notEquals(name: String, value: Any) = NotEqualsPredicate(name, value)
  def gt(name: String, value: Any) = GtPredicate(name, value)
  def gte(name: String, value: Any) = GtePredicate(name, value)
  def lt(name: String, value: Any) = LtPredicate(name, value)
  def lte(name: String, value: Any) = LtePredicate(name, value)
  def regex(name: String, value: Any) = RegexPredicate(name, value)
  def contains(name: String, value: Any) = ContainsPredicate(name, value)
  def startsWith(name: String, value: Any) = StartsWithPredicate(name, value)
}
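A small composition sketch using the shorthand constructors at the bottom of the object; the field names are placeholders, and converting the resulting predicate into an HBase FilterList via HbasePredicate(pred) additionally requires an implicit eel StructType and HbaseSerializer in scope, which are project-specific and omitted here:

import io.eels.component.hbase.HbasePredicate._

object PredicateSketch {
  // Rows where salary >= 50000 and the name column contains "MAC", or where age < 21
  val pred = or(
    and(gte("salary", 50000), contains("name", "MAC")),
    lt("age", 21)
  )

  // With an implicit StructType and HbaseSerializer in scope this would become a FilterList:
  //   val filters = HbasePredicate(pred)
}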
Example 97
Source File: HBaseStreamingBulkPutExample.scala From SparkOnHBase with Apache License 2.0 | 4 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.SparkConf

object HBaseStreamingBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 4) {
      println("HBaseStreamingBulkPutExample " +
        "{host} {port} {tableName} {columnFamily}")
      return
    }

    val host = args(0)
    val port = args(1)
    val tableName = args(2)
    val columnFamily = args(3)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val ssc = new StreamingContext(sc, Seconds(1))

      val lines = ssc.socketTextStream(host, port.toInt)

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      hbaseContext.streamBulkPut[String](lines,
        TableName.valueOf(tableName),
        (putRecord) => {
          if (putRecord.length() > 0) {
            val put = new Put(Bytes.toBytes(putRecord))
            put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar"))
            put
          } else {
            null
          }
        })
      ssc.start()
      ssc.awaitTerminationOrTimeout(60000)
    } finally {
      sc.stop()
    }
  }
}
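Each non-empty socket line becomes a Put whose row key is the line itself, carrying a single cell c:foo = "bar". A minimal verification sketch for those rows, assuming an HBase 1.x+ client on the classpath; the object name is a placeholder and the table name is passed as an argument, matching the streaming example:

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

object ScanStreamedRows {
  def main(args: Array[String]): Unit = {
    val tableName = args(0) // same table the streaming example writes to
    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    try {
      val table = connection.getTable(TableName.valueOf(tableName))
      // Restrict the scan to the single column written by the example: family "c", qualifier "foo"
      val scan = new Scan().addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"))
      val scanner = table.getScanner(scan)
      try {
        scanner.iterator().asScala.foreach { result =>
          val value = result.getValue(Bytes.toBytes("c"), Bytes.toBytes("foo"))
          println(Bytes.toString(result.getRow) + " -> " + Bytes.toString(value))
        }
      } finally {
        scanner.close()
        table.close()
      }
    } finally {
      connection.close()
    }
  }
}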