org.apache.hadoop.hbase.HBaseConfiguration Scala Examples
The following examples show how to use org.apache.hadoop.hbase.HBaseConfiguration.
Each snippet comes from an open-source project; the project name and license are noted above it.
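Before the per-project examples, here is a minimal, self-contained sketch of the pattern most of them share: HBaseConfiguration.create() loads hbase-default.xml and hbase-site.xml from the classpath, individual properties can then be overridden, and the resulting Configuration is handed to ConnectionFactory. The quorum host, table name, and row key below are placeholders, not values taken from any of the listed projects.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes

object HBaseConfigurationQuickStart {
  def main(args: Array[String]): Unit = {
    // create() merges hbase-default.xml and hbase-site.xml found on the classpath
    val conf = HBaseConfiguration.create()
    // explicit overrides, useful when no hbase-site.xml is on the classpath (placeholder values)
    conf.set("hbase.zookeeper.quorum", "localhost")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    val connection = ConnectionFactory.createConnection(conf)
    try {
      val table = connection.getTable(TableName.valueOf("example_table")) // hypothetical table
      val result = table.get(new Get(Bytes.toBytes("row1")))              // hypothetical row key
      println("row exists: " + !result.isEmpty)
      table.close()
    } finally {
      connection.close()
    }
  }
}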
Example 1
Source File: HBase.scala From AI with Apache License 2.0 | 6 votes |
package com.bigchange.hbase

import com.bigchange.util.HBaseUtil._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Result, _}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.spark.SparkContext

// Note: the enclosing class declaration (which holds the hBaseConfiguration field) is elided in this excerpt.

  def existRowKey(row: String, table: Table): Boolean = {
    val get = new Get(row.getBytes())
    val result = table.get(get)
    if (result.isEmpty) {
      warn("hbase table don't have this data,execute insert")
      return false
    }
    true
  }

  def getConfiguration = if (hBaseConfiguration == null) {
    warn("hbase setDefaultConfiguration....")
    setDefaultConfiguration
  } else hBaseConfiguration

  def setDefaultConfiguration = {
    hBaseConfiguration = HBaseConfiguration.create
    // Options needed for local testing; on a cluster they are picked up automatically from the config files on the classpath
    hBaseConfiguration.set("fs.defaultFS", "hdfs://ns1")       // nameservice path
    hBaseConfiguration.set("dfs.nameservices", "ns1")
    hBaseConfiguration.set("dfs.ha.namenodes.ns1", "nn1,nn2")  // namenode list
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn1", "server3:9000") // namenode RPC address
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn2", "server4:9000") // namenode RPC address
    // implementation class for automatic namenode failover
    hBaseConfiguration.set("dfs.client.failover.proxy.provider.ns1",
      "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider")
    hBaseConfiguration.set("hbase.rootdir", "hdfs://ns1/hbase")
    hBaseConfiguration.set("hbase.zookeeper.quorum", "server0,server1,server2")
    hBaseConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
    hBaseConfiguration
  }
}
Example 2
Source File: HBaseStreamingBulkPutExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseStreamingBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 4) {
      println("HBaseStreamingBulkPutExample " +
        "{host} {port} {tableName} {columnFamily} are missing an argument")
      return
    }
    val host = args(0)
    val port = args(1)
    val tableName = args(2)
    val columnFamily = args(3)

    val sparkConf = new SparkConf().setAppName("HBaseStreamingBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val ssc = new StreamingContext(sc, Seconds(1))

      val lines = ssc.socketTextStream(host, port.toInt)

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      hbaseContext.streamBulkPut[String](lines,
        TableName.valueOf(tableName),
        (putRecord) => {
          if (putRecord.length() > 0) {
            val put = new Put(Bytes.toBytes(putRecord))
            put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar"))
            put
          } else {
            null
          }
        })
      ssc.start()
      ssc.awaitTerminationOrTimeout(60000)
    } finally {
      sc.stop()
    }
  }
}
Example 3
Source File: HBaseSQLContext.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.OverrideCatalog
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan}
import org.apache.spark.sql.hbase.execution.{AddCoprocessor, HBaseStrategies}

class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) {
  self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override protected[sql] lazy val catalog: HBaseCatalog =
    new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog

  experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource)

  @transient
  override protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
    val batches =
      Batch("Add exchange", Once, EnsureRequirements(self)) ::
        Batch("Add coprocessor", Once, AddCoprocessor(self)) :: Nil
  }
}
Example 4
Source File: Util.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util

import java.io._
import java.util.concurrent.atomic.AtomicInteger
import java.util.zip.{DeflaterOutputStream, InflaterInputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.HBaseConfiguration

object Util {
  val iteration = new AtomicInteger(0)

  def getTempFilePath(conf: Configuration, prefix: String): String = {
    val fileSystem = FileSystem.get(conf)
    val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}")
    if (fileSystem.exists(path)) {
      fileSystem.delete(path, true)
    }
    path.getName
  }

  def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = {
    val bos = new ByteArrayOutputStream
    val deflaterOutputStream = new DeflaterOutputStream(bos)
    val dos = new DataOutputStream(deflaterOutputStream)
    configuration.write(dos)
    dos.close()
    bos.toByteArray
  }

  def deserializeHBaseConfiguration(arr: Array[Byte]) = {
    val conf = HBaseConfiguration.create
    conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr))))
    conf
  }
}
Example 5
Source File: HBaseTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark._

object HBaseTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HBaseTest")
    val sc = new SparkContext(sparkConf)

    // please ensure HBASE_CONF_DIR is on classpath of spark driver
    // e.g: set it through spark.driver.extraClassPath property
    // in spark-defaults.conf or through --driver-class-path
    // command line option of spark-submit
    val conf = HBaseConfiguration.create()

    if (args.length < 1) {
      System.err.println("Usage: HBaseTest <table_name>")
      System.exit(1)
    }

    // Other options for configuring scan behavior are available. More information available at
    // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html
    conf.set(TableInputFormat.INPUT_TABLE, args(0))

    // Initialize hBase table if necessary
    val admin = new HBaseAdmin(conf)
    if (!admin.isTableAvailable(args(0))) {
      val tableDesc = new HTableDescriptor(TableName.valueOf(args(0)))
      admin.createTable(tableDesc)
    }

    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    hBaseRDD.count()

    sc.stop()
    admin.close()
  }
}
// scalastyle:on println
Example 6
Source File: HogHBaseReputation.scala From hogzilla with GNU General Public License v2.0 | 5 votes |
package org.hogzilla.hbase

import scala.math.random
import java.lang.Math

import org.apache.spark._
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionModel, LinearRegressionWithSGD}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
import org.apache.hadoop.hbase.filter.BinaryComparator
import org.apache.hadoop.hbase.filter.FilterList
import org.apache.hadoop.hbase.filter.CompareFilter
import java.util.ArrayList
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.filter.Filter
import scala.collection.mutable.HashSet
import org.apache.hadoop.hbase.client.Put

object HogHBaseReputation {

  // Ex: MX, whitelist
  def getReputationList(listName: String, listType: String): Set[String] = {
    val list = new HashSet[String]

    val filters: ArrayList[Filter] = new ArrayList()

    val colValFilter1 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list_type"),
      CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listType)))
    colValFilter1.setFilterIfMissing(false)

    val colValFilter2 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list"),
      CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listName)))
    colValFilter2.setFilterIfMissing(false)

    filters.add(colValFilter1)
    filters.add(colValFilter2)

    val filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL, filters)

    val scan = new Scan()
    scan.setFilter(filterList)

    val it = HogHBaseRDD.hogzilla_reputation.getScanner(scan).iterator()
    while (it.hasNext()) {
      list.add(Bytes.toString(it.next().getValue(Bytes.toBytes("rep"), Bytes.toBytes("ip"))))
    }

    list.toSet
  }

  def saveReputationList(listName: String, listType: String, ip: String) = {
    val put = new Put(Bytes.toBytes(ip + "-" + listName + "-" + listType))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), Bytes.toBytes(listType))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("list"), Bytes.toBytes(listName))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("ip"), Bytes.toBytes(ip))

    HogHBaseRDD.hogzilla_reputation.put(put)
  }
}
Example 7
Source File: Hdfs2HBase.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.hbase

import java.util

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.spark.{SparkConf, SparkContext}

object Hdfs2HBase {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("spark://spark01:7077")
      .setAppName(Hdfs2HBase.getClass.getName)
      .set("spark.jars", "target/wow-spark-1.0-SNAPSHOT.jar")
    val sparkContext = new SparkContext(conf)

    val userRDD = sparkContext.textFile("hdfs://spark01:9000/spark/users.dat", 2).map(_.split("::"))

    userRDD.foreachPartition(iter => {
      val configuration = HBaseConfiguration.create()
      // configuration.set("hbase.zookeeper.quorum","spark01:2181,spark02:2181,spark03:2181")
      configuration.set("hbase.zookeeper.quorum", "spark01")
      configuration.set("hbase.zookeeper.property.clientPort", "2181")
      // create the connection
      val connection = ConnectionFactory.createConnection(configuration)
      // get the table object
      val person = connection.getTable(TableName.valueOf("users"))
      iter.foreach(p => {
        val arrayList = new util.ArrayList[Put]()
        val put = new Put(p(0).getBytes)
        arrayList.add(put.addColumn("f1".getBytes, "gender".getBytes, p(1).getBytes))
        arrayList.add(put.addColumn("f1".getBytes, "age".getBytes, p(2).getBytes))
        arrayList.add(put.addColumn("f2".getBytes, "position".getBytes, p(3).getBytes))
        arrayList.add(put.addColumn("f2".getBytes, "code".getBytes, p(4).getBytes))
        person.put(arrayList)
      })
    })
    sparkContext.stop()
  }
}
Example 8
Source File: HBasePut.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.hbase.utilities

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.{DataFrame, SparkSession}

import com.paypal.gimel.hbase.conf.{HbaseClientConfiguration, HbaseConfigs}
import com.paypal.gimel.logger.Logger

object HBasePut {
  def apply(sparkSession: SparkSession): HBasePut = new HBasePut(sparkSession)
}

class HBasePut(sparkSession: SparkSession) {
  val logger = Logger()
  lazy val hbaseUtilities = HBaseUtilities(sparkSession)

  def putRows(hbaseTable: String, dataFrame: DataFrame, rowKeyColumn: String,
              columns: Array[String], cfColsMap: Map[String, String]) {
    try {
      // Configure And Connect
      val conf = HBaseConfiguration.create()
      val cnxn = ConnectionFactory.createConnection(conf)
      // Create Connection to HBase table
      val tbl = cnxn.getTable(TableName.valueOf(hbaseTable))
      val rows = dataFrame.rdd.map { row =>
        (row.getAs(rowKeyColumn).toString,
          columns.map(eachCol => (cfColsMap.getOrElse(eachCol, ""), eachCol, row.getAs(eachCol).asInstanceOf[String]))
        )
      }.collect()
      // Performing put operation on each row of dataframe
      rows.foreach { row =>
        val putRow: Put = new Put(Bytes.toBytes(row._1.asInstanceOf[String]))
        row._2.foreach(x =>
          if (x._2 != rowKeyColumn) putRow.addColumn(Bytes.toBytes(x._1), Bytes.toBytes(x._2), Bytes.toBytes(x._3)))
        tbl.put(putRow)
      }
      tbl.close()
    } catch {
      case ex: Throwable =>
        ex.printStackTrace()
        throw ex
    }
  }
}
Example 9
Source File: GraphX.scala From unicorn with Apache License 2.0 | 5 votes |
package unicorn.narwhal.graph

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext

import unicorn.bigtable.hbase.HBaseTable
import unicorn.json._
import unicorn.unibase.graph.{ReadOnlyGraph, GraphSerializer, GraphVertexColumnFamily, GraphOutEdgeColumnFamily}

// Note: the enclosing class declaration (which defines the `name` field used below) is elided in this excerpt.

  def graphx(sc: SparkContext): org.apache.spark.graphx.Graph[JsObject, (String, JsValue)] = {
    val conf = HBaseConfiguration.create()
    conf.set(TableInputFormat.INPUT_TABLE, name)
    conf.setInt(TableInputFormat.SCAN_CACHEDROWS, 500)
    conf.setBoolean(TableInputFormat.SCAN_CACHEBLOCKS, false)
    conf.set(TableInputFormat.SCAN_COLUMNS, s"$GraphVertexColumnFamily $GraphOutEdgeColumnFamily")

    val rdd = sc.newAPIHadoopRDD(
      conf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result]
    )

    val rows = rdd.mapPartitions { it =>
      val serializer = new GraphSerializer()
      it.map { tuple =>
        val row = HBaseTable.getRow(tuple._2)
        serializer.deserializeVertex(row)
      }
    }

    val vertices = rows.map { vertex =>
      (vertex.id, vertex.properties)
    }

    val edges = rows.flatMap { vertex =>
      vertex.edges.map { edge =>
        org.apache.spark.graphx.Edge(edge.from, edge.to, (edge.label, edge.properties))
      }
    }

    org.apache.spark.graphx.Graph(vertices, edges)
  }
}
Example 10
Source File: HBaseBulkPutTimestampExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutTimestampExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.out.println("HBaseBulkPutTimestampExample {tableName} {columnFamily} are missing an argument")
      return
    }
    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("6"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("7"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("8"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("9"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("10"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))))

      val conf = HBaseConfiguration.create()

      val timeStamp = System.currentTimeMillis()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2,
            timeStamp, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 11
Source File: HBaseDistributedScanExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseDistributedScanExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseDistributedScanExample {tableName} missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseDistributedScanExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val scan = new Scan()
      scan.setCaching(100)

      val getRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan)

      getRdd.foreach(v => println(Bytes.toString(v._1.get())))

      println("Length: " + getRdd.map(r => r._1.copyBytes()).collect().length)
    } finally {
      sc.stop()
    }
  }
}
Example 12
Source File: HBaseBulkPutExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily} are missing arguments")
      return
    }
    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 13
Source File: HBaseBulkDeleteExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeleteExample {tableName} missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkDelete[Array[Byte]](rdd,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
}
Example 14
Source File: HBaseBulkPutExampleFromFile.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutExampleFromFile {
  def main(args: Array[String]) {
    if (args.length < 3) {
      println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile} are missing an argument")
      return
    }
    val tableName = args(0)
    val columnFamily = args(1)
    val inputFile = args(2)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " +
      tableName + " " + columnFamily + " " + inputFile)
    val sc = new SparkContext(sparkConf)

    try {
      var rdd = sc.hadoopFile(
        inputFile,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]).map(v => {
        System.out.println("reading-" + v._2.toString)
        v._2.toString
      })

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[String](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          System.out.println("hbase-" + putRecord)
          val put = new Put(Bytes.toBytes("Value- " + putRecord))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"),
            Bytes.toBytes(putRecord.length()))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 15
Source File: HBaseSparkSession.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.Analyzer
import org.apache.spark.sql.catalyst.catalog.ExternalCatalog
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.SparkPlanner
import org.apache.spark.sql.hbase.execution.{HBaseSourceAnalysis, HBaseStrategies}
import org.apache.spark.sql.internal.{BaseSessionStateBuilder, SQLConf, SessionState, SharedState}

class HBaseSparkSession(sc: SparkContext) extends SparkSession(sc) {
  self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  @transient
  override lazy val sessionState: SessionState = new HBaseSessionStateBuilder(this).build()

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override lazy val sharedState: SharedState = new HBaseSharedState(sc, this.sqlContext)
}

class HBaseSessionStateBuilder(session: SparkSession, parentState: Option[SessionState] = None)
  extends BaseSessionStateBuilder(session) {

  override lazy val conf: SQLConf = new HBaseSQLConf

  override protected def newBuilder: NewBuilder = new HBaseSessionStateBuilder(_, _)

  override lazy val experimentalMethods: ExperimentalMethods = {
    val result = new ExperimentalMethods
    result.extraStrategies = Seq((new SparkPlanner(session.sparkContext, conf, new ExperimentalMethods)
      with HBaseStrategies).HBaseDataSource)
    result
  }

  override lazy val analyzer: Analyzer = {
    new Analyzer(catalog, conf) {
      override val extendedResolutionRules: Seq[Rule[LogicalPlan]] =
        new FindDataSourceTable(session) +:
          new ResolveSQLOnFile(session) +:
          customResolutionRules

      override val postHocResolutionRules: Seq[Rule[LogicalPlan]] =
        PreprocessTableCreation(session) +:
          PreprocessTableInsertion(conf) +:
          DataSourceAnalysis(conf) +:
          HBaseSourceAnalysis(session) +:
          customPostHocResolutionRules

      override val extendedCheckRules = customCheckRules
    }
  }
}

class HBaseSharedState(sc: SparkContext, sqlContext: SQLContext) extends SharedState(sc) {
  override lazy val externalCatalog: ExternalCatalog =
    new HBaseCatalog(sqlContext, sc.hadoopConfiguration)
}
Example 16
Source File: HBaseBulkGetExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName} missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = hbaseContext.bulkGet[Array[Byte], String](
        TableName.valueOf(tableName),
        2,
        rdd,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {
          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 17
Source File: HBaseBulkPutExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily} are missing arguments")
      return
    }
    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 18
Source File: HBaseBulkDeleteExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeleteExample {tableName} is missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      rdd.hbaseBulkDelete(hbaseContext,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
}
Example 19
Source File: HBaseForeachPartitionExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseForeachPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseForeachPartitionExample {tableName} {columnFamily} are missing arguments")
      return
    }
    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseForeachPartitionExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseForeachPartition(hbaseContext,
        (it, connection) => {
          val m = connection.getBufferedMutator(TableName.valueOf(tableName))

          it.foreach(r => {
            val put = new Put(r._1)
            r._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
            m.mutate(put)
          })
          m.flush()
          m.close()
        })
    } finally {
      sc.stop()
    }
  }
}
Example 20
Source File: HBaseBulkGetExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName} is missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseBulkGet[String](hbaseContext, TableName.valueOf(tableName), 2,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {
          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 21
Source File: HBaseMapPartitionExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseMapPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseMapPartitionExample {tableName} is missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseMapPartitionExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseMapPartitions[String](hbaseContext, (it, connection) => {
        val table = connection.getTable(TableName.valueOf(tableName))
        it.map { r =>
          //batching would be faster. This is just an example
          val result = table.get(new Get(r))

          val cellIt = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (cellIt.hasNext) {
            val cell = cellIt.next()
            // CellUtil copies only the qualifier/value bytes of the cell rather than the whole backing array
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        }
      })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 22
Source File: ConfigurationBuilder.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase.config

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.kafka.common.config.ConfigException

object ConfigurationBuilder {
  def buildHBaseConfig(hBaseSettings: HBaseSettings): Configuration = {
    val configuration = HBaseConfiguration.create()

    def appendFile(file: String): Unit = {
      val hbaseFile = new File(file)
      if (!hbaseFile.exists) {
        throw new ConfigException(s"$file does not exist in provided HBase configuration directory $hbaseFile.")
      } else {
        configuration.addResource(new Path(hbaseFile.toString))
      }
    }

    hBaseSettings.hbaseConfigDir.foreach { dir =>
      appendFile(dir + s"/hbase-site.xml")
    }
    configuration
  }
}
Example 23
Source File: HbaseReaderHelper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase.writers

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import com.datamountaineer.streamreactor.connect.hbase.HbaseHelper
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}

import scala.collection.JavaConverters._

object HbaseReaderHelper {
  def createConnection: Connection = {
    ConnectionFactory.createConnection(HBaseConfiguration.create())
  }

  def getAllRecords(tableName: String, columnFamily: String)(implicit connection: Connection): List[HbaseRowData] = {
    HbaseHelper.withTable(TableName.valueOf(tableName)) { tbl =>
      val scan = new Scan()
      scan.addFamily(columnFamily.fromString())
      val scanner = tbl.getScanner(scan)
      scanner.asScala.map { rs =>
        val cells = rs.rawCells().map { cell =>
          Bytes.toString(CellUtil.cloneQualifier(cell)) -> CellUtil.cloneValue(cell)
        }.toMap
        HbaseRowData(rs.getRow, cells)
      }.toList
    }
  }
}

case class HbaseRowData(key: Array[Byte], cells: Map[String, Array[Byte]])
Example 24
Source File: HBaseUtils.scala From bigdata-examples with Apache License 2.0 | 5 votes |
package com.timeyang.common.util

import com.timeyang.common.config.BaseConf
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.hadoop.mapreduce.Job

// Note: the enclosing HBaseUtils object declaration (including its newConf() helper) is elided in this excerpt.

  def createHbaseOutputJob(tableName: String): Job = {
    val conf = HBaseUtils.newConf()
    conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    val job = Job.getInstance(conf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job
  }
}
Example 25
Source File: CreateSaltedTable.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.setup.hbase

import java.io.File

import org.apache.commons.lang.StringUtils
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.compress.Compression
import org.apache.hadoop.hbase.regionserver.{BloomType, ConstantSizeRegionSplitPolicy}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

object CreateSaltedTable {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("<tableName> <columnFamily> <regionCount> <numOfSalts> <hbaseConfigFolder>")
    }

    val tableName = args(0)
    val columnFamilyName = args(1)
    val regionCount = args(2).toInt
    val numOfSalts = args(3).toInt
    val hbaseConfigFolder = args(4)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin

    val tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName))

    val columnDescriptor = new HColumnDescriptor(columnFamilyName)
    columnDescriptor.setCompressionType(Compression.Algorithm.SNAPPY)
    columnDescriptor.setBlocksize(64 * 1024)
    columnDescriptor.setBloomFilterType(BloomType.ROW)

    tableDescriptor.addFamily(columnDescriptor)
    tableDescriptor.setMaxFileSize(Long.MaxValue)
    tableDescriptor.setRegionSplitPolicyClassName(classOf[ConstantSizeRegionSplitPolicy].getName)

    val splitKeys = new mutable.MutableList[Array[Byte]]
    for (i <- 0 to regionCount) {
      val regionSplitStr = StringUtils.leftPad((i * (numOfSalts / regionCount)).toString, 4, "0")
      splitKeys += Bytes.toBytes(regionSplitStr)
    }
    admin.createTable(tableDescriptor, splitKeys.toArray)
  }
}
Example 26
Source File: HBaseRestServer.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.server.hbase

import java.io.File

import com.sun.jersey.spi.container.servlet.ServletContainer
import org.apache.hadoop.hbase.HBaseConfiguration
import org.mortbay.jetty.Server
import org.mortbay.jetty.servlet.{Context, ServletHolder}

object HBaseRestServer {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("<port> <configDir> <numberOfSalts> <customerTableName>")
    }
    val port = args(0).toInt
    val hbaseConfigFolder = args(1)
    val numberOfSalts = args(2).toInt
    val appEventTableName = args(3)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    HBaseGlobalValues.init(conf, numberOfSalts, appEventTableName)

    val server = new Server(port)

    val sh = new ServletHolder(classOf[ServletContainer])
    sh.setInitParameter("com.sun.jersey.config.property.resourceConfigClass",
      "com.sun.jersey.api.core.PackagesResourceConfig")
    sh.setInitParameter("com.sun.jersey.config.property.packages",
      "com.cloudera.sa.taxi360.server.hbase")
    sh.setInitParameter("com.sun.jersey.api.json.POJOMappingFeature", "true")

    val context = new Context(server, "/", Context.SESSIONS)
    context.addServlet(sh, "/*")

    println("starting HBase Rest Server")
    server.start()
    println("started HBase Rest Server")
    server.join()
  }
}
Example 27
Source File: HBaseServiceLayer.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.server.hbase

import javax.ws.rs._
import javax.ws.rs.core.MediaType

import com.cloudera.sa.taxi360.model.NyTaxiYellowTrip
import com.cloudera.sa.taxi360.streaming.ingestion.hbase.TaxiTripHBaseHelper
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

@Path("rest")
class HBaseServiceLayer {

  @GET
  @Path("hello")
  @Produces(Array(MediaType.TEXT_PLAIN))
  def hello(): String = {
    "Hello World"
  }

  @GET
  @Path("vender/{venderId}/timeline")
  @Produces(Array(MediaType.APPLICATION_JSON))
  def getTripTimeLine(@PathParam("venderId") venderId: String,
                      @QueryParam("startTime") startTime: String = Long.MinValue.toString,
                      @QueryParam("endTime") endTime: String = Long.MaxValue.toString): Array[NyTaxiYellowTrip] = {

    val table = HBaseGlobalValues.connection.getTable(TableName.valueOf(HBaseGlobalValues.appEventTableName))

    val st = if (startTime == null) {
      Long.MinValue.toString
    } else {
      startTime
    }
    val et = if (endTime == null) {
      Long.MaxValue.toString
    } else {
      endTime
    }

    val scan = new Scan()
    val startRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, st.toLong, HBaseGlobalValues.numberOfSalts)
    println("startRowKey:" + Bytes.toString(startRowKey))
    scan.setStartRow(startRowKey)
    val endRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, et.toLong, HBaseGlobalValues.numberOfSalts)
    println("endRowKey:" + Bytes.toString(endRowKey))
    scan.setStopRow(endRowKey)

    val scannerIt = table.getScanner(scan).iterator()

    val tripList = new mutable.MutableList[NyTaxiYellowTrip]

    while (scannerIt.hasNext) {
      val result = scannerIt.next()
      tripList += TaxiTripHBaseHelper.convertToTaxiTrip(result)
      println("Found a trip:" + TaxiTripHBaseHelper.convertToTaxiTrip(result))
    }

    println("tripList.size:" + tripList.size)

    tripList.toArray
  }
}
Example 28
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.streaming.ingestion.hbase

import java.io.File

import com.cloudera.sa.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.solr.common.cloud.ZooKeeperException
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTaxiTripToHBase {
  def main(args: Array[String]): Unit = {
    println("Java Version:" + System.getProperty("java.version"))
    println("Java Home:" + System.getProperties().getProperty("java.home"))

    val v: ZooKeeperException = null

    if (args.length == 0) {
      println("Args: <KafkaBrokerList> " +
        "<kafkaTopicList> " +
        "<numberOfSeconds>" +
        "<runLocal>" +
        "<hbaseTable>" +
        "<numOfSalts>" +
        "<checkpointDir>" +
        "<hbaseConfigFolder>")
      return
    }

    val kafkaBrokerList = args(0)
    val kafkaTopicList = args(1)
    val numberOfSeconds = args(2).toInt
    val runLocal = args(3).equals("l")
    val tableName = args(4)
    val numOfSalts = args(5).toInt
    val checkpointFolder = args(6)
    val hbaseConfigFolder = args(7)

    println("kafkaBrokerList:" + kafkaBrokerList)
    println("kafkaTopicList:" + kafkaTopicList)
    println("numberOfSeconds:" + numberOfSeconds)
    println("runLocal:" + runLocal)
    println("tableName:" + tableName)
    println("numOfSalts:" + numOfSalts)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase")
      new SparkContext(sparkConf)
    }
    val ssc = new StreamingContext(sc, Seconds(numberOfSeconds))

    val topicsSet = kafkaTopicList.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)

    val messageStream = KafkaUtils.
      createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val conf = HBaseConfiguration.create()

    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val hbaseContext = new HBaseContext(sc, conf)

    val tripDStream = messageStream.map(r => {
      (r._1, r._2.split(","))
    }).filter(r => r._2.size > 3).map(r => {
      (r._1, NyTaxiYellowTripBuilder.build(r._2))
    })

    tripDStream.hbaseBulkPut(hbaseContext,
      TableName.valueOf(tableName),
      taxi => {
        TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts)
      })

    ssc.checkpoint(checkpointFolder)
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 29
Source File: HBaseSparkConf.scala From spark-hbase-connector with Apache License 2.0 | 5 votes |
package it.nerdammer.spark.hbase

import org.apache.hadoop.hbase.{HConstants, HBaseConfiguration}
import org.apache.spark.SparkConf

case class HBaseSparkConf(
    hbaseHost: Option[String] = None,
    hbaseXmlConfigFile: String = "hbase-site.xml") extends Serializable {

  def createHadoopBaseConfig() = {
    val conf = HBaseConfiguration.create

    val xmlFile = Option(getClass.getClassLoader.getResource(hbaseXmlConfigFile))
    xmlFile.foreach(f => conf.addResource(f))

    hbaseHost.foreach(h => conf.set(HConstants.ZOOKEEPER_QUORUM, h))
    if (Option(conf.get(HConstants.ZOOKEEPER_QUORUM)).isEmpty)
      conf.set(HConstants.ZOOKEEPER_QUORUM, HBaseSparkConf.DefaultHBaseHost)

    conf
  }
}

object HBaseSparkConf extends Serializable {
  val DefaultHBaseHost = "localhost"

  def fromSparkConf(conf: SparkConf) = {
    HBaseSparkConf(
      hbaseHost = Option(conf.get("spark.hbase.host", null))
    )
  }
}
Example 30
Source File: CreateSaltedTable.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.setup.hbase

import java.io.File

import org.apache.commons.lang.StringUtils
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.compress.Compression
import org.apache.hadoop.hbase.regionserver.{BloomType, ConstantSizeRegionSplitPolicy}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

object CreateSaltedTable {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("<tableName> <columnFamily> <regionCount> <numOfSalts> <hbaseConfigFolder>")
    }

    val tableName = args(0)
    val columnFamilyName = args(1)
    val regionCount = args(2).toInt
    val numOfSalts = args(3).toInt
    val hbaseConfigFolder = args(4)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin

    val tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName))

    val columnDescriptor = new HColumnDescriptor(columnFamilyName)
    columnDescriptor.setCompressionType(Compression.Algorithm.SNAPPY)
    columnDescriptor.setBlocksize(64 * 1024)
    columnDescriptor.setBloomFilterType(BloomType.ROW)

    tableDescriptor.addFamily(columnDescriptor)
    tableDescriptor.setMaxFileSize(Long.MaxValue)
    tableDescriptor.setRegionSplitPolicyClassName(classOf[ConstantSizeRegionSplitPolicy].getName)

    val splitKeys = new mutable.MutableList[Array[Byte]]
    for (i <- 0 to regionCount) {
      val regionSplitStr = StringUtils.leftPad((i * (numOfSalts / regionCount)).toString, 4, "0")
      splitKeys += Bytes.toBytes(regionSplitStr)
    }
    admin.createTable(tableDescriptor, splitKeys.toArray)
  }
}
Example 31
Source File: HBaseRestServer.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.server.hbase

import java.io.File

import com.sun.jersey.spi.container.servlet.ServletContainer
import org.apache.hadoop.hbase.HBaseConfiguration
import org.mortbay.jetty.Server
import org.mortbay.jetty.servlet.{Context, ServletHolder}

object HBaseRestServer {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("<port> <configDir> <numberOfSalts> <customerTableName>")
    }
    val port = args(0).toInt
    val hbaseConfigFolder = args(1)
    val numberOfSalts = args(2).toInt
    val appEventTableName = args(3)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    HBaseGlobalValues.init(conf, numberOfSalts, appEventTableName)

    val server = new Server(port)

    val sh = new ServletHolder(classOf[ServletContainer])
    sh.setInitParameter("com.sun.jersey.config.property.resourceConfigClass",
      "com.sun.jersey.api.core.PackagesResourceConfig")
    sh.setInitParameter("com.sun.jersey.config.property.packages",
      "com.hadooparchitecturebook.taxi360.server.hbase")
    sh.setInitParameter("com.sun.jersey.api.json.POJOMappingFeature", "true")

    val context = new Context(server, "/", Context.SESSIONS)
    context.addServlet(sh, "/*")

    println("starting HBase Rest Server")
    server.start()
    println("started HBase Rest Server")
    server.join()
  }
}
Example 32
Source File: HBaseServiceLayer.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.server.hbase

import javax.ws.rs._
import javax.ws.rs.core.MediaType

import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTrip
import com.hadooparchitecturebook.taxi360.streaming.ingestion.hbase.TaxiTripHBaseHelper
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

@Path("rest")
class HBaseServiceLayer {

  @GET
  @Path("hello")
  @Produces(Array(MediaType.TEXT_PLAIN))
  def hello(): String = {
    "Hello World"
  }

  @GET
  @Path("vender/{venderId}/timeline")
  @Produces(Array(MediaType.APPLICATION_JSON))
  def getTripTimeLine(@PathParam("venderId") venderId: String,
                      @QueryParam("startTime") startTime: String = Long.MinValue.toString,
                      @QueryParam("endTime") endTime: String = Long.MaxValue.toString): Array[NyTaxiYellowTrip] = {

    val table = HBaseGlobalValues.connection.getTable(TableName.valueOf(HBaseGlobalValues.appEventTableName))

    val st = if (startTime == null) {
      Long.MinValue.toString
    } else {
      startTime
    }
    val et = if (endTime == null) {
      Long.MaxValue.toString
    } else {
      endTime
    }

    val scan = new Scan()
    val startRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, st.toLong, HBaseGlobalValues.numberOfSalts)
    println("startRowKey:" + Bytes.toString(startRowKey))
    scan.setStartRow(startRowKey)
    val endRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, et.toLong, HBaseGlobalValues.numberOfSalts)
    println("endRowKey:" + Bytes.toString(endRowKey))
    scan.setStopRow(endRowKey)

    val scannerIt = table.getScanner(scan).iterator()

    val tripList = new mutable.MutableList[NyTaxiYellowTrip]

    while (scannerIt.hasNext) {
      val result = scannerIt.next()
      tripList += TaxiTripHBaseHelper.convertToTaxiTrip(result)
      println("Found a trip:" + TaxiTripHBaseHelper.convertToTaxiTrip(result))
    }

    println("tripList.size:" + tripList.size)

    tripList.toArray
  }
}
Example 33
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.streaming.ingestion.hbase

import java.io.File

import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.solr.common.cloud.ZooKeeperException
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTaxiTripToHBase {
  def main(args: Array[String]): Unit = {
    println("Java Version:" + System.getProperty("java.version"))
    println("Java Home:" + System.getProperties().getProperty("java.home"))

    val v: ZooKeeperException = null

    if (args.length == 0) {
      println("Args: <KafkaBrokerList> " +
        "<kafkaTopicList> " +
        "<numberOfSeconds>" +
        "<runLocal>" +
        "<hbaseTable>" +
        "<numOfSalts>" +
        "<checkpointDir>" +
        "<hbaseConfigFolder>")
      return
    }

    val kafkaBrokerList = args(0)
    val kafkaTopicList = args(1)
    val numberOfSeconds = args(2).toInt
    val runLocal = args(3).equals("l")
    val tableName = args(4)
    val numOfSalts = args(5).toInt
    val checkpointFolder = args(6)
    val hbaseConfigFolder = args(7)

    println("kafkaBrokerList:" + kafkaBrokerList)
    println("kafkaTopicList:" + kafkaTopicList)
    println("numberOfSeconds:" + numberOfSeconds)
    println("runLocal:" + runLocal)
    println("tableName:" + tableName)
    println("numOfSalts:" + numOfSalts)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase")
      new SparkContext(sparkConf)
    }
    val ssc = new StreamingContext(sc, Seconds(numberOfSeconds))

    val topicsSet = kafkaTopicList.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)

    val messageStream = KafkaUtils.
      createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val conf = HBaseConfiguration.create()

    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val hbaseContext = new HBaseContext(sc, conf)

    val tripDStream = messageStream.map(r => {
      (r._1, r._2.split(","))
    }).filter(r => r._2.size > 3).map(r => {
      (r._1, NyTaxiYellowTripBuilder.build(r._2))
    })

    tripDStream.hbaseBulkPut(hbaseContext,
      TableName.valueOf(tableName),
      taxi => {
        TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts)
      })

    ssc.checkpoint(checkpointFolder)
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 34
Source File: HBaseSQLContext.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.OverrideCatalog
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan}
import org.apache.spark.sql.hbase.execution.{AddCoprocessor, HBaseStrategies}

class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) {
  self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override protected[sql] lazy val catalog: HBaseCatalog =
    new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog

  experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource)

  @transient
  override protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
    val batches =
      Batch("Add exchange", Once, EnsureRequirements(self)) ::
        Batch("Add coprocessor", Once, AddCoprocessor(self)) :: Nil
  }
}
Example 35
Source File: Util.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util

import java.io._
import java.util.concurrent.atomic.AtomicInteger
import java.util.zip.{DeflaterOutputStream, InflaterInputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.HBaseConfiguration

object Util {
  val iteration = new AtomicInteger(0)

  def getTempFilePath(conf: Configuration, prefix: String): String = {
    val fileSystem = FileSystem.get(conf)
    val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}")
    if (fileSystem.exists(path)) {
      fileSystem.delete(path, true)
    }
    path.getName
  }

  def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = {
    val bos = new ByteArrayOutputStream
    val deflaterOutputStream = new DeflaterOutputStream(bos)
    val dos = new DataOutputStream(deflaterOutputStream)
    configuration.write(dos)
    dos.close()
    bos.toByteArray
  }

  def deserializeHBaseConfiguration(arr: Array[Byte]) = {
    val conf = HBaseConfiguration.create
    conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr))))
    conf
  }
}
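Not from the project itself: a minimal round-trip sketch of the two serialization helpers above, useful when a compressed configuration has to travel with a Spark job. The object name, the quorum value, and the property chosen are illustrative assumptions.

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.sql.hbase.util.Util

object ConfigRoundTrip {
  def main(args: Array[String]): Unit = {
    // Build a configuration and set one illustrative property.
    val original = HBaseConfiguration.create()
    original.set("hbase.zookeeper.quorum", "zk1,zk2,zk3") // assumed quorum, for illustration only

    // Compress the configuration to bytes, e.g. before shipping it to executors.
    val bytes = Util.serializeHBaseConfiguration(original)

    // Restore it and confirm the property survived the round trip.
    val restored = Util.deserializeHBaseConfiguration(bytes)
    println(restored.get("hbase.zookeeper.quorum"))
  }
}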
Example 36
Source File: HbRddConfig.scala From hbrdd with Apache License 2.0 | 5 votes |
package top.spoofer.hbrdd.config

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration

class HbRddConfig(config: Configuration) extends Serializable {
  def getHbaseConfig = HBaseConfiguration.create(config)
}

object HbRddConfig {
  type configOption = (String, String)

  private[HbRddConfig] case class HbaseOption(name: String, value: String)

  def apply(config: Configuration): HbRddConfig = new HbRddConfig(config)

  def apply(configs: configOption*): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()

    for {
      option <- configs
      hbOption = HbaseOption(option._1, option._2) // the case class is used only to make the intent clearer
    } hbConfig.set(hbOption.name, hbOption.value)

    this.apply(hbConfig)
  }

  def apply(configs: { def rootDir: String; def quorum: String }): HbRddConfig = {
    apply(
      "hbase.rootdir" -> configs.rootDir,
      "hbase.zookeeper.quorum" -> configs.quorum
    )
  }

  def apply(configs: Map[String, String]): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()
    configs.keys foreach { name => hbConfig.set(name, configs(name)) }
    this.apply(hbConfig)
  }

  def apply(configs: TraversableOnce[configOption]): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()
    configs foreach { option =>
      val hbOption = HbaseOption(option._1, option._2)
      hbConfig.set(hbOption.name, hbOption.value)
    }
    this.apply(hbConfig)
  }
}
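As a usage illustration only (the object name, rootdir, and quorum values below are assumptions, not taken from hbrdd), the varargs and Map overloads above can be exercised like this:

import top.spoofer.hbrdd.config.HbRddConfig

object HbRddConfigUsage {
  def main(args: Array[String]): Unit = {
    // Key/value pairs through the varargs overload; the addresses are placeholders.
    val fromPairs = HbRddConfig(
      "hbase.rootdir" -> "hdfs://namenode:8020/hbase",
      "hbase.zookeeper.quorum" -> "zk1,zk2,zk3"
    )

    // The same settings supplied as a Map.
    val fromMap = HbRddConfig(Map(
      "hbase.rootdir" -> "hdfs://namenode:8020/hbase",
      "hbase.zookeeper.quorum" -> "zk1,zk2,zk3"
    ))

    // Either value hands back a plain HBase Configuration when one is needed.
    println(fromPairs.getHbaseConfig.get("hbase.zookeeper.quorum"))
    println(fromMap.getHbaseConfig.get("hbase.rootdir"))
  }
}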
Example 37
Source File: SparkManager.scala From darwin with Apache License 2.0 | 5 votes |
package it.agilelab.darwin.app.spark

import com.typesafe.config.Config
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._

trait SparkManager {

  val sparkManagerLogger: Logger = LoggerFactory.getLogger("SparkManager")

  protected def defaultParallelism(implicit sparkSession: SparkSession, config: Config): Int = {
    sparkSession.conf.getOption(SparkConfigurationKeys.SPARK_EXECUTOR_INSTANCES) match {
      case Some(instances) =>
        sparkSession.conf.getOption(SparkConfigurationKeys.SPARK_CORES).getOrElse("1").toInt * instances.toInt
      case None =>
        sparkManagerLogger.info("Spark is configured with dynamic allocation, default parallelism will be gathered from app " +
          "conf: " + "next.process.parallelism")
        if (config.hasPath(SparkConfigurationKeys.PARALLELISM)) {
          config.getInt(SparkConfigurationKeys.PARALLELISM)
        } else {
          sparkManagerLogger.info("next.process.parallelism was not set fallback to sparkSession.defaultParallelism")
          sparkSession.sparkContext.defaultParallelism
        }
    }
  }
}
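A minimal sketch, assuming the darwin application module is on the classpath, of how a job object might mix in SparkManager and read the computed parallelism. The object name, the local master, and the ConfigFactory.load() call are assumptions made for illustration, not part of the original project.

import com.typesafe.config.{Config, ConfigFactory}
import org.apache.spark.sql.SparkSession
import it.agilelab.darwin.app.spark.SparkManager

object ParallelismDemo extends SparkManager {
  def main(args: Array[String]): Unit = {
    implicit val spark: SparkSession = SparkSession.builder()
      .master("local[*]") // local master assumed for the sketch
      .appName("ParallelismDemo")
      .getOrCreate()
    implicit val config: Config = ConfigFactory.load()

    // Falls back through executor settings, the app config key, then Spark's own default.
    println(s"default parallelism: $defaultParallelism")

    spark.stop()
  }
}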
Example 38
Source File: L6-16SparkHBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object SparkHBaseBulkPutApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: SparkHBaseBulkPutApp <appname> <tableName> <columnFamilyName> <columnName>")
      System.exit(1)
    }
    val Seq(appName, tableName, columnFamilyName, columnName) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val batchInterval = 10
    val windowSize = 20
    val slideInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    val hbaseConf = HBaseConfiguration.create()
    val hContext = new HBaseContext(ssc.sparkContext, hbaseConf)

    val windowed = HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat))
      })
      .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval))

    hContext.streamBulkPut[(String, Float)](windowed,
      TableName.valueOf(tableName),
      rec => {
        val put = new Put(rec._1.getBytes)
        put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval)))
        put
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 39
Source File: L6-14HBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.io.Text
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object HBaseSinkApp {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: HBaseSinkApp <appname> <hbaseMaster> <tableName> <columnFamilyName> <columnName>")
      System.exit(1)
    }
    val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val batchInterval = 10
    val windowSize = 20
    val slideInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat))
      })
      .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval))
      .foreachRDD(rdd => {
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
        hbaseConf.set("hbase.master", hbaseMaster)
        val jobConf = new Configuration(hbaseConf)
        jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName)
        rdd.map(rec => {
          val put = new Put(rec._1.getBytes)
          put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval)))
          (rec._1, put)
        }).saveAsNewAPIHadoopDataset(jobConf)
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 40
Source File: HBaseReaders.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert

import com.kakao.mango.util.Conversions._
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions._

trait HBaseReaders {
  val sc: SparkContext

  def hbaseTable(quorum: String, table: String): RDD[(String, ((String, String), (Long, String)))] = {
    hbaseTableBinary(quorum, table).map {
      case (rowkey, ((family, qualifier), (timestamp, value))) =>
        (rowkey.string, ((family.string, qualifier.string), (timestamp, value.string)))
    }
  }

  def hbaseColumnBinary(quorum: String, table: String, family: Array[Byte], qualifier: Array[Byte]): RDD[(Array[Byte], (Long, Array[Byte]))] = {
    hbaseTableBinary(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family.sameElements(f) && qualifier.sameElements(q) => (rowkey, cell)
    }
  }

  def hbaseColumn(quorum: String, table: String, family: String, qualifier: String): RDD[(String, (Long, String))] = {
    hbaseTable(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family == f && qualifier == q => (rowkey, cell)
    }
  }
}
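For orientation only: a hedged sketch of wiring the trait above to a SparkContext and reading one column, assuming the rest of the cuesheet convert package (including hbaseTableBinary, which the snippet references but does not show) is on the classpath. The quorum, table, family, and qualifier strings are placeholders, and the object name is invented for the example.

import org.apache.spark.{SparkConf, SparkContext}
import com.kakao.cuesheet.convert.HBaseReaders

object HBaseReadersUsage {
  def main(args: Array[String]): Unit = {
    val context = new SparkContext(new SparkConf().setAppName("HBaseReadersUsage"))

    // The trait only needs a SparkContext to be supplied.
    val readers = new HBaseReaders { val sc: SparkContext = context }

    // Every cell as (rowkey, ((family, qualifier), (timestamp, value))).
    val allCells = readers.hbaseTable("zk1,zk2,zk3", "my_table")

    // A single column, decoded to strings.
    val oneColumn = readers.hbaseColumn("zk1,zk2,zk3", "my_table", "d", "name")
    println(oneColumn.count())

    context.stop()
  }
}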
Example 41
Source File: HBasePutDriver.scala From akka-nbench with Apache License 2.0 | 5 votes |
package bench.drivers

import akka.actor._
import com.typesafe.config._

import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util._
import org.apache.hadoop.hbase.HBaseConfiguration

class HBasePutDriver(operation: String, stats: ActorRef, config: Config) extends Driver(operation, stats, config) {

  var conn: Connection = _
  var table: Table = _

  val nsName = "ns"
  val tableName = s"${nsName}:tbl"
  val colFamilies = List("fam")

  override val getOperation = () => {
    operation match {
      case "put" => putTest _
    }
  }

  // https://github.com/apache/hbase/blob/master/hbase-server/src/test/java/org/apache/hadoop/hbase/TestNamespace.java
  override def setup(): Boolean = {
    val conf = HBaseConfiguration.create
    this.conn = ConnectionFactory.createConnection(conf)
    val admin = this.conn.getAdmin
    try {
      admin.createNamespace(NamespaceDescriptor.create(nsName).build());
    } catch {
      case e: NamespaceExistException => {
        log.info(s"namespace: ${nsName} already exists")
      }
    }

    val tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName))
    try {
      colFamilies.foreach { fam =>
        tableDescriptor.addFamily(new HColumnDescriptor(fam))
      }
      admin.createTable(tableDescriptor)
      log.info(s"table: ${tableName} created")
    } catch {
      case e: TableExistsException => {
        log.info(s"table: ${tableName} already exists")
      }
    }
    this.table = this.conn.getTable(TableName.valueOf(tableName))
    true
  }

  override def teardown(): Boolean = {
    this.table.close()
    this.conn.close()
    true
  }

  // https://github.com/hbasebook101/hbasebook101/blob/master/ch04/java-api-examples/src/main/java/example/PutExample.java
  // https://github.com/xldrx/hbase_examples/pull/1/files
  def putTest(): (Boolean, Long, Long) = {
    val start = System.currentTimeMillis
    try {
      val put = new Put(Bytes.toBytes("row-" + start))
      put.addColumn(Bytes.toBytes("fam"), Bytes.toBytes("col1"), Bytes.toBytes("value1"))
      put.addColumn(Bytes.toBytes("fam"), Bytes.toBytes("col2"), 100L, Bytes.toBytes("value1"))
      table.put(put)
      val endAt = System.currentTimeMillis
      val elapsedMillis = endAt - start
      (true, endAt, elapsedMillis)
    } catch {
      case e: Throwable => {
        log.error("" + e)
        val endAt = System.currentTimeMillis
        val elapsedMillis = endAt - start
        (false, endAt, elapsedMillis)
      }
    }
  }
}
Example 42
Source File: HBaseTest.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples

import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat

import org.apache.spark._

object HBaseTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HBaseTest")
    val sc = new SparkContext(sparkConf)

    val conf = HBaseConfiguration.create()
    // Other options for configuring scan behavior are available. More information available at
    // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html
    conf.set(TableInputFormat.INPUT_TABLE, args(0))

    // Initialize hBase table if necessary
    val admin = new HBaseAdmin(conf)
    if (!admin.isTableAvailable(args(0))) {
      val tableDesc = new HTableDescriptor(TableName.valueOf(args(0)))
      admin.createTable(tableDesc)
    }

    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    hBaseRDD.count()

    sc.stop()
  }
}
Example 43
Source File: Util.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util

import java.io._
import java.util.concurrent.atomic.AtomicInteger
import java.util.zip.{DeflaterOutputStream, InflaterInputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.HBaseConfiguration

object Util {
  val iteration = new AtomicInteger(0)

  def dropTempFilePath(conf: Configuration, path: String): Boolean = {
    val fileSystem = FileSystem.get(conf)
    val filePath = new Path(path)
    if (fileSystem.exists(filePath)) {
      fileSystem.delete(filePath, true)
    } else {
      false
    }
  }

  def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = {
    val bos = new ByteArrayOutputStream
    val deflaterOutputStream = new DeflaterOutputStream(bos)
    val dos = new DataOutputStream(deflaterOutputStream)
    configuration.write(dos)
    dos.close()
    bos.toByteArray
  }

  def deserializeHBaseConfiguration(arr: Array[Byte]) = {
    val conf = HBaseConfiguration.create
    conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr))))
    conf
  }
}
Example 44
Source File: HBaseBulkPutTimestampExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf

object HBaseBulkPutTimestampExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.out.println("HBaseBulkPutTimestampExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("6"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("7"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("8"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("9"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("10"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))))

      val conf = HBaseConfiguration.create()

      val timeStamp = System.currentTimeMillis()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2,
            timeStamp, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 45
Source File: HBaseDistributedScanExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Scan
import org.apache.spark.SparkConf

object HBaseDistributedScanExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseDistributedScanExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseDistributedScanExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val scan = new Scan()
      scan.setCaching(100)

      val getRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan)

      getRdd.foreach(v => println(Bytes.toString(v._1.get())))

      println("Length: " + getRdd.map(r => r._1.copyBytes()).collect().length);
      //.collect().foreach(v => println(Bytes.toString(v._1.get())))
    } finally {
      sc.stop()
    }
  }
}
Example 46
Source File: HBaseBulkPutExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf

object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        });
    } finally {
      sc.stop()
    }
  }
}
Example 47
Source File: HBaseBulkDeleteExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Delete
import org.apache.spark.SparkConf

object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeletesExample {tableName} ")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkDelete[Array[Byte]](rdd,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
}
Example 48
Source File: HBaseBulkPutExampleFromFile.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.spark.SparkConf

object HBaseBulkPutExampleFromFile {
  def main(args: Array[String]) {
    if (args.length < 3) {
      println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)
    val inputFile = args(2)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " +
      tableName + " " + columnFamily + " " + inputFile)
    val sc = new SparkContext(sparkConf)

    try {
      var rdd = sc.hadoopFile(
        inputFile,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]).map(v => {
        System.out.println("reading-" + v._2.toString)
        v._2.toString
      })

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[String](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          System.out.println("hbase-" + putRecord)
          val put = new Put(Bytes.toBytes("Value- " + putRecord))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"),
            Bytes.toBytes(putRecord.length()))
          put
        });
    } finally {
      sc.stop()
    }
  }
}
Example 49
Source File: HBaseBulkGetExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.spark.SparkConf

object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = hbaseContext.bulkGet[Array[Byte], String](
        TableName.valueOf(tableName),
        2,
        rdd,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {
          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 50
Source File: HBaseBulkPutExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.{SparkConf, SparkContext}

object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseBulkPut(hbaseContext,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 51
Source File: HBaseBulkDeleteExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkContext, SparkConf}

object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeletesExample {tableName} ")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseBulkDelete(hbaseContext,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
}
Example 52
Source File: HBaseForeachPartitionExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkContext, SparkConf}

object HBaseForeachPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseForeachPartitionExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseForeachPartitionExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseForeachPartition(hbaseContext,
        (it, connection) => {
          val m = connection.getBufferedMutator(TableName.valueOf(tableName))

          it.foreach(r => {
            val put = new Put(r._1)
            r._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
            m.mutate(put)
          })
          m.flush()
          m.close()
        })
    } finally {
      sc.stop()
    }
  }
}
Example 53
Source File: HBaseBulkGetExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.{Result, Get}
import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.spark.{SparkContext, SparkConf}

object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseBulkGet[String](hbaseContext,
        TableName.valueOf(tableName),
        2,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {
          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 54
Source File: HBaseMapPartitionExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkContext, SparkConf}

object HBaseMapPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseMapPartitionExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseMapPartitionExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseMapPartitions[String](hbaseContext, (it, connection) => {
        val table = connection.getTable(TableName.valueOf(tableName))
        it.map{r =>
          //batching would be faster. This is just an example
          val result = table.get(new Get(r))

          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        }
      })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 55
Source File: HBaseStreamingBulkPutExample.scala From SparkOnHBase with Apache License 2.0 | 4 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.SparkConf

object HBaseStreamingBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 4) {
      println("HBaseStreamingBulkPutExample " +
        "{host} {port} {tableName} {columnFamily}")
      return
    }

    val host = args(0)
    val port = args(1)
    val tableName = args(2)
    val columnFamily = args(3)

    val sparkConf = new SparkConf().setAppName("HBaseStreamingBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val ssc = new StreamingContext(sc, Seconds(1))

      val lines = ssc.socketTextStream(host, port.toInt)

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      hbaseContext.streamBulkPut[String](lines,
        TableName.valueOf(tableName),
        (putRecord) => {
          if (putRecord.length() > 0) {
            val put = new Put(Bytes.toBytes(putRecord))
            put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar"))
            put
          } else {
            null
          }
        })
      ssc.start()
      ssc.awaitTerminationOrTimeout(60000)
    } finally {
      sc.stop()
    }
  }
}