org.apache.hadoop.hbase.mapreduce.TableInputFormat Scala Examples

The following examples show how to use org.apache.hadoop.hbase.mapreduce.TableInputFormat in Scala. Each example is taken from an open-source project; the source file, project, and license are noted in the line above each listing.
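Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: point TableInputFormat at a table through the Hadoop configuration and hand it to SparkContext.newAPIHadoopRDD. The ZooKeeper quorum and table name below are placeholders, not values taken from any of the projects.

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.{SparkConf, SparkContext}

object TableInputFormatSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TableInputFormatSketch"))

    // Tell the input format which cluster and table to read; both values are placeholders.
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3")
    conf.set(TableInputFormat.INPUT_TABLE, "mytable")

    // Each record is (row key, full row): ImmutableBytesWritable -> Result.
    val rdd = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result])

    println(s"rows: ${rdd.count()}")
    sc.stop()
  }
}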
Example 1
Source File: HBase.scala    From AI    with Apache License 2.0
package com.bigchange.hbase

import com.bigchange.util.HBaseUtil._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Result, _}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.SparkContext


  def existRowKey(row: String, table: Table): Boolean = {

    val get = new Get(Bytes.toBytes(row))
    val result = table.get(get)

    if (result.isEmpty) {
      warn("hbase table does not have this data, executing insert")
      return false
    }

    true

  }

  def getConfiguration = if(hBaseConfiguration == null) {
      warn("hbase setDefaultConfiguration....")
      setDefaultConfiguration
    } else  hBaseConfiguration

  def setDefaultConfiguration = {

    hBaseConfiguration = HBaseConfiguration.create

    // Options that need to be set for local testing; on a cluster they are picked up automatically from the corresponding configuration files
    hBaseConfiguration.set("fs.defaultFS", "hdfs://ns1") // path of the HDFS nameservice
    hBaseConfiguration.set("dfs.nameservices", "ns1")
    hBaseConfiguration.set("dfs.ha.namenodes.ns1", "nn1,nn2") // namenodes of the nameservice
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn1", "server3:9000") // namenode RPC address
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn2", "server4:9000") // namenode RPC address
    // implementation class for automatic namenode failover
    hBaseConfiguration.set("dfs.client.failover.proxy.provider.ns1", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider")
    hBaseConfiguration.set("hbase.rootdir", "hdfs://ns1/hbase")
    hBaseConfiguration.set("hbase.zookeeper.quorum", "server0,server1,server2")
    hBaseConfiguration.set("hbase.zookeeper.property.clientPort", "2181")

    hBaseConfiguration

  }

} 
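The listing above is an excerpt: the enclosing class declaration is not shown, and the imports of TableInputFormat, ProtobufUtil, Base64 and SparkContext suggest that the full class also reads tables into Spark RDDs. Below is only a sketch of what such members could look like on top of getConfiguration; the names convertScanToString and getHBaseRDD are assumptions, not the project's actual API.

  // Hypothetical additions to the class above: read a table into a Spark RDD via TableInputFormat.

  // TableInputFormat expects the Scan protobuf-serialized and Base64-encoded under TableInputFormat.SCAN.
  def convertScanToString(scan: Scan): String =
    Base64.encodeBytes(ProtobufUtil.toScan(scan).toByteArray)

  // Returns an RDD of (row key, row Result) pairs for the given table, restricted by the Scan.
  def getHBaseRDD(sc: SparkContext, tableName: String, scan: Scan) = {
    val conf = getConfiguration
    conf.set(TableInputFormat.INPUT_TABLE, tableName)
    conf.set(TableInputFormat.SCAN, convertScanToString(scan))
    sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result])
  }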
Example 2
Source File: HBaseReaders.scala    From cuesheet    with Apache License 2.0
package com.kakao.cuesheet.convert

import com.kakao.mango.util.Conversions._
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions._

trait HBaseReaders {
  val sc: SparkContext

  
  def hbaseTable(quorum: String, table: String): RDD[(String, ((String, String), (Long, String)))] = {
    hbaseTableBinary(quorum, table).map {
      case (rowkey, ((family, qualifier), (timestamp, value))) =>
        (rowkey.string, ((family.string, qualifier.string), (timestamp, value.string)))
    }
  }

  def hbaseColumnBinary(quorum: String, table: String, family: Array[Byte], qualifier: Array[Byte]): RDD[(Array[Byte], (Long, Array[Byte]))] = {
    hbaseTableBinary(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family.sameElements(f) && qualifier.sameElements(q) => (rowkey, cell)
    }
  }

  def hbaseColumn(quorum: String, table: String, family: String, qualifier: String): RDD[(String, (Long, String))] = {
    hbaseTable(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family == f && qualifier == q => (rowkey, cell)
    }
  }
} 
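hbaseTable and hbaseColumnBinary above delegate to hbaseTableBinary, which is not included in this excerpt. The following is a minimal sketch of such a member, assuming it flattens every cell of every row into (rowkey, ((family, qualifier), (timestamp, value))) tuples; the actual cuesheet implementation may differ, and the sketch needs an extra import of org.apache.hadoop.hbase.CellUtil.

  def hbaseTableBinary(quorum: String, table: String): RDD[(Array[Byte], ((Array[Byte], Array[Byte]), (Long, Array[Byte])))] = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", quorum)
    conf.set(TableInputFormat.INPUT_TABLE, table)

    sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result]).flatMap { case (rowkey, result) =>
        // copy the bytes out of the reused writables, emitting one output record per cell
        val key = rowkey.copyBytes()
        result.rawCells().map { cell =>
          (key, ((CellUtil.cloneFamily(cell), CellUtil.cloneQualifier(cell)),
            (cell.getTimestamp, CellUtil.cloneValue(cell))))
        }
    }
  }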
Example 3
Source File: HBaseTest.scala    From iolap    with Apache License 2.0
package org.apache.spark.examples

import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat

import org.apache.spark._


object HBaseTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HBaseTest")
    val sc = new SparkContext(sparkConf)
    val conf = HBaseConfiguration.create()
    // Other options for configuring scan behavior are available. More information available at
    // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html
    conf.set(TableInputFormat.INPUT_TABLE, args(0))

    // Initialize HBase table if necessary
    val admin = new HBaseAdmin(conf)
    if (!admin.isTableAvailable(args(0))) {
      val tableDesc = new HTableDescriptor(TableName.valueOf(args(0)))
      admin.createTable(tableDesc)
    }

    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    hBaseRDD.count()

    admin.close()
    sc.stop()
  }
} 
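The HBaseAdmin constructor used above is deprecated in later HBase 1.x releases and removed in 2.0. Here is a hedged sketch of the same check-and-create step with the Connection-based Admin API available since HBase 1.0; it is meant as a drop-in replacement for the admin block above, not code from the iolap project.

    import org.apache.hadoop.hbase.client.ConnectionFactory

    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin
    try {
      val tableName = TableName.valueOf(args(0))
      if (!admin.tableExists(tableName)) {
        admin.createTable(new HTableDescriptor(tableName))
      }
    } finally {
      admin.close()
      connection.close()
    }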
Example 4
Source File: HBaseTest.scala    From BigDatalog    with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat

import org.apache.spark._


object HBaseTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HBaseTest")
    val sc = new SparkContext(sparkConf)

    // please ensure HBASE_CONF_DIR is on the classpath of the Spark driver,
    // e.g. set it through the spark.driver.extraClassPath property
    // in spark-defaults.conf or through the --driver-class-path
    // command-line option of spark-submit

    val conf = HBaseConfiguration.create()

    if (args.length < 1) {
      System.err.println("Usage: HBaseTest <table_name>")
      System.exit(1)
    }

    // Other options for configuring scan behavior are available. More information available at
    // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html
    conf.set(TableInputFormat.INPUT_TABLE, args(0))

    // Initialize HBase table if necessary
    val admin = new HBaseAdmin(conf)
    if (!admin.isTableAvailable(args(0))) {
      val tableDesc = new HTableDescriptor(TableName.valueOf(args(0)))
      admin.createTable(tableDesc)
    }

    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    hBaseRDD.count()

    sc.stop()
    admin.close()
  }
}
// scalastyle:on println 
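count() only verifies that the scan is wired up; to read actual cell values out of the Result objects, something along these lines could be inserted before sc.stop() in the example above. The column family "cf" and qualifier "col" are placeholders, not columns from any real table.

    import org.apache.hadoop.hbase.util.Bytes

    // Extract one column per row; rows that lack the column yield None.
    val sample = hBaseRDD.map { case (rowkey, result) =>
      val key = Bytes.toString(rowkey.copyBytes())
      val value = Option(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("col"))).map(b => Bytes.toString(b))
      (key, value)
    }
    sample.take(10).foreach(println)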
Example 5
Source File: HogHBaseReputation.scala    From hogzilla    with GNU General Public License v2.0
package org.hogzilla.hbase




import scala.math.random
import java.lang.Math
import org.apache.spark._
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.mllib.regression.{LabeledPoint,LinearRegressionModel,LinearRegressionWithSGD}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
import org.apache.hadoop.hbase.filter.BinaryComparator
import org.apache.hadoop.hbase.filter.FilterList
import org.apache.hadoop.hbase.filter.CompareFilter
import java.util.ArrayList
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.filter.Filter
import scala.collection.mutable.HashSet
import org.apache.hadoop.hbase.client.Put


object HogHBaseReputation {

  // Ex: MX, whitelist
  def getReputationList(listName: String, listType: String): Set[String] = {
    val list = new HashSet[String]

    val filters: ArrayList[Filter] = new ArrayList()

    val colValFilter1 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list_type"),
      CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listType)))
    colValFilter1.setFilterIfMissing(false)

    val colValFilter2 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list"),
      CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listName)))
    colValFilter2.setFilterIfMissing(false)

    filters.add(colValFilter1)
    filters.add(colValFilter2)

    val filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL, filters)
    val scan = new Scan()
    scan.setFilter(filterList)

    val it = HogHBaseRDD.hogzilla_reputation.getScanner(scan).iterator()

    while (it.hasNext()) {
      list.add(Bytes.toString(it.next().getValue(Bytes.toBytes("rep"), Bytes.toBytes("ip"))))
    }

    list.toSet
  }

  def saveReputationList(listName: String, listType: String, ip: String) = {
    val put = new Put(Bytes.toBytes(ip + "-" + listName + "-" + listType))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), Bytes.toBytes(listType))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("list"), Bytes.toBytes(listName))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("ip"), Bytes.toBytes(ip))

    HogHBaseRDD.hogzilla_reputation.put(put)
  }

} 
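getReputationList above runs its filtered scan through a plain client-side scanner rather than TableInputFormat. To distribute the same scan over Spark, the Scan (filters included) can be serialized into the job configuration. The sketch below is a member that could be added to the object above, assuming the reputation table name is available as a string and a SparkContext is at hand; it is not part of the hogzilla code.

  // Hypothetical Spark variant of getReputationList: same filtered scan, run through TableInputFormat.
  def getReputationListRDD(sc: SparkContext, tableName: String, scan: Scan): RDD[String] = {
    val conf = HBaseConfiguration.create()
    conf.set(TableInputFormat.INPUT_TABLE, tableName)
    // TableInputFormat reads its Scan from the configuration as a Base64-encoded protobuf.
    conf.set(TableInputFormat.SCAN,
      org.apache.hadoop.hbase.util.Base64.encodeBytes(
        org.apache.hadoop.hbase.protobuf.ProtobufUtil.toScan(scan).toByteArray))

    sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
      .map { case (_, result) => Bytes.toString(result.getValue(Bytes.toBytes("rep"), Bytes.toBytes("ip"))) }
  }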
Example 6
Source File: GraphX.scala    From unicorn    with Apache License 2.0
package unicorn.narwhal.graph

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext

import unicorn.bigtable.hbase.HBaseTable
import unicorn.json._
import unicorn.unibase.graph.{ReadOnlyGraph, GraphSerializer, GraphVertexColumnFamily, GraphOutEdgeColumnFamily}


  def graphx(sc: SparkContext): org.apache.spark.graphx.Graph[JsObject, (String, JsValue)] = {

    val conf = HBaseConfiguration.create()
    conf.set(TableInputFormat.INPUT_TABLE, name)
    conf.setInt(TableInputFormat.SCAN_CACHEDROWS, 500)
    conf.setBoolean(TableInputFormat.SCAN_CACHEBLOCKS, false)
    conf.set(TableInputFormat.SCAN_COLUMNS, s"$GraphVertexColumnFamily $GraphOutEdgeColumnFamily")

    val rdd = sc.newAPIHadoopRDD(
      conf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result]
    )

    val rows = rdd.mapPartitions { it =>
      val serializer = new GraphSerializer()
      it.map { tuple =>
        val row = HBaseTable.getRow(tuple._2)
        serializer.deserializeVertex(row)
      }
    }

    val vertices = rows.map { vertex =>
      (vertex.id, vertex.properties)
    }

    val edges = rows.flatMap { vertex =>
      vertex.edges.map { edge =>
        org.apache.spark.graphx.Edge(edge.from, edge.to, (edge.label, edge.properties))
      }
    }

    org.apache.spark.graphx.Graph(vertices, edges)
  }
}
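This listing is also an excerpt: the enclosing class, which supplies name and the closing brace, is not shown. Once graphx(sc) has produced a GraphX graph, the usual graph algorithms apply directly. A hypothetical usage, assuming graph is an instance of the enclosing class and sc is a live SparkContext:

    // Rank vertices of the loaded property graph and print the ten highest-ranked.
    val g = graph.graphx(sc)
    val ranks = g.pageRank(0.001).vertices   // (vertex id, rank)
    ranks.sortBy(-_._2).take(10).foreach(println)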