org.apache.hadoop.hbase.HBaseConfiguration Scala Examples
The following examples show how to use org.apache.hadoop.hbase.HBaseConfiguration.
Each snippet comes from an open-source project; the project name and license are noted above it.
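Before the per-project examples, here is a minimal, self-contained sketch of the pattern most of them share: HBaseConfiguration.create() loads hbase-default.xml and hbase-site.xml from the classpath, individual properties can then be overridden, and the resulting Configuration is handed to ConnectionFactory. The quorum host, table name, and row key below are placeholders, not values taken from any of the listed projects.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes

object HBaseConfigurationQuickStart {
  def main(args: Array[String]): Unit = {
    // create() merges hbase-default.xml and hbase-site.xml found on the classpath
    val conf = HBaseConfiguration.create()
    // explicit overrides, useful when no hbase-site.xml is on the classpath (placeholder values)
    conf.set("hbase.zookeeper.quorum", "localhost")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    val connection = ConnectionFactory.createConnection(conf)
    try {
      val table = connection.getTable(TableName.valueOf("example_table")) // hypothetical table
      val result = table.get(new Get(Bytes.toBytes("row1")))              // hypothetical row key
      println("row exists: " + !result.isEmpty)
      table.close()
    } finally {
      connection.close()
    }
  }
}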
Example 1
Source File: HBase.scala From AI with Apache License 2.0 | 6 votes |
package com.bigchange.hbase

import com.bigchange.util.HBaseUtil._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Result, _}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.spark.SparkContext

// Note: the enclosing class declaration (which holds the hBaseConfiguration field) is elided in this excerpt.

  def existRowKey(row: String, table: Table): Boolean = {
    val get = new Get(row.getBytes())
    val result = table.get(get)
    if (result.isEmpty) {
      warn("hbase table don't have this data,execute insert")
      return false
    }
    true
  }

  def getConfiguration = if (hBaseConfiguration == null) {
    warn("hbase setDefaultConfiguration....")
    setDefaultConfiguration
  } else hBaseConfiguration

  def setDefaultConfiguration = {
    hBaseConfiguration = HBaseConfiguration.create
    // Options needed for local testing; on a cluster they are picked up automatically from the config files on the classpath
    hBaseConfiguration.set("fs.defaultFS", "hdfs://ns1")       // nameservice path
    hBaseConfiguration.set("dfs.nameservices", "ns1")
    hBaseConfiguration.set("dfs.ha.namenodes.ns1", "nn1,nn2")  // namenode list
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn1", "server3:9000") // namenode RPC address
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn2", "server4:9000") // namenode RPC address
    // implementation class for automatic namenode failover
    hBaseConfiguration.set("dfs.client.failover.proxy.provider.ns1",
      "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider")
    hBaseConfiguration.set("hbase.rootdir", "hdfs://ns1/hbase")
    hBaseConfiguration.set("hbase.zookeeper.quorum", "server0,server1,server2")
    hBaseConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
    hBaseConfiguration
  }
}
Example 2
Source File: HBaseStreamingBulkPutExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseStreamingBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 4) {
      println("HBaseStreamingBulkPutExample " +
        "{host} {port} {tableName} {columnFamily} are missing an argument")
      return
    }
    val host = args(0)
    val port = args(1)
    val tableName = args(2)
    val columnFamily = args(3)

    val sparkConf = new SparkConf().setAppName("HBaseStreamingBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val ssc = new StreamingContext(sc, Seconds(1))

      val lines = ssc.socketTextStream(host, port.toInt)

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      hbaseContext.streamBulkPut[String](lines,
        TableName.valueOf(tableName),
        (putRecord) => {
          if (putRecord.length() > 0) {
            val put = new Put(Bytes.toBytes(putRecord))
            put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar"))
            put
          } else {
            null
          }
        })
      ssc.start()
      ssc.awaitTerminationOrTimeout(60000)
    } finally {
      sc.stop()
    }
  }
}
Example 3
Source File: HBaseSQLContext.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.OverrideCatalog
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan}
import org.apache.spark.sql.hbase.execution.{AddCoprocessor, HBaseStrategies}

class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) {
  self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override protected[sql] lazy val catalog: HBaseCatalog =
    new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog

  experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource)

  @transient
  override protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
    val batches =
      Batch("Add exchange", Once, EnsureRequirements(self)) ::
        Batch("Add coprocessor", Once, AddCoprocessor(self)) :: Nil
  }
}
Example 4
Source File: Util.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util

import java.io._
import java.util.concurrent.atomic.AtomicInteger
import java.util.zip.{DeflaterOutputStream, InflaterInputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.HBaseConfiguration

object Util {
  val iteration = new AtomicInteger(0)

  def getTempFilePath(conf: Configuration, prefix: String): String = {
    val fileSystem = FileSystem.get(conf)
    val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}")
    if (fileSystem.exists(path)) {
      fileSystem.delete(path, true)
    }
    path.getName
  }

  def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = {
    val bos = new ByteArrayOutputStream
    val deflaterOutputStream = new DeflaterOutputStream(bos)
    val dos = new DataOutputStream(deflaterOutputStream)
    configuration.write(dos)
    dos.close()
    bos.toByteArray
  }

  def deserializeHBaseConfiguration(arr: Array[Byte]) = {
    val conf = HBaseConfiguration.create
    conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr))))
    conf
  }
}
Example 5
Source File: HBaseTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark._

object HBaseTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HBaseTest")
    val sc = new SparkContext(sparkConf)

    // please ensure HBASE_CONF_DIR is on classpath of spark driver
    // e.g: set it through spark.driver.extraClassPath property
    // in spark-defaults.conf or through --driver-class-path
    // command line option of spark-submit
    val conf = HBaseConfiguration.create()

    if (args.length < 1) {
      System.err.println("Usage: HBaseTest <table_name>")
      System.exit(1)
    }

    // Other options for configuring scan behavior are available. More information available at
    // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html
    conf.set(TableInputFormat.INPUT_TABLE, args(0))

    // Initialize hBase table if necessary
    val admin = new HBaseAdmin(conf)
    if (!admin.isTableAvailable(args(0))) {
      val tableDesc = new HTableDescriptor(TableName.valueOf(args(0)))
      admin.createTable(tableDesc)
    }

    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    hBaseRDD.count()

    sc.stop()
    admin.close()
  }
}
// scalastyle:on println
Example 6
Source File: HogHBaseReputation.scala From hogzilla with GNU General Public License v2.0 | 5 votes |
package org.hogzilla.hbase

import scala.math.random
import java.lang.Math

import org.apache.spark._
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionModel, LinearRegressionWithSGD}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
import org.apache.hadoop.hbase.filter.BinaryComparator
import org.apache.hadoop.hbase.filter.FilterList
import org.apache.hadoop.hbase.filter.CompareFilter
import java.util.ArrayList
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.filter.Filter
import scala.collection.mutable.HashSet
import org.apache.hadoop.hbase.client.Put

object HogHBaseReputation {

  // Ex: MX, whitelist
  def getReputationList(listName: String, listType: String): Set[String] = {
    val list = new HashSet[String]

    val filters: ArrayList[Filter] = new ArrayList()

    val colValFilter1 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list_type"),
      CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listType)))
    colValFilter1.setFilterIfMissing(false)

    val colValFilter2 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list"),
      CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listName)))
    colValFilter2.setFilterIfMissing(false)

    filters.add(colValFilter1)
    filters.add(colValFilter2)

    val filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL, filters)

    val scan = new Scan()
    scan.setFilter(filterList)

    val it = HogHBaseRDD.hogzilla_reputation.getScanner(scan).iterator()
    while (it.hasNext()) {
      list.add(Bytes.toString(it.next().getValue(Bytes.toBytes("rep"), Bytes.toBytes("ip"))))
    }

    list.toSet
  }

  def saveReputationList(listName: String, listType: String, ip: String) = {
    val put = new Put(Bytes.toBytes(ip + "-" + listName + "-" + listType))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), Bytes.toBytes(listType))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("list"), Bytes.toBytes(listName))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("ip"), Bytes.toBytes(ip))

    HogHBaseRDD.hogzilla_reputation.put(put)
  }
}
Example 7
Source File: Hdfs2HBase.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.hbase

import java.util

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.spark.{SparkConf, SparkContext}

object Hdfs2HBase {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("spark://spark01:7077")
      .setAppName(Hdfs2HBase.getClass.getName)
      .set("spark.jars", "target/wow-spark-1.0-SNAPSHOT.jar")
    val sparkContext = new SparkContext(conf)

    val userRDD = sparkContext.textFile("hdfs://spark01:9000/spark/users.dat", 2).map(_.split("::"))

    userRDD.foreachPartition(iter => {
      val configuration = HBaseConfiguration.create()
      // configuration.set("hbase.zookeeper.quorum","spark01:2181,spark02:2181,spark03:2181")
      configuration.set("hbase.zookeeper.quorum", "spark01")
      configuration.set("hbase.zookeeper.property.clientPort", "2181")
      // create the connection
      val connection = ConnectionFactory.createConnection(configuration)
      // get the table object
      val person = connection.getTable(TableName.valueOf("users"))
      iter.foreach(p => {
        val arrayList = new util.ArrayList[Put]()
        val put = new Put(p(0).getBytes)
        arrayList.add(put.addColumn("f1".getBytes, "gender".getBytes, p(1).getBytes))
        arrayList.add(put.addColumn("f1".getBytes, "age".getBytes, p(2).getBytes))
        arrayList.add(put.addColumn("f2".getBytes, "position".getBytes, p(3).getBytes))
        arrayList.add(put.addColumn("f2".getBytes, "code".getBytes, p(4).getBytes))
        person.put(arrayList)
      })
    })
    sparkContext.stop()
  }
}
Example 8
Source File: HBasePut.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.hbase.utilities

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.{DataFrame, SparkSession}

import com.paypal.gimel.hbase.conf.{HbaseClientConfiguration, HbaseConfigs}
import com.paypal.gimel.logger.Logger

object HBasePut {
  def apply(sparkSession: SparkSession): HBasePut = new HBasePut(sparkSession)
}

class HBasePut(sparkSession: SparkSession) {
  val logger = Logger()
  lazy val hbaseUtilities = HBaseUtilities(sparkSession)

  def putRows(hbaseTable: String, dataFrame: DataFrame, rowKeyColumn: String,
              columns: Array[String], cfColsMap: Map[String, String]) {
    try {
      // Configure And Connect
      val conf = HBaseConfiguration.create()
      val cnxn = ConnectionFactory.createConnection(conf)
      // Create Connection to HBase table
      val tbl = cnxn.getTable(TableName.valueOf(hbaseTable))
      val rows = dataFrame.rdd.map { row =>
        (row.getAs(rowKeyColumn).toString,
          columns.map(eachCol => (cfColsMap.getOrElse(eachCol, ""), eachCol, row.getAs(eachCol).asInstanceOf[String]))
        )
      }.collect()
      // Performing put operation on each row of dataframe
      rows.foreach { row =>
        val putRow: Put = new Put(Bytes.toBytes(row._1.asInstanceOf[String]))
        row._2.foreach(x =>
          if (x._2 != rowKeyColumn) putRow.addColumn(Bytes.toBytes(x._1), Bytes.toBytes(x._2), Bytes.toBytes(x._3)))
        tbl.put(putRow)
      }
      tbl.close()
    } catch {
      case ex: Throwable =>
        ex.printStackTrace()
        throw ex
    }
  }
}
Example 9
Source File: GraphX.scala From unicorn with Apache License 2.0 | 5 votes |
package unicorn.narwhal.graph

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext

import unicorn.bigtable.hbase.HBaseTable
import unicorn.json._
import unicorn.unibase.graph.{ReadOnlyGraph, GraphSerializer, GraphVertexColumnFamily, GraphOutEdgeColumnFamily}

// Note: the enclosing class declaration (which defines the `name` field used below) is elided in this excerpt.

  def graphx(sc: SparkContext): org.apache.spark.graphx.Graph[JsObject, (String, JsValue)] = {
    val conf = HBaseConfiguration.create()
    conf.set(TableInputFormat.INPUT_TABLE, name)
    conf.setInt(TableInputFormat.SCAN_CACHEDROWS, 500)
    conf.setBoolean(TableInputFormat.SCAN_CACHEBLOCKS, false)
    conf.set(TableInputFormat.SCAN_COLUMNS, s"$GraphVertexColumnFamily $GraphOutEdgeColumnFamily")

    val rdd = sc.newAPIHadoopRDD(
      conf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result]
    )

    val rows = rdd.mapPartitions { it =>
      val serializer = new GraphSerializer()
      it.map { tuple =>
        val row = HBaseTable.getRow(tuple._2)
        serializer.deserializeVertex(row)
      }
    }

    val vertices = rows.map { vertex =>
      (vertex.id, vertex.properties)
    }

    val edges = rows.flatMap { vertex =>
      vertex.edges.map { edge =>
        org.apache.spark.graphx.Edge(edge.from, edge.to, (edge.label, edge.properties))
      }
    }

    org.apache.spark.graphx.Graph(vertices, edges)
  }
}
Example 10
Source File: HBaseBulkPutTimestampExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutTimestampExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.out.println("HBaseBulkPutTimestampExample {tableName} {columnFamily} are missing an argument")
      return
    }
    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("6"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("7"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("8"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("9"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("10"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))))

      val conf = HBaseConfiguration.create()

      val timeStamp = System.currentTimeMillis()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2,
            timeStamp, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 11
Source File: HBaseDistributedScanExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseDistributedScanExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseDistributedScanExample {tableName} missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseDistributedScanExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val scan = new Scan()
      scan.setCaching(100)

      val getRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan)

      getRdd.foreach(v => println(Bytes.toString(v._1.get())))

      println("Length: " + getRdd.map(r => r._1.copyBytes()).collect().length)
    } finally {
      sc.stop()
    }
  }
}
Example 12
Source File: HBaseBulkPutExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily} are missing arguments")
      return
    }
    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 13
Source File: HBaseBulkDeleteExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeleteExample {tableName} missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkDelete[Array[Byte]](rdd,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
}
Example 14
Source File: HBaseBulkPutExampleFromFile.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutExampleFromFile {
  def main(args: Array[String]) {
    if (args.length < 3) {
      println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile} are missing an argument")
      return
    }
    val tableName = args(0)
    val columnFamily = args(1)
    val inputFile = args(2)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " +
      tableName + " " + columnFamily + " " + inputFile)
    val sc = new SparkContext(sparkConf)

    try {
      var rdd = sc.hadoopFile(
        inputFile,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]).map(v => {
        System.out.println("reading-" + v._2.toString)
        v._2.toString
      })

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[String](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          System.out.println("hbase-" + putRecord)
          val put = new Put(Bytes.toBytes("Value- " + putRecord))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"),
            Bytes.toBytes(putRecord.length()))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 15
Source File: HBaseSparkSession.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.Analyzer
import org.apache.spark.sql.catalyst.catalog.ExternalCatalog
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.SparkPlanner
import org.apache.spark.sql.hbase.execution.{HBaseSourceAnalysis, HBaseStrategies}
import org.apache.spark.sql.internal.{BaseSessionStateBuilder, SQLConf, SessionState, SharedState}

class HBaseSparkSession(sc: SparkContext) extends SparkSession(sc) {
  self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  @transient
  override lazy val sessionState: SessionState = new HBaseSessionStateBuilder(this).build()

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override lazy val sharedState: SharedState = new HBaseSharedState(sc, this.sqlContext)
}

class HBaseSessionStateBuilder(session: SparkSession, parentState: Option[SessionState] = None)
  extends BaseSessionStateBuilder(session) {

  override lazy val conf: SQLConf = new HBaseSQLConf

  override protected def newBuilder: NewBuilder = new HBaseSessionStateBuilder(_, _)

  override lazy val experimentalMethods: ExperimentalMethods = {
    val result = new ExperimentalMethods
    result.extraStrategies = Seq((new SparkPlanner(session.sparkContext, conf, new ExperimentalMethods)
      with HBaseStrategies).HBaseDataSource)
    result
  }

  override lazy val analyzer: Analyzer = {
    new Analyzer(catalog, conf) {
      override val extendedResolutionRules: Seq[Rule[LogicalPlan]] =
        new FindDataSourceTable(session) +:
          new ResolveSQLOnFile(session) +:
          customResolutionRules

      override val postHocResolutionRules: Seq[Rule[LogicalPlan]] =
        PreprocessTableCreation(session) +:
          PreprocessTableInsertion(conf) +:
          DataSourceAnalysis(conf) +:
          HBaseSourceAnalysis(session) +:
          customPostHocResolutionRules

      override val extendedCheckRules = customCheckRules
    }
  }
}

class HBaseSharedState(sc: SparkContext, sqlContext: SQLContext) extends SharedState(sc) {
  override lazy val externalCatalog: ExternalCatalog =
    new HBaseCatalog(sqlContext, sc.hadoopConfiguration)
}
Example 16
Source File: HBaseBulkGetExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName} missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = hbaseContext.bulkGet[Array[Byte], String](
        TableName.valueOf(tableName),
        2,
        rdd,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {
          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 17
Source File: HBaseBulkPutExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily} are missing arguments")
      return
    }
    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 18
Source File: HBaseBulkDeleteExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeleteExample {tableName} is missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      rdd.hbaseBulkDelete(hbaseContext,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
}
Example 19
Source File: HBaseForeachPartitionExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseForeachPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseForeachPartitionExample {tableName} {columnFamily} are missing arguments")
      return
    }
    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseForeachPartitionExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseForeachPartition(hbaseContext,
        (it, connection) => {
          val m = connection.getBufferedMutator(TableName.valueOf(tableName))

          it.foreach(r => {
            val put = new Put(r._1)
            r._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
            m.mutate(put)
          })
          m.flush()
          m.close()
        })
    } finally {
      sc.stop()
    }
  }
}
Example 20
Source File: HBaseBulkGetExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName} is missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseBulkGet[String](hbaseContext, TableName.valueOf(tableName), 2,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {
          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 21
Source File: HBaseMapPartitionExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseMapPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseMapPartitionExample {tableName} is missing an argument")
      return
    }
    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseMapPartitionExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseMapPartitions[String](hbaseContext, (it, connection) => {
        val table = connection.getTable(TableName.valueOf(tableName))
        it.map { r =>
          //batching would be faster. This is just an example
          val result = table.get(new Get(r))

          val cellIt = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (cellIt.hasNext) {
            val cell = cellIt.next()
            // CellUtil copies only the qualifier/value bytes of the cell rather than the whole backing array
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        }
      })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 22
Source File: ConfigurationBuilder.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase.config

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.kafka.common.config.ConfigException

object ConfigurationBuilder {
  def buildHBaseConfig(hBaseSettings: HBaseSettings): Configuration = {
    val configuration = HBaseConfiguration.create()

    def appendFile(file: String): Unit = {
      val hbaseFile = new File(file)
      if (!hbaseFile.exists) {
        throw new ConfigException(s"$file does not exist in provided HBase configuration directory $hbaseFile.")
      } else {
        configuration.addResource(new Path(hbaseFile.toString))
      }
    }

    hBaseSettings.hbaseConfigDir.foreach { dir =>
      appendFile(dir + s"/hbase-site.xml")
    }
    configuration
  }
}
Example 23
Source File: HbaseReaderHelper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase.writers

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import com.datamountaineer.streamreactor.connect.hbase.HbaseHelper
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}

import scala.collection.JavaConverters._

object HbaseReaderHelper {
  def createConnection: Connection = {
    ConnectionFactory.createConnection(HBaseConfiguration.create())
  }

  def getAllRecords(tableName: String, columnFamily: String)(implicit connection: Connection): List[HbaseRowData] = {
    HbaseHelper.withTable(TableName.valueOf(tableName)) { tbl =>
      val scan = new Scan()
      scan.addFamily(columnFamily.fromString())
      val scanner = tbl.getScanner(scan)
      scanner.asScala.map { rs =>
        val cells = rs.rawCells().map { cell =>
          Bytes.toString(CellUtil.cloneQualifier(cell)) -> CellUtil.cloneValue(cell)
        }.toMap
        HbaseRowData(rs.getRow, cells)
      }.toList
    }
  }
}

case class HbaseRowData(key: Array[Byte], cells: Map[String, Array[Byte]])
Example 24
Source File: HBaseUtils.scala From bigdata-examples with Apache License 2.0 | 5 votes |
package com.timeyang.common.util

import com.timeyang.common.config.BaseConf
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.hadoop.mapreduce.Job

// Note: the enclosing HBaseUtils object declaration (including its newConf() helper) is elided in this excerpt.

  def createHbaseOutputJob(tableName: String): Job = {
    val conf = HBaseUtils.newConf()
    conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    val job = Job.getInstance(conf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job
  }
}
Example 25
Source File: CreateSaltedTable.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.setup.hbase

import java.io.File

import org.apache.commons.lang.StringUtils
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.compress.Compression
import org.apache.hadoop.hbase.regionserver.{BloomType, ConstantSizeRegionSplitPolicy}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

object CreateSaltedTable {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("<tableName> <columnFamily> <regionCount> <numOfSalts> <hbaseConfigFolder>")
    }

    val tableName = args(0)
    val columnFamilyName = args(1)
    val regionCount = args(2).toInt
    val numOfSalts = args(3).toInt
    val hbaseConfigFolder = args(4)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin

    val tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName))

    val columnDescriptor = new HColumnDescriptor(columnFamilyName)
    columnDescriptor.setCompressionType(Compression.Algorithm.SNAPPY)
    columnDescriptor.setBlocksize(64 * 1024)
    columnDescriptor.setBloomFilterType(BloomType.ROW)

    tableDescriptor.addFamily(columnDescriptor)
    tableDescriptor.setMaxFileSize(Long.MaxValue)
    tableDescriptor.setRegionSplitPolicyClassName(classOf[ConstantSizeRegionSplitPolicy].getName)

    val splitKeys = new mutable.MutableList[Array[Byte]]
    for (i <- 0 to regionCount) {
      val regionSplitStr = StringUtils.leftPad((i * (numOfSalts / regionCount)).toString, 4, "0")
      splitKeys += Bytes.toBytes(regionSplitStr)
    }
    admin.createTable(tableDescriptor, splitKeys.toArray)
  }
}
Example 26
Source File: HBaseRestServer.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.server.hbase

import java.io.File

import com.sun.jersey.spi.container.servlet.ServletContainer
import org.apache.hadoop.hbase.HBaseConfiguration
import org.mortbay.jetty.Server
import org.mortbay.jetty.servlet.{Context, ServletHolder}

object HBaseRestServer {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("<port> <configDir> <numberOfSalts> <customerTableName>")
    }
    val port = args(0).toInt
    val hbaseConfigFolder = args(1)
    val numberOfSalts = args(2).toInt
    val appEventTableName = args(3)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    HBaseGlobalValues.init(conf, numberOfSalts, appEventTableName)

    val server = new Server(port)

    val sh = new ServletHolder(classOf[ServletContainer])
    sh.setInitParameter("com.sun.jersey.config.property.resourceConfigClass",
      "com.sun.jersey.api.core.PackagesResourceConfig")
    sh.setInitParameter("com.sun.jersey.config.property.packages",
      "com.cloudera.sa.taxi360.server.hbase")
    sh.setInitParameter("com.sun.jersey.api.json.POJOMappingFeature", "true")

    val context = new Context(server, "/", Context.SESSIONS)
    context.addServlet(sh, "/*")

    println("starting HBase Rest Server")
    server.start()
    println("started HBase Rest Server")
    server.join()
  }
}
Example 27
Source File: HBaseServiceLayer.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.server.hbase

import javax.ws.rs._
import javax.ws.rs.core.MediaType

import com.cloudera.sa.taxi360.model.NyTaxiYellowTrip
import com.cloudera.sa.taxi360.streaming.ingestion.hbase.TaxiTripHBaseHelper
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

@Path("rest")
class HBaseServiceLayer {

  @GET
  @Path("hello")
  @Produces(Array(MediaType.TEXT_PLAIN))
  def hello(): String = {
    "Hello World"
  }

  @GET
  @Path("vender/{venderId}/timeline")
  @Produces(Array(MediaType.APPLICATION_JSON))
  def getTripTimeLine(@PathParam("venderId") venderId: String,
                      @QueryParam("startTime") startTime: String = Long.MinValue.toString,
                      @QueryParam("endTime") endTime: String = Long.MaxValue.toString): Array[NyTaxiYellowTrip] = {

    val table = HBaseGlobalValues.connection.getTable(TableName.valueOf(HBaseGlobalValues.appEventTableName))

    val st = if (startTime == null) {
      Long.MinValue.toString
    } else {
      startTime
    }
    val et = if (endTime == null) {
      Long.MaxValue.toString
    } else {
      endTime
    }

    val scan = new Scan()
    val startRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, st.toLong, HBaseGlobalValues.numberOfSalts)
    println("startRowKey:" + Bytes.toString(startRowKey))
    scan.setStartRow(startRowKey)
    val endRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, et.toLong, HBaseGlobalValues.numberOfSalts)
    println("endRowKey:" + Bytes.toString(endRowKey))
    scan.setStopRow(endRowKey)

    val scannerIt = table.getScanner(scan).iterator()

    val tripList = new mutable.MutableList[NyTaxiYellowTrip]

    while (scannerIt.hasNext) {
      val result = scannerIt.next()
      tripList += TaxiTripHBaseHelper.convertToTaxiTrip(result)
      println("Found a trip:" + TaxiTripHBaseHelper.convertToTaxiTrip(result))
    }

    println("tripList.size:" + tripList.size)

    tripList.toArray
  }
}
Example 28
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.streaming.ingestion.hbase

import java.io.File

import com.cloudera.sa.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.solr.common.cloud.ZooKeeperException
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTaxiTripToHBase {
  def main(args: Array[String]): Unit = {
    println("Java Version:" + System.getProperty("java.version"))
    println("Java Home:" + System.getProperties().getProperty("java.home"))

    val v: ZooKeeperException = null

    if (args.length == 0) {
      println("Args: <KafkaBrokerList> " +
        "<kafkaTopicList> " +
        "<numberOfSeconds>" +
        "<runLocal>" +
        "<hbaseTable>" +
        "<numOfSalts>" +
        "<checkpointDir>" +
        "<hbaseConfigFolder>")
      return
    }

    val kafkaBrokerList = args(0)
    val kafkaTopicList = args(1)
    val numberOfSeconds = args(2).toInt
    val runLocal = args(3).equals("l")
    val tableName = args(4)
    val numOfSalts = args(5).toInt
    val checkpointFolder = args(6)
    val hbaseConfigFolder = args(7)

    println("kafkaBrokerList:" + kafkaBrokerList)
    println("kafkaTopicList:" + kafkaTopicList)
    println("numberOfSeconds:" + numberOfSeconds)
    println("runLocal:" + runLocal)
    println("tableName:" + tableName)
    println("numOfSalts:" + numOfSalts)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase")
      new SparkContext(sparkConf)
    }
    val ssc = new StreamingContext(sc, Seconds(numberOfSeconds))

    val topicsSet = kafkaTopicList.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)

    val messageStream = KafkaUtils.
      createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val conf = HBaseConfiguration.create()

    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val hbaseContext = new HBaseContext(sc, conf)

    val tripDStream = messageStream.map(r => {
      (r._1, r._2.split(","))
    }).filter(r => r._2.size > 3).map(r => {
      (r._1, NyTaxiYellowTripBuilder.build(r._2))
    })

    tripDStream.hbaseBulkPut(hbaseContext,
      TableName.valueOf(tableName),
      taxi => {
        TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts)
      })

    ssc.checkpoint(checkpointFolder)
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 29
Source File: HBaseSparkConf.scala From spark-hbase-connector with Apache License 2.0 | 5 votes |
package it.nerdammer.spark.hbase

import org.apache.hadoop.hbase.{HConstants, HBaseConfiguration}
import org.apache.spark.SparkConf

case class HBaseSparkConf(
    hbaseHost: Option[String] = None,
    hbaseXmlConfigFile: String = "hbase-site.xml") extends Serializable {

  def createHadoopBaseConfig() = {
    val conf = HBaseConfiguration.create

    val xmlFile = Option(getClass.getClassLoader.getResource(hbaseXmlConfigFile))
    xmlFile.foreach(f => conf.addResource(f))

    hbaseHost.foreach(h => conf.set(HConstants.ZOOKEEPER_QUORUM, h))
    if (Option(conf.get(HConstants.ZOOKEEPER_QUORUM)).isEmpty)
      conf.set(HConstants.ZOOKEEPER_QUORUM, HBaseSparkConf.DefaultHBaseHost)

    conf
  }
}

object HBaseSparkConf extends Serializable {
  val DefaultHBaseHost = "localhost"

  def fromSparkConf(conf: SparkConf) = {
    HBaseSparkConf(
      hbaseHost = Option(conf.get("spark.hbase.host", null))
    )
  }
}
Example 30
Source File: CreateSaltedTable.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.setup.hbase

import java.io.File

import org.apache.commons.lang.StringUtils
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.compress.Compression
import org.apache.hadoop.hbase.regionserver.{BloomType, ConstantSizeRegionSplitPolicy}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

object CreateSaltedTable {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("<tableName> <columnFamily> <regionCount> <numOfSalts> <hbaseConfigFolder>")
    }

    val tableName = args(0)
    val columnFamilyName = args(1)
    val regionCount = args(2).toInt
    val numOfSalts = args(3).toInt
    val hbaseConfigFolder = args(4)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin

    val tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName))

    val columnDescriptor = new HColumnDescriptor(columnFamilyName)
    columnDescriptor.setCompressionType(Compression.Algorithm.SNAPPY)
    columnDescriptor.setBlocksize(64 * 1024)
    columnDescriptor.setBloomFilterType(BloomType.ROW)

    tableDescriptor.addFamily(columnDescriptor)
    tableDescriptor.setMaxFileSize(Long.MaxValue)
    tableDescriptor.setRegionSplitPolicyClassName(classOf[ConstantSizeRegionSplitPolicy].getName)

    val splitKeys = new mutable.MutableList[Array[Byte]]
    for (i <- 0 to regionCount) {
      val regionSplitStr = StringUtils.leftPad((i * (numOfSalts / regionCount)).toString, 4, "0")
      splitKeys += Bytes.toBytes(regionSplitStr)
    }
    admin.createTable(tableDescriptor, splitKeys.toArray)
  }
}
Example 31
Source File: HBaseRestServer.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.server.hbase

import java.io.File

import com.sun.jersey.spi.container.servlet.ServletContainer
import org.apache.hadoop.hbase.HBaseConfiguration
import org.mortbay.jetty.Server
import org.mortbay.jetty.servlet.{Context, ServletHolder}

object HBaseRestServer {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("<port> <configDir> <numberOfSalts> <customerTableName>")
    }
    val port = args(0).toInt
    val hbaseConfigFolder = args(1)
    val numberOfSalts = args(2).toInt
    val appEventTableName = args(3)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    HBaseGlobalValues.init(conf, numberOfSalts, appEventTableName)

    val server = new Server(port)

    val sh = new ServletHolder(classOf[ServletContainer])
    sh.setInitParameter("com.sun.jersey.config.property.resourceConfigClass",
      "com.sun.jersey.api.core.PackagesResourceConfig")
    sh.setInitParameter("com.sun.jersey.config.property.packages",
      "com.hadooparchitecturebook.taxi360.server.hbase")
    sh.setInitParameter("com.sun.jersey.api.json.POJOMappingFeature", "true")

    val context = new Context(server, "/", Context.SESSIONS)
    context.addServlet(sh, "/*")

    println("starting HBase Rest Server")
    server.start()
    println("started HBase Rest Server")
    server.join()
  }
}
Example 32
Source File: HBaseServiceLayer.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.server.hbase

import javax.ws.rs._
import javax.ws.rs.core.MediaType

import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTrip
import com.hadooparchitecturebook.taxi360.streaming.ingestion.hbase.TaxiTripHBaseHelper
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

@Path("rest")
class HBaseServiceLayer {

  @GET
  @Path("hello")
  @Produces(Array(MediaType.TEXT_PLAIN))
  def hello(): String = {
    "Hello World"
  }

  @GET
  @Path("vender/{venderId}/timeline")
  @Produces(Array(MediaType.APPLICATION_JSON))
  def getTripTimeLine(@PathParam("venderId") venderId: String,
                      @QueryParam("startTime") startTime: String = Long.MinValue.toString,
                      @QueryParam("endTime") endTime: String = Long.MaxValue.toString): Array[NyTaxiYellowTrip] = {

    val table = HBaseGlobalValues.connection.getTable(TableName.valueOf(HBaseGlobalValues.appEventTableName))

    val st = if (startTime == null) {
      Long.MinValue.toString
    } else {
      startTime
    }
    val et = if (endTime == null) {
      Long.MaxValue.toString
    } else {
      endTime
    }

    val scan = new Scan()
    val startRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, st.toLong, HBaseGlobalValues.numberOfSalts)
    println("startRowKey:" + Bytes.toString(startRowKey))
    scan.setStartRow(startRowKey)
    val endRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, et.toLong, HBaseGlobalValues.numberOfSalts)
    println("endRowKey:" + Bytes.toString(endRowKey))
    scan.setStopRow(endRowKey)

    val scannerIt = table.getScanner(scan).iterator()

    val tripList = new mutable.MutableList[NyTaxiYellowTrip]

    while (scannerIt.hasNext) {
      val result = scannerIt.next()
      tripList += TaxiTripHBaseHelper.convertToTaxiTrip(result)
      println("Found a trip:" + TaxiTripHBaseHelper.convertToTaxiTrip(result))
    }

    println("tripList.size:" + tripList.size)

    tripList.toArray
  }
}
Example 33
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.streaming.ingestion.hbase

import java.io.File

import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.solr.common.cloud.ZooKeeperException
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTaxiTripToHBase {
  def main(args: Array[String]): Unit = {
    println("Java Version:" + System.getProperty("java.version"))
    println("Java Home:" + System.getProperties().getProperty("java.home"))

    val v: ZooKeeperException = null

    if (args.length == 0) {
      println("Args: <KafkaBrokerList> " +
        "<kafkaTopicList> " +
        "<numberOfSeconds>" +
        "<runLocal>" +
        "<hbaseTable>" +
        "<numOfSalts>" +
        "<checkpointDir>" +
        "<hbaseConfigFolder>")
      return
    }

    val kafkaBrokerList = args(0)
    val kafkaTopicList = args(1)
    val numberOfSeconds = args(2).toInt
    val runLocal = args(3).equals("l")
    val tableName = args(4)
    val numOfSalts = args(5).toInt
    val checkpointFolder = args(6)
    val hbaseConfigFolder = args(7)

    println("kafkaBrokerList:" + kafkaBrokerList)
    println("kafkaTopicList:" + kafkaTopicList)
    println("numberOfSeconds:" + numberOfSeconds)
    println("runLocal:" + runLocal)
    println("tableName:" + tableName)
    println("numOfSalts:" + numOfSalts)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase")
      new SparkContext(sparkConf)
    }
    val ssc = new StreamingContext(sc, Seconds(numberOfSeconds))

    val topicsSet = kafkaTopicList.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)

    val messageStream = KafkaUtils.
      createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val conf = HBaseConfiguration.create()

    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val hbaseContext = new HBaseContext(sc, conf)

    val tripDStream = messageStream.map(r => {
      (r._1, r._2.split(","))
    }).filter(r => r._2.size > 3).map(r => {
      (r._1, NyTaxiYellowTripBuilder.build(r._2))
    })

    tripDStream.hbaseBulkPut(hbaseContext,
      TableName.valueOf(tableName),
      taxi => {
        TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts)
      })

    ssc.checkpoint(checkpointFolder)
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 34
Source File: HBaseSQLContext.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.OverrideCatalog
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan}
import org.apache.spark.sql.hbase.execution.{AddCoprocessor, HBaseStrategies}

class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) {
  self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override protected[sql] lazy val catalog: HBaseCatalog =
    new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog

  experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource)

  @transient
  override protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
    val batches =
      Batch("Add exchange", Once, EnsureRequirements(self)) ::
        Batch("Add coprocessor", Once, AddCoprocessor(self)) :: Nil
  }
}
Example 35
Source File: Util.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util

import java.io._
import java.util.concurrent.atomic.AtomicInteger
import java.util.zip.{DeflaterOutputStream, InflaterInputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.HBaseConfiguration

object Util {
  val iteration = new AtomicInteger(0)

  def getTempFilePath(conf: Configuration, prefix: String): String = {
    val fileSystem = FileSystem.get(conf)
    val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}")
    if (fileSystem.exists(path)) {
      fileSystem.delete(path, true)
    }
    path.getName
  }

  def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = {
    val bos = new ByteArrayOutputStream
    val deflaterOutputStream = new DeflaterOutputStream(bos)
    val dos = new DataOutputStream(deflaterOutputStream)
    configuration.write(dos)
    dos.close()
    bos.toByteArray
  }

  def deserializeHBaseConfiguration(arr: Array[Byte]) = {
    val conf = HBaseConfiguration.create
    conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr))))
    conf
  }
}
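Not from the project itself: a minimal round-trip sketch of the two serialization helpers above, useful when a compressed configuration has to travel with a Spark job. The object name, the quorum value, and the property chosen are illustrative assumptions.

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.sql.hbase.util.Util

object ConfigRoundTrip {
  def main(args: Array[String]): Unit = {
    // Build a configuration and set one illustrative property.
    val original = HBaseConfiguration.create()
    original.set("hbase.zookeeper.quorum", "zk1,zk2,zk3") // assumed quorum, for illustration only

    // Compress the configuration to bytes, e.g. before shipping it to executors.
    val bytes = Util.serializeHBaseConfiguration(original)

    // Restore it and confirm the property survived the round trip.
    val restored = Util.deserializeHBaseConfiguration(bytes)
    println(restored.get("hbase.zookeeper.quorum"))
  }
}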
Example 36
Source File: HbRddConfig.scala From hbrdd with Apache License 2.0 | 5 votes |
package top.spoofer.hbrdd.config

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration

class HbRddConfig(config: Configuration) extends Serializable {
  def getHbaseConfig = HBaseConfiguration.create(config)
}

object HbRddConfig {
  type configOption = (String, String)

  private[HbRddConfig] case class HbaseOption(name: String, value: String)

  def apply(config: Configuration): HbRddConfig = new HbRddConfig(config)

  def apply(configs: configOption*): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()

    for {
      option <- configs
      hbOption = HbaseOption(option._1, option._2) // the case class is used only to make the intent clearer
    } hbConfig.set(hbOption.name, hbOption.value)

    this.apply(hbConfig)
  }

  def apply(configs: { def rootDir: String; def quorum: String }): HbRddConfig = {
    apply(
      "hbase.rootdir" -> configs.rootDir,
      "hbase.zookeeper.quorum" -> configs.quorum
    )
  }

  def apply(configs: Map[String, String]): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()
    configs.keys foreach { name => hbConfig.set(name, configs(name)) }
    this.apply(hbConfig)
  }

  def apply(configs: TraversableOnce[configOption]): HbRddConfig = {
    val hbConfig = HBaseConfiguration.create()
    configs foreach { option =>
      val hbOption = HbaseOption(option._1, option._2)
      hbConfig.set(hbOption.name, hbOption.value)
    }
    this.apply(hbConfig)
  }
}
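As a usage illustration only (the object name, rootdir, and quorum values below are assumptions, not taken from hbrdd), the varargs and Map overloads above can be exercised like this:

import top.spoofer.hbrdd.config.HbRddConfig

object HbRddConfigUsage {
  def main(args: Array[String]): Unit = {
    // Key/value pairs through the varargs overload; the addresses are placeholders.
    val fromPairs = HbRddConfig(
      "hbase.rootdir" -> "hdfs://namenode:8020/hbase",
      "hbase.zookeeper.quorum" -> "zk1,zk2,zk3"
    )

    // The same settings supplied as a Map.
    val fromMap = HbRddConfig(Map(
      "hbase.rootdir" -> "hdfs://namenode:8020/hbase",
      "hbase.zookeeper.quorum" -> "zk1,zk2,zk3"
    ))

    // Either value hands back a plain HBase Configuration when one is needed.
    println(fromPairs.getHbaseConfig.get("hbase.zookeeper.quorum"))
    println(fromMap.getHbaseConfig.get("hbase.rootdir"))
  }
}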
Example 37
Source File: SparkManager.scala From darwin with Apache License 2.0 | 5 votes |
package it.agilelab.darwin.app.spark

import com.typesafe.config.Config
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._

trait SparkManager {

  val sparkManagerLogger: Logger = LoggerFactory.getLogger("SparkManager")

  protected def defaultParallelism(implicit sparkSession: SparkSession, config: Config): Int = {
    sparkSession.conf.getOption(SparkConfigurationKeys.SPARK_EXECUTOR_INSTANCES) match {
      case Some(instances) =>
        sparkSession.conf.getOption(SparkConfigurationKeys.SPARK_CORES).getOrElse("1").toInt * instances.toInt
      case None =>
        sparkManagerLogger.info("Spark is configured with dynamic allocation, default parallelism will be gathered from app " +
          "conf: " + "next.process.parallelism")
        if (config.hasPath(SparkConfigurationKeys.PARALLELISM)) {
          config.getInt(SparkConfigurationKeys.PARALLELISM)
        } else {
          sparkManagerLogger.info("next.process.parallelism was not set fallback to sparkSession.defaultParallelism")
          sparkSession.sparkContext.defaultParallelism
        }
    }
  }
}
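A minimal sketch, assuming the darwin application module is on the classpath, of how a job object might mix in SparkManager and read the computed parallelism. The object name, the local master, and the ConfigFactory.load() call are assumptions made for illustration, not part of the original project.

import com.typesafe.config.{Config, ConfigFactory}
import org.apache.spark.sql.SparkSession
import it.agilelab.darwin.app.spark.SparkManager

object ParallelismDemo extends SparkManager {
  def main(args: Array[String]): Unit = {
    implicit val spark: SparkSession = SparkSession.builder()
      .master("local[*]") // local master assumed for the sketch
      .appName("ParallelismDemo")
      .getOrCreate()
    implicit val config: Config = ConfigFactory.load()

    // Falls back through executor settings, the app config key, then Spark's own default.
    println(s"default parallelism: $defaultParallelism")

    spark.stop()
  }
}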
Example 38
Source File: L6-16SparkHBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object SparkHBaseBulkPutApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: SparkHBaseBulkPutApp <appname> <tableName> <columnFamilyName> <columnName>")
      System.exit(1)
    }
    val Seq(appName, tableName, columnFamilyName, columnName) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val batchInterval = 10
    val windowSize = 20
    val slideInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    val hbaseConf = HBaseConfiguration.create()
    val hContext = new HBaseContext(ssc.sparkContext, hbaseConf)

    val windowed = HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat))
      })
      .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval))

    hContext.streamBulkPut[(String, Float)](windowed,
      TableName.valueOf(tableName),
      rec => {
        val put = new Put(rec._1.getBytes)
        put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval)))
        put
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 39
Source File: L6-14HBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.io.Text
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object HBaseSinkApp {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: HBaseSinkApp <appname> <hbaseMaster> <tableName> <columnFamilyName> <columnName>")
      System.exit(1)
    }
    val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val batchInterval = 10
    val windowSize = 20
    val slideInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat))
      })
      .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval))
      .foreachRDD(rdd => {
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
        hbaseConf.set("hbase.master", hbaseMaster)
        val jobConf = new Configuration(hbaseConf)
        jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName)
        rdd.map(rec => {
          val put = new Put(rec._1.getBytes)
          put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval)))
          (rec._1, put)
        }).saveAsNewAPIHadoopDataset(jobConf)
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 40
Source File: HBaseReaders.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert

import com.kakao.mango.util.Conversions._
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions._

trait HBaseReaders {
  val sc: SparkContext

  def hbaseTable(quorum: String, table: String): RDD[(String, ((String, String), (Long, String)))] = {
    hbaseTableBinary(quorum, table).map {
      case (rowkey, ((family, qualifier), (timestamp, value))) =>
        (rowkey.string, ((family.string, qualifier.string), (timestamp, value.string)))
    }
  }

  def hbaseColumnBinary(quorum: String, table: String, family: Array[Byte], qualifier: Array[Byte]): RDD[(Array[Byte], (Long, Array[Byte]))] = {
    hbaseTableBinary(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family.sameElements(f) && qualifier.sameElements(q) => (rowkey, cell)
    }
  }

  def hbaseColumn(quorum: String, table: String, family: String, qualifier: String): RDD[(String, (Long, String))] = {
    hbaseTable(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family == f && qualifier == q => (rowkey, cell)
    }
  }
}
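For orientation only: a hedged sketch of wiring the trait above to a SparkContext and reading one column, assuming the rest of the cuesheet convert package (including hbaseTableBinary, which the snippet references but does not show) is on the classpath. The quorum, table, family, and qualifier strings are placeholders, and the object name is invented for the example.

import org.apache.spark.{SparkConf, SparkContext}
import com.kakao.cuesheet.convert.HBaseReaders

object HBaseReadersUsage {
  def main(args: Array[String]): Unit = {
    val context = new SparkContext(new SparkConf().setAppName("HBaseReadersUsage"))

    // The trait only needs a SparkContext to be supplied.
    val readers = new HBaseReaders { val sc: SparkContext = context }

    // Every cell as (rowkey, ((family, qualifier), (timestamp, value))).
    val allCells = readers.hbaseTable("zk1,zk2,zk3", "my_table")

    // A single column, decoded to strings.
    val oneColumn = readers.hbaseColumn("zk1,zk2,zk3", "my_table", "d", "name")
    println(oneColumn.count())

    context.stop()
  }
}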
Example 41
Source File: HBasePutDriver.scala From akka-nbench with Apache License 2.0 | 5 votes |
package bench.drivers

import akka.actor._
import com.typesafe.config._

import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util._
import org.apache.hadoop.hbase.HBaseConfiguration

class HBasePutDriver(operation: String, stats: ActorRef, config: Config) extends Driver(operation, stats, config) {

  var conn: Connection = _
  var table: Table = _

  val nsName = "ns"
  val tableName = s"${nsName}:tbl"
  val colFamilies = List("fam")

  override val getOperation = () => {
    operation match {
      case "put" => putTest _
    }
  }

  // https://github.com/apache/hbase/blob/master/hbase-server/src/test/java/org/apache/hadoop/hbase/TestNamespace.java
  override def setup(): Boolean = {
    val conf = HBaseConfiguration.create
    this.conn = ConnectionFactory.createConnection(conf)
    val admin = this.conn.getAdmin
    try {
      admin.createNamespace(NamespaceDescriptor.create(nsName).build());
    } catch {
      case e: NamespaceExistException => {
        log.info(s"namespace: ${nsName} already exists")
      }
    }

    val tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName))
    try {
      colFamilies.foreach { fam =>
        tableDescriptor.addFamily(new HColumnDescriptor(fam))
      }
      admin.createTable(tableDescriptor)
      log.info(s"table: ${tableName} created")
    } catch {
      case e: TableExistsException => {
        log.info(s"table: ${tableName} already exists")
      }
    }
    this.table = this.conn.getTable(TableName.valueOf(tableName))
    true
  }

  override def teardown(): Boolean = {
    this.table.close()
    this.conn.close()
    true
  }

  // https://github.com/hbasebook101/hbasebook101/blob/master/ch04/java-api-examples/src/main/java/example/PutExample.java
  // https://github.com/xldrx/hbase_examples/pull/1/files
  def putTest(): (Boolean, Long, Long) = {
    val start = System.currentTimeMillis
    try {
      val put = new Put(Bytes.toBytes("row-" + start))
      put.addColumn(Bytes.toBytes("fam"), Bytes.toBytes("col1"), Bytes.toBytes("value1"))
      put.addColumn(Bytes.toBytes("fam"), Bytes.toBytes("col2"), 100L, Bytes.toBytes("value1"))
      table.put(put)
      val endAt = System.currentTimeMillis
      val elapsedMillis = endAt - start
      (true, endAt, elapsedMillis)
    } catch {
      case e: Throwable => {
        log.error("" + e)
        val endAt = System.currentTimeMillis
        val elapsedMillis = endAt - start
        (false, endAt, elapsedMillis)
      }
    }
  }
}
Example 42
Source File: HBaseTest.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples

import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat

import org.apache.spark._

object HBaseTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HBaseTest")
    val sc = new SparkContext(sparkConf)

    val conf = HBaseConfiguration.create()
    // Other options for configuring scan behavior are available. More information available at
    // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html
    conf.set(TableInputFormat.INPUT_TABLE, args(0))

    // Initialize hBase table if necessary
    val admin = new HBaseAdmin(conf)
    if (!admin.isTableAvailable(args(0))) {
      val tableDesc = new HTableDescriptor(TableName.valueOf(args(0)))
      admin.createTable(tableDesc)
    }

    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    hBaseRDD.count()

    sc.stop()
  }
}
Example 43
Source File: Util.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util

import java.io._
import java.util.concurrent.atomic.AtomicInteger
import java.util.zip.{DeflaterOutputStream, InflaterInputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.HBaseConfiguration

object Util {
  val iteration = new AtomicInteger(0)

  def dropTempFilePath(conf: Configuration, path: String): Boolean = {
    val fileSystem = FileSystem.get(conf)
    val filePath = new Path(path)
    if (fileSystem.exists(filePath)) {
      fileSystem.delete(filePath, true)
    } else {
      false
    }
  }

  def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = {
    val bos = new ByteArrayOutputStream
    val deflaterOutputStream = new DeflaterOutputStream(bos)
    val dos = new DataOutputStream(deflaterOutputStream)
    configuration.write(dos)
    dos.close()
    bos.toByteArray
  }

  def deserializeHBaseConfiguration(arr: Array[Byte]) = {
    val conf = HBaseConfiguration.create
    conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr))))
    conf
  }
}
Example 44
Source File: HBaseBulkPutTimestampExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf

object HBaseBulkPutTimestampExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.out.println("HBaseBulkPutTimestampExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("6"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("7"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("8"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("9"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("10"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))))

      val conf = HBaseConfiguration.create()

      val timeStamp = System.currentTimeMillis()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2,
            timeStamp, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 45
Source File: HBaseDistributedScanExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Scan
import org.apache.spark.SparkConf

object HBaseDistributedScanExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseDistributedScanExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseDistributedScanExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val scan = new Scan()
      scan.setCaching(100)

      val getRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan)

      getRdd.foreach(v => println(Bytes.toString(v._1.get())))

      println("Length: " + getRdd.map(r => r._1.copyBytes()).collect().length);
      //.collect().foreach(v => println(Bytes.toString(v._1.get())))
    } finally {
      sc.stop()
    }
  }
}
Example 46
Source File: HBaseBulkPutExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf

object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        });
    } finally {
      sc.stop()
    }
  }
}
Example 47
Source File: HBaseBulkDeleteExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Delete
import org.apache.spark.SparkConf

object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeletesExample {tableName} ")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkDelete[Array[Byte]](rdd,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
}
Example 48
Source File: HBaseBulkPutExampleFromFile.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.spark.SparkConf

object HBaseBulkPutExampleFromFile {
  def main(args: Array[String]) {
    if (args.length < 3) {
      println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)
    val inputFile = args(2)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " +
      tableName + " " + columnFamily + " " + inputFile)
    val sc = new SparkContext(sparkConf)

    try {
      var rdd = sc.hadoopFile(
        inputFile,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]).map(v => {
        System.out.println("reading-" + v._2.toString)
        v._2.toString
      })

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[String](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          System.out.println("hbase-" + putRecord)
          val put = new Put(Bytes.toBytes("Value- " + putRecord))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"),
            Bytes.toBytes(putRecord.length()))
          put
        });
    } finally {
      sc.stop()
    }
  }
}
Example 49
Source File: HBaseBulkGetExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.spark.SparkConf

object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = hbaseContext.bulkGet[Array[Byte], String](
        TableName.valueOf(tableName),
        2,
        rdd,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {
          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 50
Source File: HBaseBulkPutExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.{SparkConf, SparkContext}

object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseBulkPut(hbaseContext,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 51
Source File: HBaseBulkDeleteExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkContext, SparkConf}

object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeletesExample {tableName} ")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseBulkDelete(hbaseContext,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
}
Example 52
Source File: HBaseForeachPartitionExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkContext, SparkConf}

object HBaseForeachPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseForeachPartitionExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseForeachPartitionExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseForeachPartition(hbaseContext,
        (it, connection) => {
          val m = connection.getBufferedMutator(TableName.valueOf(tableName))

          it.foreach(r => {
            val put = new Put(r._1)
            r._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3))
            m.mutate(put)
          })
          m.flush()
          m.close()
        })
    } finally {
      sc.stop()
    }
  }
}
Example 53
Source File: HBaseBulkGetExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.{Result, Get}
import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.spark.{SparkContext, SparkConf}

object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseBulkGet[String](hbaseContext,
        TableName.valueOf(tableName),
        2,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {
          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 54
Source File: HBaseMapPartitionExample.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkContext, SparkConf}

object HBaseMapPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseMapPartitionExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseMapPartitionExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseMapPartitions[String](hbaseContext, (it, connection) => {
        val table = connection.getTable(TableName.valueOf(tableName))
        it.map{r =>
          //batching would be faster. This is just an example
          val result = table.get(new Get(r))

          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        }
      })

      getRdd.collect().foreach(v => println(v))
    } finally {
      sc.stop()
    }
  }
}
Example 55
Source File: HBaseStreamingBulkPutExample.scala From SparkOnHBase with Apache License 2.0 | 4 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.SparkConf

object HBaseStreamingBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 4) {
      println("HBaseStreamingBulkPutExample " +
        "{host} {port} {tableName} {columnFamily}")
      return
    }

    val host = args(0)
    val port = args(1)
    val tableName = args(2)
    val columnFamily = args(3)

    val sparkConf = new SparkConf().setAppName("HBaseStreamingBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      val ssc = new StreamingContext(sc, Seconds(1))

      val lines = ssc.socketTextStream(host, port.toInt)

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      hbaseContext.streamBulkPut[String](lines,
        TableName.valueOf(tableName),
        (putRecord) => {
          if (putRecord.length() > 0) {
            val put = new Put(Bytes.toBytes(putRecord))
            put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar"))
            put
          } else {
            null
          }
        })
      ssc.start()
      ssc.awaitTerminationOrTimeout(60000)
    } finally {
      sc.stop()
    }
  }
}