话不多说,可以看上篇博文,关于offset存储到zookeeper
https://www.cnblogs.com/niutao/p/10547718.html
本篇博文主要告诉你如何将offset写到Hbase做存储:
最后存储到Hbase的展现形式:
testDirect:co:1552667595000 column=info:0, timestamp=1552667594784, value=66
testDirect:co:1552667595000 column=info:1, timestamp=1552667594784, value=269
testDirect:co:1552667595000 column=info:2, timestamp=1552667594784, value=67
testDirect:co:1552667600000 column=info:0, timestamp=1552667599864, value=66
testDirect:co:1552667600000 column=info:1, timestamp=1552667599864, value=269
testDirect:co:1552667600000 column=info:2, timestamp=1552667599864, value=67
testDirect:co:1552667605000 column=info:0, timestamp=1552667604778, value=66
testDirect:co:1552667605000 column=info:1, timestamp=1552667604778, value=269
testDirect:co:1552667605000 column=info:2, timestamp=1552667604778, value=67
testDirect:co:1552667610000 column=info:0, timestamp=1552667609777, value=66
testDirect:co:1552667610000 column=info:1, timestamp=1552667609777, value=269
版本:
scala:2.11.8
spark:2.11
hbase:1.2.0-cdh5.14.0
遇到的问题: `java.lang.IllegalStateException: Consumer is not subscribed to any topics or assigned any partitions`
分析原因: 消费者必须先订阅主题(subscribe)或者被分配分区(assign),才能调用poll从指定的主题或分区获取数据,否则就会抛出上面的异常。每一次poll,消费者都会尝试使用最后一次消费的offset作为接下来获取数据的起始offset(start offset);这个offset既可以通过seek(TopicPartition, long)手动设置,也可以自动设置。
通过源码可以找到:
// Excerpt of the consumer's poll(timeout): validates the timeout, requires a subscription or
// assignment, then repeatedly calls pollOnce until records arrive or the timeout elapses.
// Returns an empty ConsumerRecords when nothing arrived in time.
public ConsumerRecords<K, V> poll(long timeout) {
acquire();
try {
if (timeout < 0)
throw new IllegalArgumentException("Timeout must not be negative");
// Throw if the consumer has neither a subscription nor a user assignment
if (this.subscriptions.hasNoSubscriptionOrUserAssignment())
throw new IllegalStateException("Consumer is not subscribed to any topics or assigned any partitions"); // (applies to the code below) keep polling for new data until the timeout expires
long start = time.milliseconds();
// Time remaining before the timeout expires
long remaining = timeout;
do {
// Fetch records; per the original note, pollOnce also performs the offset auto-commit and offset reset when those are configured (not visible in this excerpt)
Map<TopicPartition, List<ConsumerRecord<K, V>>> records = pollOnce(remaining);
if (!records.isEmpty()) {
// Before returning the result, send the next round of fetch requests so the next poll does not block waiting
fetcher.sendFetches();
client.pollNoWakeup();
// Return the records directly when there are no interceptors; otherwise pass them through the interceptors first
if (this.interceptors == null)
return new ConsumerRecords<>(records);
else
return this.interceptors.onConsume(new ConsumerRecords<>(records));
} long elapsed = time.milliseconds() - start;
remaining = timeout - elapsed;
} while (remaining > 0); return ConsumerRecords.empty();
} finally {
// Always release what acquire() took at the top of the method
release();
}
}
解决:
因此,需要订阅当前的topic才能消费。我之前使用的api是:(适用于非全新的、已经被消费者消费过的topic)
`val inputDStream1 = KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Assign[String, String](
fromOffsets.keys,kafkaParams,fromOffsets)
)`
修改:(适用于全新的、还没有被消费者消费过的topic)
`val inputDStream = KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)`
完整代码:
package offsetInHbase
import kafka.utils.ZkUtils
import org.apache.hadoop.hbase.filter.PrefixFilter
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.kafka010.ConsumerStrategies._
import org.apache.spark.streaming.kafka010.{OffsetRange, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.kafka010.LocationStrategies._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkContext, SparkConf}
/**
 * Created by angel
 *
 * Spark Streaming driver that consumes a Kafka topic with the direct (kafka010) API and
 * stores the consumed offsets in HBase after every batch instead of committing them to Kafka.
 *
 * Row layout in HBase: row key `<topic>:<group>:<batchTimeMillis>`, column family `info`,
 * one column per partition (qualifier = partition number, value = `untilOffset` as a string).
 * On start-up the most recent row is read back and used as the starting offsets.
 */
object KafkaOffsetsBlogStreamingDriver {

  /** Column family under which the per-partition offsets are stored. */
  private val OffsetColumnFamily: Array[Byte] = Bytes.toBytes("info")

  /**
   * Entry point.
   *
   * @param args six positional arguments: batch duration in seconds, Kafka bootstrap servers,
   *             comma-separated topics, consumer group id, HBase table name, ZooKeeper quorum
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 6) {
      System.err.println("Usage: KafkaDirectStreamTest " +
        "<batch-duration-in-seconds> " +
        "<kafka-bootstrap-servers> " +
        "<kafka-topics> " +
        "<kafka-consumer-group-id> " +
        "<hbase-table-name> " +
        "<kafka-zookeeper-quorum>")
      System.exit(1)
    }

    // Example arguments:
    //   5 cdh1:9092,cdh2:2181,cdh3:2181 testDirect co testDirect cdh1:2181,cdh2:2181,cdh3:2181
    val batchDuration = args(0)
    val bootstrapServers = args(1)
    val topicsSet = args(2).split(",").toSet
    val consumerGroupID = args(3)
    val hbaseTableName = args(4)
    val zkQuorum = args(5)
    val zkKafkaRootDir = "kafka"
    val zkSessionTimeOut = 10000
    val zkConnectionTimeOut = 10000

    // NOTE(review): the master is hard-coded to local[4]; the original comment suggests this
    // is meant for testing on a workstation only - confirm before running on a cluster.
    val sparkConf = new SparkConf()
      .setAppName("Kafka-Offset-Management-Blog")
      .setMaster("local[4]")
    val sc = new SparkContext(sparkConf)
    val ssc = new StreamingContext(sc, Seconds(batchDuration.toLong))

    // Only the first topic is tracked in HBase and consumed via Assign.
    val topics = topicsSet.toArray
    val topic = topics(0)

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> bootstrapServers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> consumerGroupID,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val fromOffsets = getLastCommittedOffsets(
      topic,
      consumerGroupID,
      hbaseTableName,
      zkQuorum,
      zkKafkaRootDir,
      zkSessionTimeOut,
      zkConnectionTimeOut)

    // Assign with no partitions makes the consumer fail with
    // "Consumer is not subscribed to any topics or assigned any partitions".
    // When neither ZooKeeper nor HBase knows any partition yet, fall back to Subscribe.
    val consumerStrategy =
      if (fromOffsets.isEmpty) Subscribe[String, String](topicsSet, kafkaParams)
      else Assign[String, String](fromOffsets.keys, kafkaParams, fromOffsets)

    val inputDStream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      consumerStrategy
    )

    // For every batch: log its offset ranges, process the messages, then persist the offsets.
    inputDStream.foreachRDD((rdd, batchTime) => {
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      offsetRanges.foreach { offset =>
        println((offset.topic, offset.partition, offset.fromOffset, offset.untilOffset))
      }
      // count() is the action that actually forces the map over the batch to run.
      val processed = rdd.map(message => processMessage(message)).count()
      println("Number of messages processed " + processed)
      // Offsets are saved only after the batch has been processed.
      saveOffsets(topic, consumerGroupID, offsetRanges, hbaseTableName, batchTime)
    })

    ssc.start()
    ssc.awaitTermination()
  }

  /** Dummy processing step that simply returns the message as is. */
  private def processMessage(message: ConsumerRecord[String, String]): ConsumerRecord[String, String] =
    message

  /** Builds the HBase configuration used by both the save and the restore path. */
  private def hbaseConfiguration() = {
    val conf = HBaseConfiguration.create()
    // NOTE(review): addResource(String) may resolve this name against the classpath rather
    // than the file system - verify that this file is actually picked up.
    conf.addResource("src/main/resources/hbase-site.xml")
    conf
  }

  /** Row key layout shared by writer and reader: `<topic>:<group>:<suffix>`. */
  private def rowKey(topicName: String, groupId: String, suffix: String): String =
    topicName + ":" + groupId + ":" + suffix

  /**
   * Saves the `untilOffset` of every partition of one batch into a single HBase row.
   *
   * @param topicName      topic name, first component of the row key
   * @param groupId        consumer group id, second component of the row key
   * @param offsetRanges   offset ranges of the batch; one column is written per range
   * @param hbaseTableName table that receives the row
   * @param batchTime      batch time; its milliseconds form the last row key component
   */
  private def saveOffsets(
      topicName: String,
      groupId: String,
      offsetRanges: Array[OffsetRange],
      hbaseTableName: String,
      batchTime: org.apache.spark.streaming.Time
  ): Unit = {
    val conn = ConnectionFactory.createConnection(hbaseConfiguration())
    try {
      val table = conn.getTable(TableName.valueOf(hbaseTableName))
      try {
        val key = rowKey(topicName, groupId, String.valueOf(batchTime.milliseconds))
        val put = new Put(Bytes.toBytes(key))
        offsetRanges.foreach { range =>
          put.addColumn(
            OffsetColumnFamily,
            Bytes.toBytes(range.partition.toString),
            Bytes.toBytes(range.untilOffset.toString))
        }
        table.put(put)
      } finally {
        table.close()
      }
    } finally {
      // Close the connection even when the write fails, so it is not leaked.
      conn.close()
    }
  }

  /**
   * Reads the number of partitions of a topic from ZooKeeper.
   *
   * @return the partition count, or 0 when ZooKeeper reports nothing for the topic
   */
  private def partitionCountFromZk(
      topicName: String,
      zkUrl: String,
      sessionTimeout: Int,
      connectionTimeOut: Int
  ): Int = {
    val (zkClient, zkConnection) =
      ZkUtils.createZkClientAndConnection(zkUrl, sessionTimeout, connectionTimeOut)
    val zkUtils = new ZkUtils(zkClient, zkConnection, false)
    try {
      zkUtils.getPartitionsForTopics(Seq(topicName)).get(topicName).map(_.size).getOrElse(0)
    } finally {
      zkUtils.close()
    }
  }

  /**
   * Returns the last committed offsets for all partitions of a topic, read from HBase.
   *
   *  - CASE 1: the job starts for the first time (no row in HBase). The number of partitions
   *    is taken from ZooKeeper and every partition starts at offset 0.
   *  - CASE 2: the job is restarted and the number of partitions is unchanged. The offsets
   *    stored in the most recent HBase row are returned as is.
   *  - CASE 3: the job is restarted and partitions were added. Known partitions use the
   *    offsets stored in HBase; newly added partitions start at offset 0.
   *
   * A partition whose column is missing from the stored row also starts at offset 0.
   *
   * @param topicName         topic whose offsets are looked up
   * @param groupId           consumer group id used in the row key
   * @param hbaseTableName    table holding the offsets
   * @param zkQuorum          ZooKeeper quorum
   * @param zkRootDir         path appended to the quorum as `<quorum>/<root>`
   * @param sessionTimeout    ZooKeeper session timeout passed to ZkUtils
   * @param connectionTimeOut ZooKeeper connection timeout passed to ZkUtils
   * @return starting offset per topic partition; empty when no partition is known at all
   */
  private def getLastCommittedOffsets(
      topicName: String,
      groupId: String,
      hbaseTableName: String,
      zkQuorum: String,
      zkRootDir: String,
      sessionTimeout: Int,
      connectionTimeOut: Int
  ): Map[TopicPartition, Long] = {
    val zkPartitionCount =
      partitionCountFromZk(topicName, zkQuorum + "/" + zkRootDir, sessionTimeout, connectionTimeOut)

    // Connect to HBase to retrieve the last committed offsets.
    val conn = ConnectionFactory.createConnection(hbaseConfiguration())
    try {
      val table = conn.getTable(TableName.valueOf(hbaseTableName))
      try {
        // Reversed scan starting at "now": the first row returned is the most recent batch.
        val scan = new Scan()
          .setStartRow(Bytes.toBytes(rowKey(topicName, groupId, String.valueOf(System.currentTimeMillis()))))
          .setStopRow(Bytes.toBytes(rowKey(topicName, groupId, "0")))
          .setReversed(true)
        val scanner = table.getScanner(scan)
        try {
          val latestRow = Option(scanner.next())
          // Each cell of the row is one partition column.
          val storedPartitionCount =
            latestRow.flatMap(row => Option(row.listCells())).map(_.size()).getOrElse(0)
          val partitionCount = math.max(zkPartitionCount, storedPartitionCount)

          (0 until partitionCount).map { partition =>
            val storedOffset = for {
              row <- latestRow
              raw <- Option(row.getValue(OffsetColumnFamily, Bytes.toBytes(partition.toString)))
            } yield Bytes.toString(raw).toLong
            new TopicPartition(topicName, partition) -> storedOffset.getOrElse(0L)
          }.toMap
        } finally {
          scanner.close()
        }
      } finally {
        table.close()
      }
    } finally {
      conn.close()
    }
  }
}