Copyright notice: https://blog.csdn.net/xianpanjia4616/article/details/82216845
I put together a simple demo that writes data from Spark Streaming into Elasticsearch.
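The demo needs the es-hadoop Spark connector and the Kafka 0.10 direct-stream integration on the classpath. A typical sbt dependency block might look like this (the versions are my assumptions; match them to your Spark and Elasticsearch versions):

// build.sbt -- versions are assumptions, align them with your cluster
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "2.2.0" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.2.0",
  "org.elasticsearch" %% "elasticsearch-spark-20" % "6.3.2"
)

With those in place, the full code: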
package spark
import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
import kafka.streamingRedisHive._
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.elasticsearch.spark.rdd.EsSpark
import redis.RedisPool
object wordcount {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)
val conf = new SparkConf().setAppName("Spark Streaming Jason")
conf.set("spark.streaming.kafka.maxRatePerPartition", "2000")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.registerKryoClasses(Array(classOf[ConsumerRecord[String,String]]))
conf.set("spark.streaming.concurrentJobs", "10")
conf.set("spark.streaming.kafka.maxRetries", "50")
conf.set("spark.streaming.stopGracefullyOnShutdown","true")
conf.set("spark.streaming.backpressure.enabled","true")
conf.set("spark.streaming.backpressure.initialRate","5000")
conf.set("spark.streaming.kafka.maxRatePerPartition","3000")
conf.set("es.nodes","xxx")
conf.set("es.index.auto.create", "true")
conf.set("es.port", "9200")
val scc = new StreamingContext(conf, Seconds(1))
scc.checkpoint("/home/hdfs/jason/checkpoint")
val topic = PropertiesScalaUtils.loadProperties("topic_combine")
val topicSet = Set(topic)
val kafkaParams = Map[String, Object](
"auto.offset.reset" -> "latest", //latest;earliest
"value.deserializer" -> classOf[StringDeserializer] //key,value的反序列化;
, "key.deserializer" -> classOf[StringDeserializer]
, "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker")
, "group.id" -> PropertiesScalaUtils.loadProperties("groupId")
, "enable.auto.commit" -> (false: java.lang.Boolean)
)
val maxTotal = 200
val maxIdle = 100
val minIdle = 10
val testOnBorrow = false
val testOnReturn = false
val maxWaitMillis = 5000
RedisPool.makePool(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)
val jedis = RedisPool.getPool.getResource
jedis.select(dbIndex) // dbIndex (and kafkaStreams below) come from the kafka.streamingRedisHive._ import
val keys = jedis.keys(topic + "*") // read the saved offset keys from Redis
if (keys.size() == 0) { // no offsets saved in Redis yet, so treat this as the first start
println("First start, reading from the beginning of the topic-------------------------------")
kafkaStreams = KafkaUtils.createDirectStream[String, String](
scc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
} else { // offsets exist in Redis, so this is not the first start; resume from the saved positions
println("Not the first start, resuming from the last saved offsets---------------------------")
kafkaStreams = KafkaUtils.createDirectStream[String, String](
scc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams, RedisKeysListUtils.getKeysList(topic,jedis)))
}
jedis.close()
kafkaStreams.foreachRDD(rdd=>{
if(!rdd.isEmpty()){
val json = rdd.map(_.value())
EsSpark.saveJsonToEs(json, "spark/jason") // write each record's JSON value to the spark/jason index/type
}
// note: the offsets read from Redis above are never saved back in this demo; see the sketch after the listing
})
scc.start()
scc.awaitTermination()
}
}
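One thing to note: enable.auto.commit is false and the stream starts from offsets stored in Redis, but the code above never writes the processed offsets back, so after a restart it would resume from whatever was saved there before. A minimal sketch of persisting each batch's offsets, assuming the same RedisPool and a key format of topic + partition (the key format and the write-back logic are my assumptions, not the author's RedisKeysListUtils):

import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}

kafkaStreams.foreachRDD(rdd => {
  if (!rdd.isEmpty()) {
    // grab the offset ranges before any transformation
    val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    EsSpark.saveJsonToEs(rdd.map(_.value()), "spark/jason")
    // only after the Elasticsearch write succeeds, save the end offset of every partition
    val jedis = RedisPool.getPool.getResource
    try {
      jedis.select(dbIndex)
      offsetRanges.foreach(o => jedis.set(o.topic + o.partition, o.untilOffset.toString))
    } finally {
      jedis.close()
    }
  }
})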
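Since the offsets and the Elasticsearch write are not committed atomically, a batch can be replayed after a failure. A common way to make replays harmless is to map a unique field of the document to the Elasticsearch _id via es.mapping.id, so a replayed record overwrites itself instead of creating a duplicate. A sketch, assuming each JSON record carries an id field (the field name is an assumption):

// use the record's own id field as the document _id to make writes idempotent
EsSpark.saveJsonToEs(json, "spark/jason", Map("es.mapping.id" -> "id"))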
If anything here is wrong, corrections are welcome. If you have any questions, feel free to join QQ group 340297350. Thanks!