Copyright notice: https://blog.csdn.net/xianpanjia4616/article/details/82216845
I put together a simple demo that writes data from Spark Streaming into Elasticsearch.
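The demo needs the es-hadoop Spark connector and the Kafka 0.10 direct-stream integration on the classpath. A typical sbt dependency block might look like this (the versions are my assumptions; match them to your Spark and Elasticsearch versions):

// build.sbt -- versions are assumptions, align them with your cluster
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "2.2.0" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.2.0",
  "org.elasticsearch" %% "elasticsearch-spark-20" % "6.3.2"
)

With those in place, the full code: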
package spark
import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
import kafka.streamingRedisHive._
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.elasticsearch.spark.rdd.EsSpark
import redis.RedisPool
object wordcount {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)
val conf = new SparkConf().setAppName("Spark Streaming Jason")
conf.set("spark.streaming.kafka.maxRatePerPartition", "2000")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.registerKryoClasses(Array(classOf[ConsumerRecord[String,String]]))
conf.set("spark.streaming.concurrentJobs", "10")
conf.set("spark.streaming.kafka.maxRetries", "50")
conf.set("spark.streaming.stopGracefullyOnShutdown","true")
conf.set("spark.streaming.backpressure.enabled","true")
conf.set("spark.streaming.backpressure.initialRate","5000")
conf.set("spark.streaming.kafka.maxRatePerPartition","3000")
conf.set("es.nodes","xxx")
conf.set("es.index.auto.create", "true")
conf.set("es.port", "9200")
val scc = new StreamingContext(conf, Seconds(1))
scc.checkpoint("/home/hdfs/jason/checkpoint")
val topic = PropertiesScalaUtils.loadProperties("topic_combine")
val topicSet = Set(topic)
val kafkaParams = Map[String, Object](
"auto.offset.reset" -> "latest", //latest;earliest
"value.deserializer" -> classOf[StringDeserializer] //key,value的反序列化;
, "key.deserializer" -> classOf[StringDeserializer]
, "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker")
, "group.id" -> PropertiesScalaUtils.loadProperties("groupId")
, "enable.auto.commit" -> (false: java.lang.Boolean)
)
val maxTotal = 200
val maxIdle = 100
val minIdle = 10
val testOnBorrow = false
val testOnReturn = false
val maxWaitMillis = 5000
RedisPool.makePool(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)
val jedis = RedisPool.getPool.getResource
jedis.select(dbIndex) // dbIndex (and kafkaStreams below) come from the kafka.streamingRedisHive._ import
val keys = jedis.keys(topic + "*") // read the saved offset keys from Redis
if (keys.size() == 0) { // no offsets saved in Redis yet, so treat this as the first start
println("First start, reading from the beginning of the topic-------------------------------")
kafkaStreams = KafkaUtils.createDirectStream[String, String](
scc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
} else { // offsets exist in Redis, so this is not the first start; resume from the saved positions
println("Not the first start, resuming from the last saved offsets---------------------------")
kafkaStreams = KafkaUtils.createDirectStream[String, String](
scc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams, RedisKeysListUtils.getKeysList(topic,jedis)))
}
jedis.close()
kafkaStreams.foreachRDD(rdd=>{
if(!rdd.isEmpty()){
val json = rdd.map(_.value())
EsSpark.saveJsonToEs(json, "spark/jason") // write each record's JSON value to the spark/jason index/type
}
// note: the offsets read from Redis above are never saved back in this demo; see the sketch after the listing
})
scc.start()
scc.awaitTermination()
}
}
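One thing to note: enable.auto.commit is false and the stream starts from offsets stored in Redis, but the code above never writes the processed offsets back, so after a restart it would resume from whatever was saved there before. A minimal sketch of persisting each batch's offsets, assuming the same RedisPool and a key format of topic + partition (the key format and the write-back logic are my assumptions, not the author's RedisKeysListUtils):

import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}

kafkaStreams.foreachRDD(rdd => {
  if (!rdd.isEmpty()) {
    // grab the offset ranges before any transformation
    val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    EsSpark.saveJsonToEs(rdd.map(_.value()), "spark/jason")
    // only after the Elasticsearch write succeeds, save the end offset of every partition
    val jedis = RedisPool.getPool.getResource
    try {
      jedis.select(dbIndex)
      offsetRanges.foreach(o => jedis.set(o.topic + o.partition, o.untilOffset.toString))
    } finally {
      jedis.close()
    }
  }
})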
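Since the offsets and the Elasticsearch write are not committed atomically, a batch can be replayed after a failure. A common way to make replays harmless is to map a unique field of the document to the Elasticsearch _id via es.mapping.id, so a replayed record overwrites itself instead of creating a duplicate. A sketch, assuming each JSON record carries an id field (the field name is an assumption):

// use the record's own id field as the document _id to make writes idempotent
EsSpark.saveJsonToEs(json, "spark/jason", Map("es.mapping.id" -> "id"))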
If anything here is wrong, corrections are welcome. If you have any questions, feel free to join QQ group 340297350. Thanks!