- First, we wrap the KafkaProducer in a class that creates it through a lazy val, as follows:
```scala
package com.eitcloud.util

import java.util.concurrent.Future

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}

import scala.collection.JavaConversions._

class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {

  // This is the key idea that lets us work around NotSerializableException:
  // only the factory function is serialized and shipped to executors; the
  // producer itself is created lazily, on first use, inside each executor JVM.
  lazy val producer = createProducer()

  def send(topic: String, key: K, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, key, value))

  def send(topic: String, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, value))
}

object KafkaSink {

  def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
    val createProducerFunc = () => {
      val producer = new KafkaProducer[K, V](config)
      sys.addShutdownHook {
        // Ensure that, on executor JVM shutdown, the Kafka producer sends
        // any buffered messages to Kafka before shutting down.
        producer.close()
      }
      producer
    }
    new KafkaSink(createProducerFunc)
  }

  def apply[K, V](config: java.util.Properties): KafkaSink[K, V] =
    apply(config.toMap)
}
```
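The wrapper can be sanity-checked on its own before wiring it into Spark. A minimal sketch, assuming a broker reachable at `localhost:9092` and a topic named `test` (both placeholders, not from the code above):

```scala
import java.util.Properties

import org.apache.kafka.common.serialization.StringSerializer

import com.eitcloud.util.KafkaSink

object KafkaSinkSmokeTest {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.setProperty("bootstrap.servers", "localhost:9092") // placeholder broker
    props.setProperty("key.serializer", classOf[StringSerializer].getName)
    props.setProperty("value.serializer", classOf[StringSerializer].getName)

    val sink = KafkaSink[String, String](props)
    // The KafkaProducer is only instantiated here, on the first send,
    // because `producer` is a lazy val.
    val metadata = sink.send("test", "hello").get()
    println(s"Wrote to ${metadata.topic()} / partition ${metadata.partition()}")
  }
}
```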
- Next, we broadcast the KafkaSink to every executor as a broadcast variable, so that each executor can write its data to Kafka:
```scala
package com.eitcloud.Entrance

import java.util.Properties

import com.eitcloud.util.KafkaSink
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Test2 {

  def main(args: Array[String]): Unit = {
    // Silence the noisier framework logging.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val conf = new SparkConf()
    conf.setExecutorEnv("SPARK_JAVA_OPTS", " -Xms8024m -Xmx12040m -XX:MaxPermSize=30840m")
    conf.setMaster("local[4]")
    conf.setAppName(s"${this.getClass.getSimpleName}")
    val sc: SparkContext = new SparkContext(conf)

    val rdd: RDD[String] = sc.parallelize(Array("1", "2", "4", "5", "6"))

    // Broadcast the KafkaSink: only the serializable factory is shipped;
    // the actual KafkaProducer is created lazily on each executor.
    val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
      val kafkaProducerConfig = {
        val p = new Properties()
        p.setProperty("bootstrap.servers", "192.168.2.116:9092")
        p.setProperty("key.serializer", classOf[StringSerializer].getName)
        p.setProperty("value.serializer", classOf[StringSerializer].getName)
        p
      }
      sc.broadcast(KafkaSink[String, String](kafkaProducerConfig))
    }

    // Write each record to the "lili" topic.
    rdd.foreach { record =>
      kafkaProducer.value.send("lili", record)
    }

    sc.stop()
  }
}
```
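For anything larger than a toy RDD, a common variant (a sketch, not part of the original code) is to write per partition instead of per record, so the broadcast value is dereferenced once per task rather than once per element:

```scala
// Per-partition variant of the write above; same broadcast `kafkaProducer`
// and the same "lili" topic.
rdd.foreachPartition { partition =>
  val sink = kafkaProducer.value // fetch the broadcast value once per partition
  partition.foreach(record => sink.send("lili", record))
}
```

Note that `send` is asynchronous: records may still sit in the producer's buffer when a task finishes, and it is the shutdown hook registered in `KafkaSink` that flushes them by closing the producer on JVM exit.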