package com.ws.spark
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
/**
 * Demonstrates Spark's RDD sampling operators.
 *
 * `takeSample` returns a local collection with an exact element count, while
 * `sample` returns an RDD whose size is only approximately `fraction` of the
 * input — this contrast is useful when diagnosing data skew by key.
 */
object SampleTest {

  def main(args: Array[String]): Unit = {
    // App name matches the object (original said "WordCount" — copy-paste leftover).
    val conf = new SparkConf().setAppName("SampleTest").setMaster("local[*]")
    val sparkContext = new SparkContext(conf)
    try {
      val data = Array(
        ("hello", 1), ("good", 1), ("good", 1), ("good", 1), ("beautiful", 1),
        ("nice", 1), ("beautiful", 1), ("good", 1), ("good", 1), ("beautiful", 1),
        ("good", 1), ("beautiful", 1), ("good", 1), ("good", 1), ("good", 1)
      )
      val rdd1: RDD[(String, Int)] = sparkContext.makeRDD(data)
      // Cached because the RDD is sampled twice below.
      rdd1.cache()

      // Sampling to investigate data skew.
      // takeSample returns a local Array (not an RDD) with exactly `num` elements.
      //   withReplacement: whether an element may be drawn more than once.
      //   num: exact number of elements to return.
      val exactSample: Array[(String, Int)] = rdd1.takeSample(withReplacement = false, num = 5)
      println(exactSample.toBuffer)

      // sample returns an RDD; `fraction` is the expected proportion of rows,
      // so the result size is approximate, not exact.
      val fractionSample: RDD[(String, Int)] = rdd1.sample(withReplacement = false, fraction = 0.2)
      val countsByKey: collection.Map[String, Long] = fractionSample.countByKey()
      val buffer: mutable.Buffer[(String, Long)] = countsByKey.toBuffer
      println(buffer)
    } finally {
      // Always release Spark resources, even if sampling throws.
      sparkContext.stop()
    }
  }
}
// spark: sample — RDD data-sampling demo
// Adapted from: blog.csdn.net/bb23417274/article/details/87890605