// countByKey and countByValue both count elements of an RDD: countByKey requires a
// key-value (pair) RDD and counts occurrences per key, while countByValue works on any
// RDD and counts occurrences of each distinct value.
package com.cb.spark.sparkrdd;
import java.util.Arrays;
import java.util.function.BiConsumer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
public class CountByExample {

    /**
     * Demonstrates the difference between {@code countByKey} and {@code countByValue}:
     * the former is only available on a pair RDD and tallies occurrences per key,
     * the latter works on any RDD and tallies occurrences of each distinct element.
     * Both return a {@code java.util.Map} to the driver.
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CountBy").setMaster("local");
        // JavaSparkContext is Closeable; try-with-resources guarantees the context is
        // shut down even if an action throws (the original only called stop() on the
        // happy path, leaking the context on failure).
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaPairRDD<Integer, String> pairRdd = jsc.parallelizePairs(
                    Arrays.asList(
                            new Tuple2<>(3, "Gnu"),
                            new Tuple2<>(3, "Yak"),
                            new Tuple2<>(5, "Mouse"),
                            new Tuple2<>(3, "Dog")),
                    2); // 2 partitions

            // countByKey: Map<key, count of tuples with that key>.
            // Lambda instead of an anonymous BiConsumer, consistent with the
            // countByValue call below.
            pairRdd.countByKey().forEach((key, count) -> System.out.println(key + ":" + count));

            JavaRDD<Integer> rdd = jsc.parallelize(
                    Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 3, 2, 6, 5, 4, 7));

            // countByValue: Map<element, number of occurrences>.
            rdd.countByValue().forEach((value, count) -> System.out.print("<" + value + ":" + count + ">\t"));
        }
    }
}