添加依赖:
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.12.8</version>
</dependency>
<dependency>
<groupId>com.thoughtworks.paranamer</groupId>
<artifactId>paranamer</artifactId>
<version>2.8</version>
</dependency>
public class JavaWordCount {
public static void main(String[] args) {
/**
* conf:
* 1.设置spark的运行模式
* 2.设置spark在webui中显示的application的名称
* 3.设置当前spark application运行所需要的资源
*
* Spark运行模式:
* 1.local:开发过程中使用本地模式,多用于测试
* 2.stanalone:Spark自带的资源调度框架,支持分布式搭建
* 3.yarn:hadoop生态圈的资源调度框架
*/
SparkConf conf = new SparkConf();
conf.setMaster("local");
conf.setAppName("JavaWordCount");
/**
* SparkContext是通往集群的唯一通道
*/
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> lines = sc.textFile("C:/words.txt");
/**
* flatMap:进一条数据出多条数据
*/
JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
public Iterator<String> call(String line) throws Exception {
return Arrays.asList(line.split(" ")).iterator();
}
});
/**
* 将RDD转换成k,v格式
*/
JavaPairRDD<String, Integer> pairWords = words.mapToPair(new PairFunction<String, String, Integer>() {
public Tuple2<String, Integer> call(String s) throws Exception {
return new Tuple2<String, Integer>(s, 1);
}
});
/**
* reduceByKey
* 1.先将相同的key分组
* 2.对每组key对应的value按照逻辑去处理
*/
JavaPairRDD<String, Integer> reduce = pairWords.reduceByKey(new Function2<Integer, Integer, Integer>() {
public Integer call(Integer integer, Integer integer2) throws Exception {
return integer + integer2;
}
});
/**
* 排序
* 1.先调换k,v再排序,在调换回来
* */
JavaPairRDD<Integer, String> mapToPair = reduce.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
public Tuple2<Integer, String> call(Tuple2<String, Integer> tuple) throws Exception {
return tuple.swap();
}
});
JavaPairRDD<Integer, String> sortByKey = mapToPair.sortByKey();
JavaPairRDD<String, Integer> result = sortByKey.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
public Tuple2<String, Integer> call(Tuple2<Integer, String> tuple) throws Exception {
return tuple.swap();
}
});
result.foreach(new VoidFunction<Tuple2<String, Integer>>() {
public void call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
System.out.println(stringIntegerTuple2);
}
});
sc.stop();
}
}
/**
 * Word-count example written against Spark's Scala API.
 *
 * Reads a text file, splits every line on single spaces, counts how often each
 * token occurs and prints the (word, count) pairs after sorting them by count.
 */
object ScalaWordCount {
  /**
   * Runs the word count in local mode against `C:/words.txt`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Condensed form of the same pipeline (without the sort step):
    //   sc.textFile("C:/words.txt").flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).foreach(println)

    // Spark configuration
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("ScalaWordCount")
    val sc = new SparkContext(conf)
    try {
      // Read the input
      val lines = sc.textFile("C:/words.txt")
      // Split each line into words
      val words = lines.flatMap(_.split(" "))
      // Count: map every word to (word, 1) and sum the ones per word
      val pairWords = words.map(word => (word, 1))
      val result = pairWords.reduceByKey(_ + _)
      // Sort by the second tuple element (the count), then print each pair
      result.sortBy(_._2).foreach(println)
    } finally {
      // Always stop the context, even when a stage fails (e.g. the input file
      // is missing), so the context is not left running.
      sc.stop()
    }
  }
}