1. HDFS目录/data下有数据文件peopleinfo.txt,该文件包含序号、性别和身高三列,形式如下:
1 F 170
2 M 178
3 M 174
4 F 165
编写Spark应用程序,对HDFS上的数据文件peopleinfo.txt进行统计,计算得到男性总数、女性总数、男性最高身高、女性最高身高、男性最低身高、女性最低身高。
package blog.p4
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Solution to exercise 1: per-gender head count, tallest height and shortest height.
 *
 * Each input line is expected to hold three whitespace-separated columns:
 * sequence number, gender and height. Lines that do not match are skipped.
 *
 * @author Daniel
 */
object Test1 {

  /** Input path used when none is given on the command line. */
  private val DefaultInputPath = "/data/peopleinfo.txt"

  /**
   * Parses one input line into a (gender, height) pair.
   *
   * @param line raw text line
   * @return the pair, or None when the line does not have exactly three columns
   *         or its height column is not a number
   */
  private def parseLine(line: String): Option[(String, Double)] =
    // Trim first so that leading whitespace does not produce an empty first column.
    line.trim.split("\\s+") match {
      case Array(_, gender, height) =>
        scala.util.Try(height.toDouble).toOption.map(parsed => (gender, parsed))
      case _ => None
    }

  /** Combines two partial results that belong to the same gender. */
  private def merge(left: Result, right: Result): Result =
    Result(
      left.gender,
      left.total + right.total,
      math.max(left.maxHeight, right.maxHeight),
      math.min(left.minHeight, right.minHeight))

  /**
   * Runs the job and prints one Result per gender.
   *
   * @param args optional; args(0) overrides the default input path
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(s"${Test1.getClass.getSimpleName}")
      // Only fall back to local mode when no master was supplied (e.g. by spark-submit).
      .setIfMissing("spark.master", "local[*]")
    val sc = new SparkContext(conf)
    try {
      val inputPath = args.headOption.getOrElse(DefaultInputPath)
      sc.textFile(inputPath)
        .flatMap(line => parseLine(line))
        // Seed every person as a one-element result so reduceByKey can fold them.
        .map { case (gender, height) => (gender, Result(gender, 1, height, height)) }
        .reduceByKey((left, right) => merge(left, right))
        // Bring the (tiny) result to the driver so it is printed there.
        .collect()
        .foreach(println)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
/**
 * Aggregated statistics for one gender.
 *
 * @param gender    gender label the statistics belong to
 * @param total     number of people counted
 * @param maxHeight largest height seen
 * @param minHeight smallest height seen
 */
case class Result(
    gender: String,
    total: Int,
    maxHeight: Double,
    minHeight: Double)
2.给定数据如下:
12 张三 25 男 chinese 50
12 张三 25 男 math 60
12 张三 25 男 english 70
12 李四 20 男 chinese 50
12 李四 20 男 math 50
12 李四 20 男 english 50
12 王芳 19 女 chinese 70
12 王芳 19 女 math 70
12 王芳 19 女 english 70
13 张大三 25 男 chinese 60
13 张大三 25 男 math 60
13 张大三 25 男 english 70
13 李大四 20 男 chinese 50
13 李大四 20 男 math 60
13 李大四 20 男 english 50
13 王小芳 19 女 chinese 70
13 王小芳 19 女 math 80
13 王小芳 19 女 english 70
需求如下:
1. 一共有多少人参加考试?
1.1 一共有多少个小于20岁的人参加考试?
1.2 一共有多少个等于20岁的人参加考试?
1.3 一共有多少个大于20岁的人参加考试?
2. 一共有多少个男生参加考试?
2.1 一共有多少个女生参加考试?
3. 12班有多少人参加考试?
3.1 13班有多少人参加考试?
4. 语文科目的平均成绩是多少?
4.1 数学科目的平均成绩是多少?
4.2 英语科目的平均成绩是多少?
5. 单个人平均成绩是多少?
6. 12班平均成绩是多少?
6.1 12班男生平均总成绩是多少?
6.2 12班女生平均总成绩是多少?
6.3 同理求13班相关成绩
7. 全校语文成绩最高分是多少?
7.1 12班语文成绩最低分是多少?
7.2 13班数学最高成绩是多少?
8. 总成绩大于150分的12班的女生有几个?
package blog.p4
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Solution to exercise 2: head counts, averages and extreme scores over exam records.
 *
 * Each input line is expected to hold six whitespace-separated columns:
 * class id, name, age, gender, subject and score. Lines that do not match are skipped.
 * Students are identified by name only, as in the exercise data.
 *
 * @author Daniel
 */
object Test2 {

  /** (classId, name, age, gender, subject, score), kept as the raw text columns. */
  private type Record = (String, String, String, String, String, String)

  /** Input path used when none is given on the command line. */
  private val DefaultInputPath = "file:/F:/test/test3.txt"

  /**
   * Parses one input line into a Record.
   *
   * @param line raw text line
   * @return the record, or None when the line does not have exactly six columns,
   *         its age is not an integer or its score is not a number
   */
  private def parseLine(line: String): Option[Record] =
    line.trim.split("\\s+") match {
      case Array(classId, name, age, gender, subject, score)
          if scala.util.Try(age.toInt).isSuccess && scala.util.Try(score.toDouble).isSuccess =>
        Some((classId, name, age, gender, subject, score))
      case _ => None
    }

  /**
   * Mean score over the given records, computed in a single pass.
   *
   * @return the mean, or NaN when there are no records
   */
  private def averageScore(records: org.apache.spark.rdd.RDD[Record]): Double = {
    val (sum, count) = records
      .map(_._6.toDouble)
      .aggregate((0.0, 0L))(
        (acc, score) => (acc._1 + score, acc._2 + 1L),
        (left, right) => (left._1 + right._1, left._2 + right._2))
    sum / count
  }

  /**
   * Picks the highest or lowest score, comparing numerically rather than as text
   * (as text, "100" would sort before "50").
   *
   * @param records records to scan; must not be empty, otherwise the underlying reduce throws
   * @param highest true for the maximum, false for the minimum
   * @return the winning score exactly as it appears in the input
   */
  private def pickScore(records: org.apache.spark.rdd.RDD[Record], highest: Boolean): String =
    records
      .map(_._6)
      .reduce { (a, b) =>
        val keepFirst = if (highest) a.toDouble >= b.toDouble else a.toDouble <= b.toDouble
        if (keepFirst) a else b
      }

  /**
   * Runs all eight reports and prints them.
   *
   * @param args optional; args(0) overrides the default input path
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(s"${Test2.getClass.getSimpleName}")
      // Only fall back to local mode when no master was supplied (e.g. by spark-submit).
      .setIfMissing("spark.master", "local[*]")
    val sc = new SparkContext(conf)
    try {
      val inputPath = args.headOption.getOrElse(DefaultInputPath)
      // Cached because every report below starts a new job from these records.
      val studentInfo = sc.textFile(inputPath).flatMap(line => parseLine(line)).cache()

      // 1: every student has one record per subject, so de-duplicate before counting.
      val nums = studentInfo.map(_._2).distinct().count()
      val nameAndAge = studentInfo.map(r => (r._2, r._3.toInt)).distinct()
      val less20 = nameAndAge.filter(_._2 < 20).count()
      val equal20 = nameAndAge.filter(_._2 == 20).count()
      val more20 = nameAndAge.filter(_._2 > 20).count()
      println(s"①一共有${nums}人参加考试,小于20岁的${less20}人,等于20岁的${equal20}人,大于20岁的${more20}人")

      // 2: students per gender.
      val nameAndGender = studentInfo.map(r => (r._2, r._4)).distinct()
      val male = nameAndGender.filter(_._2 == "男").count()
      val female = nameAndGender.filter(_._2 == "女").count()
      println(s"②男:${male},女:${female}")

      // 3: students per class.
      val classAndName = studentInfo.map(r => (r._1, r._2)).distinct()
      val class12 = classAndName.filter(_._1 == "12").count()
      val class13 = classAndName.filter(_._1 == "13").count()
      println(s"③12班:${class12},13班:${class13}")

      // 4: mean score per subject.
      val chineseAvg = averageScore(studentInfo.filter(_._5 == "chinese"))
      val mathAvg = averageScore(studentInfo.filter(_._5 == "math"))
      val englishAvg = averageScore(studentInfo.filter(_._5 == "english"))
      println(s"④语文成绩为:${chineseAvg},数学成绩为:${mathAvg},英文成绩为:${englishAvg}")

      // 5: mean score per student, keyed by name.
      print("⑤")
      studentInfo
        .map(r => (r._2, (r._6.toDouble, 1)))
        .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
        .mapValues { case (sum, count) => sum / count }
        .collect()
        .foreach(println)

      // 6: mean score per class, overall and split by gender.
      // NOTE(review): requirements 6.1/6.2 may instead mean the mean of per-student
      // total scores; this keeps the per-record mean - confirm which one is wanted.
      Seq("12", "13").foreach { classId =>
        val inClass = studentInfo.filter(_._1 == classId)
        val classAvg = averageScore(inClass)
        val maleAvg = averageScore(inClass.filter(_._4 == "男"))
        val femaleAvg = averageScore(inClass.filter(_._4 == "女"))
        println(s"⑥${classId}班平均成绩:${classAvg},男生:${maleAvg},女生:${femaleAvg}")
      }

      // 7: extreme scores.
      val chinese = studentInfo.filter(_._5 == "chinese")
      val chineseMax = pickScore(chinese, highest = true)
      val class12ChineseMin = pickScore(chinese.filter(_._1 == "12"), highest = false)
      val class13MathMax =
        pickScore(studentInfo.filter(r => r._5 == "math" && r._1 == "13"), highest = true)
      println(s"⑦全校语文最高分为:${chineseMax},12班语文最低分为:${class12ChineseMin},13班数学最高分为:${class13MathMax}")

      // 8: girls in class 12 whose total score exceeds 150.
      val count150 = studentInfo
        .filter(r => r._1 == "12" && r._4 == "女")
        .map(r => (r._2, r._6.toDouble))
        .reduceByKey(_ + _)
        .filter(_._2 > 150)
        .count()
      println(s"⑧总成绩大于150分的12班的女生有${count150}个")
    } finally {
      // Release the context even when a report fails.
      sc.stop()
    }
  }
}