球队 比赛 反超球员及连续得分球员问题计算
问题:两支篮球队进行了激烈的篮球比赛,比分交替上升。比赛结束后,你有一张两队得分的明细表,记录了球队team、球员号码number、球员姓名name、得分分数score以及得分时间score_time(string,秒级)。现在球队要对比赛中表现突出的球员做出嘉奖,所以请你用sql统计出:(注:下文的示例表做了简化,未包含name列,以球员号码number标识球员,score_time以int型秒数存储。)
1)比赛中帮助各自球队反超比分的球员姓名以及对应时间。
2)连续三次(及以上)为球队得分的球员名单以及对应时间段内获得的分数。
一 数据准备
1建表
# Create the HDFS directory backing the external table.
# -p creates missing parent directories and does not fail if the directory already exists.
hdfs dfs -mkdir -p '/tmp/yj_ext_csv_unc_basketball_game'
-- External, comma-delimited text table holding one row per scoring event.
-- Partitioned by dt; each partition is attached to its own HDFS directory.
use tmp;
create external table tmp.yj_ext_csv_unc_basketball_game
(
    team       string,  -- team identifier ('a' / 'b' in the sample data)
    number     int,     -- player number
    score      int,     -- points scored in this event
    score_time int      -- time of the score, in seconds
)
partitioned by (dt string)
row format delimited
    fields terminated by ','
    lines terminated by '\n'
stored as textfile
location '/tmp/yj_ext_csv_unc_basketball_game/';
2.导入数据
数据样例:
a,2,1,0
a,1,2,1
a,1,3,2
b,1,4,3
a,1,2,4
b,2,5,5
b,6,3,6
a,1,5,7
a,3,2,8
b,2,4,9
b,5,3,10
a,1,2,11
a,1,3,12
b,5,4,13
a,4,3,14
a,4,2,15
a,4,3,16
b,6,7,17
# Upload the sample file into the directory used as partition dt=20190527.
# The directory is created first: if the destination does not exist, -put would
# write a plain file named 20190527 instead of placing the file inside a directory.
# NOTE(review): assumes the local sample file is basketball.csv (".cav" looked like a typo) - confirm.
hdfs dfs -mkdir -p /tmp/yj_ext_csv_unc_basketball_game/20190527
hdfs dfs -put basketball.csv /tmp/yj_ext_csv_unc_basketball_game/20190527/
-- Register the uploaded directory as partition dt='20190527' of the external table.
-- "if not exists" keeps the statement re-runnable.
alter table tmp.yj_ext_csv_unc_basketball_game add if not exists partition (dt='20190527') location '/tmp/yj_ext_csv_unc_basketball_game/20190527';
二 问题分析
- 1 计算比赛中帮助各自球队反超比分的球员姓名以及对应时间
计算a,b球队的累计得分总和;一个时间点上只有一个球队得分,所以同一时间点如果a球队得分,那么b球队一定得零分,一行数据增加对手球队信息
-- Attach the opposing team to every scoring event of partition 20190527.
-- Only one team scores at a given time point, so the opponent's player
-- number and score are recorded as 0 on that row.
select
    g.team,
    g.number,
    g.score,
    g.score_time,
    g.dt,
    case when g.team = 'a' then 'b' else 'a' end as team_2,
    0 as number_2,
    0 as score_2
from tmp.yj_ext_csv_unc_basketball_game g
where g.dt = '20190527';
数据结果(最后两列sum_a、sum_b为下一步计算出的累计得分,此处一并列出便于对照)
team number score score_time dt team_2 number_2 score_2 sum_a sum_b
a 2 1 0 20190527 b 0 0 1 0
a 1 2 1 20190527 b 0 0 3 0
a 1 3 2 20190527 b 0 0 6 0
b 1 4 3 20190527 a 0 0 6 4
a 1 2 4 20190527 b 0 0 8 4
b 2 5 5 20190527 a 0 0 8 9
b 6 3 6 20190527 a 0 0 8 12
a 1 5 7 20190527 b 0 0 13 12
a 3 2 8 20190527 b 0 0 15 12
b 2 4 9 20190527 a 0 0 15 16
b 5 3 10 20190527 a 0 0 15 19
a 1 2 11 20190527 b 0 0 17 19
a 1 3 12 20190527 b 0 0 20 19
b 5 4 13 20190527 a 0 0 20 23
a 4 3 14 20190527 b 0 0 23 23
a 4 2 15 20190527 b 0 0 25 23
a 4 3 16 20190527 b 0 0 28 23
b 6 7 17 20190527 a 0 0 28 30
-- Compute the running (cumulative) score of teams a and b after every scoring event
-- and persist it as tmp.yj_bask_tmp.
-- The inner query adds the opponent columns (team_2, number_2, score_2 = 0); the outer
-- query accumulates each team's points in score_time order. On a row scored by the
-- other team the contribution is score_2, i.e. 0.
-- The frame is spelled out as ROWS so every row gets its own running total.
create table tmp.yj_bask_tmp as
select
    e.team,
    e.number,
    e.score,
    e.score_time,
    e.dt,
    e.team_2,
    e.number_2,
    e.score_2,
    sum(case when e.team = 'a' then e.score else e.score_2 end)
        over (order by e.score_time rows between unbounded preceding and current row) as sum_a,
    sum(case when e.team = 'b' then e.score else e.score_2 end)
        over (order by e.score_time rows between unbounded preceding and current row) as sum_b
from (
    select
        team,
        number,
        score,
        score_time,
        dt,
        case when team = 'a' then 'b' else 'a' end as team_2,
        0 as number_2,
        0 as score_2
    from tmp.yj_ext_csv_unc_basketball_game
    where dt = '20190527'
) e;
-- Result columns: team number score score_time dt team_2 number_2 score_2 sum_a sum_b
数据结果
a 2 1 0 20190527 b 0 0 1 0
a 1 2 1 20190527 b 0 0 3 0
a 1 3 2 20190527 b 0 0 6 0
b 1 4 3 20190527 a 0 0 6 4
a 1 2 4 20190527 b 0 0 8 4
b 2 5 5 20190527 a 0 0 8 9
b 6 3 6 20190527 a 0 0 8 12
a 1 5 7 20190527 b 0 0 13 12
a 3 2 8 20190527 b 0 0 15 12
b 2 4 9 20190527 a 0 0 15 16
b 5 3 10 20190527 a 0 0 15 19
a 1 2 11 20190527 b 0 0 17 19
a 1 3 12 20190527 b 0 0 20 19
b 5 4 13 20190527 a 0 0 20 23
a 4 3 14 20190527 b 0 0 23 23
a 4 2 15 20190527 b 0 0 25 23
a 4 3 16 20190527 b 0 0 28 23
b 6 7 17 20190527 a 0 0 28 30
a队被反超时b队的球员信息
-- Scoring events at which team b takes the lead away from team a.
--   a_lose / b_lose           : score margin after this event (a minus b / b minus a)
--   rank                      : position of the event in score_time order
--   float_a_lost/float_b_lose : the same margins after the previous event
-- The previous event is read with lag() instead of self-joining the table on rank + 1.
-- last_nonzero_a_lead is the most recent non-tied margin before this event, so a lead
-- change that passes through a tie (a ahead -> tied -> b ahead) is also reported.
-- When the previous event was not a tie it equals float_a_lost.
select
    team,
    number,
    score,
    score_time,
    dt,
    team_2,
    number_2,
    score_2,
    sum_a,
    sum_b,
    a_lose,
    b_lose,
    rank,
    float_a_lost,
    float_b_lose
from (
    select
        t.*,
        (t.sum_a - t.sum_b) as a_lose,
        (t.sum_b - t.sum_a) as b_lose,
        row_number() over (order by t.score_time) as rank,
        lag(t.sum_a - t.sum_b) over (order by t.score_time) as float_a_lost,
        lag(t.sum_b - t.sum_a) over (order by t.score_time) as float_b_lose,
        last_value(case when t.sum_a <> t.sum_b then t.sum_a - t.sum_b end, true)
            over (order by t.score_time rows between unbounded preceding and 1 preceding) as last_nonzero_a_lead
    from tmp.yj_bask_tmp t
) m
where a_lose < 0 and last_nonzero_a_lead > 0;
team number score score_time dt team_2 number_2 score_2 sum_a sum_b a_lose b_lose rank float_a_lost float_b_lose
数据结果
b 2 5 5 20190527 a 0 0 8 9 -1 1 6 4 -4
b 2 4 9 20190527 a 0 0 15 16 -1 1 10 3 -3
b 5 4 13 20190527 a 0 0 20 23 -3 3 14 1 -1
b 6 7 17 20190527 a 0 0 28 30 -2 2 18 5 -5
- 2 连续三次(及以上)为球队得分的球员名单以及对应时间段内获得的分数
-- Players of team a who scored three or more consecutive times for their team.
-- Every team-a scoring event is numbered in time order (team_seq). Within one player the
-- events are numbered again; the difference between the two numberings stays constant
-- across a run of consecutive scores, so it serves as the streak key.
-- Output per (player, streak): player number, points in the streak (or -100 when the
-- streak is shorter than three events), and the number of events in the streak.
with team_a_events as (
    select
        number,
        score,
        row_number() over (order by score_time) as team_seq
    from tmp.yj_ext_csv_unc_basketball_game
    where team = 'a'
      and dt = '20190527'
),
streak_tagged as (
    select
        number,
        score,
        team_seq - row_number() over (partition by number order by team_seq) as streak_key
    from team_a_events
)
select
    number,
    case when count(1) >= 3 then sum(score) else -100 end,
    count(1)
from streak_tagged
group by number, streak_key;
数据结果
number _c1 _c2
1 12 4
1 -100 2
2 -100 1
3 -100 1
4 8 3