spark count(distinct)over() 数据处理
业务描述
有这么一个业务,需要过滤排除掉相同设备不同账号,以及相同账号不同设备的数据,只留下设备与账号一对一的数据:
如果从关系型数据库的角度来看,A 与 B 是多对多关系,需要找出并保留其中 A 与 B 一对一的数据。
数据准备
设备id | 账号 |
---|---|
1 | a |
2 | b |
2 | c |
3 | b |
3 | d |
4 | d |
5 | e |
5 | e |
/*
从数据上看只有(1,a),(5,e)满足一对一的要求,
设备2存在b/c两个账号,设备3存在b/d两个账号,账号b存在2、3两个设备,账号d存在3、4两个设备
*/
-- 首先模拟hive处理数据, hive --version => Hive 2.1.1-cdh6.2.0
-- Keep only the (dev_id, acc) rows where the device is tied to exactly one
-- account and the account is tied to exactly one device.
-- Hive form, relies on COUNT(DISTINCT ...) used as a window function.
WITH da AS (
    -- Inline sample data, one row per observed (device, account) pair
    SELECT 1 AS dev_id, 'a' AS acc
    UNION ALL
    SELECT 2 AS dev_id, 'b' AS acc
    UNION ALL
    SELECT 2 AS dev_id, 'c' AS acc
    UNION ALL
    SELECT 3 AS dev_id, 'b' AS acc
    UNION ALL
    SELECT 3 AS dev_id, 'd' AS acc
    UNION ALL
    SELECT 4 AS dev_id, 'd' AS acc
    UNION ALL
    SELECT 5 AS dev_id, 'e' AS acc
    UNION ALL
    SELECT 5 AS dev_id, 'e' AS acc
),
pair_stats AS (
    SELECT
        dev_id,
        acc,
        -- number of distinct devices seen for this account
        COUNT(DISTINCT dev_id) OVER (PARTITION BY acc) AS devices_per_acc,
        -- number of distinct accounts seen for this device
        COUNT(DISTINCT acc) OVER (PARTITION BY dev_id) AS accs_per_dev
    FROM da
)
SELECT
    dev_id,
    acc
FROM pair_stats
WHERE devices_per_acc = 1
  AND accs_per_dev = 1;
出现问题
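将 HiveQL 移植到 Spark 运行时会报错:Spark SQL 不支持在窗口函数中使用 DISTINCT,执行下面的代码会抛出 AnalysisException(Distinct window functions are not supported)。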
// Direct port of the Hive query to Spark: the SQL text is passed to spark.sql
// and the resulting DataFrame is printed with show().
// The sample rows are inlined as a UNION ALL subquery in the FROM clause.
// stripMargin removes the leading '|' from every line of the SQL text; the
// s-interpolator is present but the string contains no $ substitutions.
// NOTE(review): the article presents this snippet as the failing case. Spark SQL
// is expected to reject COUNT(DISTINCT ...) OVER (...) during analysis
// ("Distinct window functions are not supported") - confirm on your Spark version.
spark.sql(
s"""
|SELECT dev_id, acc FROM
|(SELECT dev_id ,--设备
|acc , --账号
|COUNT(DISTINCT dev_id) OVER(PARTITION BY acc) sadd_cnt, --相同账号不同设备个数
|COUNT(DISTINCT acc) OVER(PARTITION BY dev_id) sdda_cnt --相同设备不同账号个数
|from (SELECT 1 dev_id, 'a' acc UNION ALL
|SELECT 2 dev_id, 'b' acc UNION ALL
|SELECT 2 dev_id, 'c' acc UNION ALL
|SELECT 3 dev_id, 'b' acc UNION ALL
|SELECT 3 dev_id, 'd' acc UNION ALL
|SELECT 4 dev_id, 'd' acc UNION ALL
|SELECT 5 dev_id, 'e' acc UNION ALL
|SELECT 5 dev_id, 'e' acc) )
|t where sadd_cnt = 1 and sdda_cnt = 1
""".stripMargin).show()
解决问题
参考 Hive 中针对 count(distinct) over() 的解决办法,用 SIZE(COLLECT_SET(...)) OVER() 代替 COUNT(DISTINCT ...) OVER():
-- Keep only the (dev_id, acc) rows where the device is tied to exactly one
-- account and the account is tied to exactly one device.
-- Workaround form: the distinct count per partition is obtained as the size
-- of the set collected over the window, so no DISTINCT window aggregate is used.
WITH da AS (
    -- Inline sample data, one row per observed (device, account) pair
    SELECT 1 AS dev_id, 'a' AS acc
    UNION ALL
    SELECT 2 AS dev_id, 'b' AS acc
    UNION ALL
    SELECT 2 AS dev_id, 'c' AS acc
    UNION ALL
    SELECT 3 AS dev_id, 'b' AS acc
    UNION ALL
    SELECT 3 AS dev_id, 'd' AS acc
    UNION ALL
    SELECT 4 AS dev_id, 'd' AS acc
    UNION ALL
    SELECT 5 AS dev_id, 'e' AS acc
    UNION ALL
    SELECT 5 AS dev_id, 'e' AS acc
),
pair_stats AS (
    SELECT
        dev_id,
        acc,
        -- number of distinct devices seen for this account
        SIZE(COLLECT_SET(dev_id) OVER (PARTITION BY acc)) AS devices_per_acc,
        -- number of distinct accounts seen for this device
        SIZE(COLLECT_SET(acc) OVER (PARTITION BY dev_id)) AS accs_per_dev
    FROM da
)
SELECT
    dev_id,
    acc
FROM pair_stats
WHERE devices_per_acc = 1
  AND accs_per_dev = 1;
改用spark scala代码
// Spark version of the workaround: the SQL text is passed to spark.sql and the
// resulting DataFrame is printed with show().
// SIZE(COLLECT_SET(col)) OVER (PARTITION BY ...) is used in place of
// COUNT(DISTINCT col) OVER (...) to get the number of distinct values per partition.
// The outer query keeps rows where both counts equal 1, i.e. the account has a
// single device and the device has a single account.
// stripMargin removes the leading '|' from every line of the SQL text; the
// s-interpolator is present but the string contains no $ substitutions.
spark.sql(
s"""
|WITH da AS (
|SELECT 1 dev_id, 'a' acc UNION ALL
|SELECT 2 dev_id, 'b' acc UNION ALL
|SELECT 2 dev_id, 'c' acc UNION ALL
|SELECT 3 dev_id, 'b' acc UNION ALL
|SELECT 3 dev_id, 'd' acc UNION ALL
|SELECT 4 dev_id, 'd' acc UNION ALL
|SELECT 5 dev_id, 'e' acc UNION ALL
|SELECT 5 dev_id, 'e' acc)
|SELECT dev_id, acc FROM
|(SELECT dev_id ,--设备
|acc , --账号
|SIZE(COLLECT_SET( dev_id) OVER(PARTITION BY acc)) sadd_cnt, --相同账号不同设备个数
|SIZE(COLLECT_SET( acc) OVER(PARTITION BY dev_id)) sdda_cnt --相同设备不同账号个数
|FROM da) t WHERE sadd_cnt = 1 AND sdda_cnt = 1
""".stripMargin).show()
参考站点
HIVE----count(distinct ) over() 无法使用解决办法: https://www.cnblogs.com/luckyfruit/p/13093203.html