目录
1.问题
1.求一个页面用户曝光次数的中位数
2.求多个页面用户曝光次数的中位数
1.2数据准备
-- Fixture table for the median examples: one row per (page, uid) pair.
-- All three columns are STRING; c carries the exposure count as digit text
-- (e.g. '44'), so numeric work on it needs a cast.
CREATE TABLE IF NOT EXISTS test_exposure_count (
    page STRING,
    uid  STRING,
    c    STRING
);
-- Load the sample data, replacing whatever the table held before (OVERWRITE).
-- 14 rows for page 'ElementId-8GG5' (even count) and 13 rows for page
-- 'ElementId-G5BH' (odd count), so both median cases are exercised.
-- Values are positional: (page, uid, c); c is a count written as a string.
INSERT OVERWRITE TABLE test_exposure_count VALUES
('ElementId-8GG5','211861079118587230','44'),
('ElementId-8GG5','211830851265539182','31'),
('ElementId-8GG5','211840774078481572','64'),
('ElementId-8GG5','211830852525499241','88'),
('ElementId-8GG5','211830855055469232','11'),
('ElementId-8GG5','211840770048501629','50'),
('ElementId-8GG5','211830856145479147','12'),
('ElementId-8GG5','211830870375599205','86'),
('ElementId-8GG5','211861081548657320','43'),
('ElementId-8GG5','211830877435499343','50'),
('ElementId-8GG5','211861081098677442','56'),
('ElementId-8GG5','211861080558657317','3'),
('ElementId-8GG5','211840762428551551','21'),
('ElementId-8GG5','211830878365629243','13'),
('ElementId-G5BH','211840901008611824','44'),
('ElementId-G5BH','211840747348401546','31'),
('ElementId-G5BH','211840772058511551','64'),
('ElementId-G5BH','211840786458531560','88'),
('ElementId-G5BH','211840796068591649','11'),
('ElementId-G5BH','211840806518401669','50'),
('ElementId-G5BH','211840815448461734','12'),
('ElementId-G5BH','211840826188621683','86'),
('ElementId-G5BH','211840840418571714','43'),
('ElementId-G5BH','211840862028491888','50'),
('ElementId-G5BH','211840880248391869','56'),
('ElementId-G5BH','211840739268691553','3'),
('ElementId-G5BH','211840903258401893','21');
-- Row count per page. The page column is selected so each count can be
-- attributed to its page (a bare COUNT(*) would return unlabeled numbers).
SELECT page
      ,COUNT(*) AS cnt
FROM test_exposure_count
GROUP BY page
;
ElementId-8GG5 14
ElementId-G5BH 13
2.问题解决思路
2.1问题1
2.1.1 方法一 求中间值序号
先求出中间这条数据的序号:14条数据时中间是第7和第8条,15条数据时是第8条。这里对序号的平均值用ceiling向上取整(14条时AVG(r)=7.5,取到第8条),因为这种写法不方便同时拿两个序号
-- Problem 1, method 1: number every row by value, then pick the row whose
-- number equals the (rounded-up) average of all row numbers.
--
-- * c is a STRING column, so it is cast before ordering; ordering the raw
--   strings is lexicographic ('3' would sort after '21').
-- * uid, page break ties so the returned row is the same on every run.
-- * With an even row count AVG(r) ends in .5 and CEILING picks the upper of
--   the two middle rows; this method returns one row, not their average.
WITH ranked AS (
    SELECT page
          ,uid
          ,c
          ,ROW_NUMBER() OVER (ORDER BY CAST(c AS BIGINT), uid, page) AS r
    FROM test_exposure_count
)
SELECT t1.page
      ,t1.uid
      ,t1.c
      ,t1.r
      ,t2.num
FROM ranked t1
INNER JOIN (
    -- Sequence number of the middle row.
    SELECT CEILING(AVG(r)) AS num
    FROM ranked
) t2
ON t1.r = t2.num
;
ElementId-G5BH 211840901008611824 44 14 14
2.1.2 方法二 正反排序
就比较简单取巧了,正反两个排序,正反序号相等或者刚好差一位就是中间那条数据了
-- Problem 1, method 2: number the rows in ascending and in descending value
-- order; the middle row(s) are those whose two numbers differ by at most 1
-- (one row when the count is odd, two when it is even). AVG gives the median.
--
-- * c is a STRING column, so it is cast to get numeric rather than
--   lexicographic ordering.
-- * The two orderings must be exact mirrors of each other. Without a
--   tie-breaker, rows with equal c may be numbered inconsistently and the
--   wrong rows pass the filter (e.g. values 1,1,2,3 can yield 2, not 1.5).
--   Assumes (uid, page) identifies a row -- TODO confirm for real data.
SELECT AVG(CAST(c AS BIGINT)) AS median_c
FROM (
    SELECT c
          ,ROW_NUMBER() OVER (ORDER BY CAST(c AS BIGINT) DESC, uid DESC, page DESC) AS r1
          ,ROW_NUMBER() OVER (ORDER BY CAST(c AS BIGINT) ASC, uid ASC, page ASC) AS r2
    FROM test_exposure_count
) numbered
WHERE ABS(r1 - r2) <= 1
;
44.0
2.2问题2
2.2.1 方法一 求中间值序号
和问题一思路基本一致
-- Problem 2, method 1: same idea as problem 1, but numbered per page.
-- Returns one row per page: the page and the value of its middle row.
--
-- * c is a STRING column, so it is cast before ordering; ordering the raw
--   strings is lexicographic ('3' would sort after '21').
-- * uid breaks ties so the row numbering is stable between runs.
-- * For a page with an even row count, CEILING picks the upper of the two
--   middle rows; this method does not average them.
WITH ranked AS (
    SELECT page
          ,c
          ,ROW_NUMBER() OVER (PARTITION BY page ORDER BY CAST(c AS BIGINT), uid) AS r
    FROM test_exposure_count
)
SELECT t1.page
      ,CAST(t1.c AS DOUBLE) AS middle_c
FROM ranked t1
INNER JOIN (
    -- Sequence number of the middle row within each page.
    SELECT page
          ,CEILING(AVG(r)) AS num
    FROM ranked
    GROUP BY page
) t2
ON t1.page = t2.page
AND t1.r = t2.num
;
ElementId-8GG5 44.0
ElementId-G5BH 44.0
2.2.2 方法二 正反排序
此时直接沿用问题1的写法就不行了
WHERE过滤后每个page各自的中间行都还在,但最后的ORDER BY + LIMIT 1只能返回一行,拿不到每个page各自的结果
-- Problem 2, method 2: per-page ascending/descending numbering. Kept as the
-- attempt that does NOT produce one result per page (see the last step).
--
-- * c is a STRING column, so it is cast to get numeric rather than
--   lexicographic ordering.
-- * uid is a mirrored tie-breaker so r1 and r2 are exact reverses within a
--   page. Assumes uid is unique within a page -- TODO confirm for real data.
SELECT page
      ,uid
      ,c
      ,r1
      ,r2
FROM (
    SELECT page
          ,uid
          ,c
          ,ROW_NUMBER() OVER (PARTITION BY page ORDER BY CAST(c AS BIGINT) DESC, uid DESC) AS r1
          ,ROW_NUMBER() OVER (PARTITION BY page ORDER BY CAST(c AS BIGINT) ASC, uid ASC) AS r2
    FROM test_exposure_count
) numbered
-- Up to this filter every page still has its middle row(s).
-- It breaks after this step: ORDER BY ... LIMIT 1 keeps a single row for the
-- whole result, so only one page's row is returned.
WHERE ABS(r1 - r2) <= 1
ORDER BY r1
LIMIT 1
;
最优解
两个问题都可以使用
还是求中间值序号,用了count窗口函数
-- Median of c per page: number the rows of each page by value, count the
-- rows of each page, keep the middle row(s) and average them.
--
-- * c is a STRING column, so it is cast for both ordering and averaging;
--   ordering the raw strings is lexicographic ('3' would sort after '21').
-- * COUNT(*) is used so cnt counts exactly the rows ROW_NUMBER() numbers
--   (COUNT(uid) would skip rows whose uid is NULL).
-- * The middle-row test uses integer arithmetic only, so it does not depend
--   on whether the engine's "/" is integer or floating-point division:
--     odd  cnt = 13 -> 2*rnk in [13, 15] -> rnk = 7
--     even cnt = 14 -> 2*rnk in [14, 16] -> rnk = 7, 8
SELECT page
      ,AVG(CAST(c AS BIGINT))
FROM (
    SELECT page
          ,c
          ,ROW_NUMBER() OVER (PARTITION BY page ORDER BY CAST(c AS BIGINT)) AS rnk
          ,COUNT(*) OVER (PARTITION BY page) AS cnt
    FROM test_exposure_count
) t
WHERE 2 * rnk BETWEEN cnt AND cnt + 2
GROUP BY page
;
ElementId-8GG5 43.5
ElementId-G5BH 44.0