问题
计算每日访问用户在之后各日期的留存数（例如 1/2/3/7/15/30 日留存）。
基础表：每日访问用户 ID（按 p_day 分区）。
-- Base table of visiting user IDs, partitioned by calendar day (p_day).
-- Declared external and stored as Parquet.
create external table if not exists user_visit_date
(
    user_id bigint comment '用户ID'        -- user ID
)
comment '每日访问用户'                      -- table: daily visiting users
partitioned by
(
    p_day date comment '分区日期'          -- partition date
)
stored as parquet
;
解决思考
Step 1. 先算出各历史访问日期的用户在昨日（${yesterday}）的留存数（关键思路）：每天只需写入一个 visit_date 分区，历史各日的留存数由这些分区逐日累积得到。
-- Intermediate table: for a given visit_date partition, one row per earlier
-- visit date (before_visit_date) with the number of that day's visitors who
-- were also seen on visit_date.
-- Declared external and stored as Parquet.
create external table if not exists user_before_visit_date
(
    before_visit_date date   comment '历史访问日期',            -- historical visit date
    remain_count      bigint comment '在visit_date留存人数'     -- users retained on visit_date
)
comment '历史访问日期在visit_date的留存人数'
partitioned by
(
    visit_date date comment '分区日期'                          -- partition date
)
stored as parquet
;
-- Step 1 load: (re)build the visit_date = ${yesterday} partition.
-- For every earlier day within the 30 days before ${yesterday}, count the rows
-- produced by joining that day's visitors to ${yesterday}'s visitors on user_id.
-- NOTE(review): count(1) counts joined rows, so it equals a head count only if
--   user_visit_date holds at most one row per (user_id, p_day) -- TODO confirm.
-- NOTE(review): ${yesterday} is assumed to be substituted as a valid date
--   value (e.g. a quoted literal) by the scheduler -- confirm.
insert overwrite table user_before_visit_date partition (visit_date = ${yesterday})
select
    prev.p_day as before_visit_date,
    count(1)   as remain_count
from user_visit_date cur
inner join user_visit_date prev
    on cur.user_id = prev.user_id
where cur.p_day = ${yesterday}
  -- look back at most 30 days, and never match the day against itself
  and prev.p_day >= date_sub(${yesterday}, 30)
  and prev.p_day < ${yesterday}
group by prev.p_day
;
Step 2. 按历史访问日期汇总（group by）各日留存数；最后一列 p_day 用作动态分区字段。
-- Step 2: pivot the per-(visit_date, before_visit_date) counts into one row
-- per historical visit date, with one column per retention lag.
-- A lag with no matching row yields 0.
-- Only historical dates within the last 30 days are emitted: older dates would
-- be aggregated from an incomplete set of visit_date partitions, so their
-- short-lag columns would wrongly come out as 0.
-- Aliases start with a digit, so they are backtick-quoted to parse reliably.
-- NOTE(review): ${yesterday} is assumed to be substituted as a valid date
--   value by the scheduler -- confirm.
select
    max(case when datediff(visit_date, before_visit_date) = 1  then remain_count else 0 end) as `1_day_remain_count`,  -- day-1 retained users
    max(case when datediff(visit_date, before_visit_date) = 2  then remain_count else 0 end) as `2_day_remain_count`,  -- day-2 retained users
    max(case when datediff(visit_date, before_visit_date) = 3  then remain_count else 0 end) as `3_day_remain_count`,  -- day-3 retained users
    max(case when datediff(visit_date, before_visit_date) = 7  then remain_count else 0 end) as `7_day_remain_count`,  -- day-7 retained users
    max(case when datediff(visit_date, before_visit_date) = 15 then remain_count else 0 end) as `15_day_remain_count`, -- day-15 retained users
    max(case when datediff(visit_date, before_visit_date) = 30 then remain_count else 0 end) as `30_day_remain_count`, -- day-30 retained users
    before_visit_date as p_day  -- kept last so it can serve as the dynamic partition column
from user_before_visit_date
where visit_date >= date_sub(${yesterday}, 30)
  and visit_date <= ${yesterday}
  -- restrict to historical dates whose retention can still change on this run
  and before_visit_date >= date_sub(${yesterday}, 30)
group by before_visit_date
;