SQL练习——用户活跃区间记录表

区间记录表:4号

g01,2020-09-01,2020-09-01,2020-09-01 
g01,2020-09-01,2020-09-03,2020-09-03 
g02,2020-09-01,2020-09-01,2020-09-02 
g03,2020-09-01,2020-09-01,9999-12-31 
g04,2020-09-02,2020-09-02,2020-09-02 
g05,2020-09-02,2020-09-02,2020-09-03 
g06,2020-09-02,2020-09-02,9999-12-31 
g07,2020-09-03,2020-09-03,2020-09-03 
g08,2020-09-03,2020-09-03,9999-12-31 

日活表:5号

g01,2020-09-05
g03,2020-09-05
g11,2020-09-05
g12,2020-09-05

通过上面的表需要的结果
区间记录表:5号

g01,2020-09-01,2020-09-01,2020-09-01 
g01,2020-09-01,2020-09-03,2020-09-03 
g01,2020-09-01,2020-09-05,9999-12-31
g02,2020-09-01,2020-09-01,2020-09-02 
g03,2020-09-01,2020-09-01,9999-12-31 
g04,2020-09-02,2020-09-02,2020-09-02 
g05,2020-09-02,2020-09-02,2020-09-03 
g06,2020-09-02,2020-09-02,2020-09-04 
g07,2020-09-03,2020-09-03,2020-09-03 
g08,2020-09-03,2020-09-03,2020-09-04 
g11,2020-09-05,2020-09-05,9999-12-31 
g12,2020-09-05,2020-09-05,9999-12-31 

情况1:前日不在,今在,(老记录要保留,新添一行)
情况2:前日不在,今不在(保留原纪录)
情况3:前日在,今不在(修改此人的最后区间)
情况4:前日在,今在(保留原纪录)
情况5:新用户,(添加记录)

– 逻辑:

  1. 用区间记录表的T-1日 FULL JOIN 日活T日
    得到结果part1:
    历史记录保留
    新增用户新增
    之前没封闭的区间今日封闭(今日没来)
  2. 从区间记录表的T-1日中过滤出所有昨天没活跃的人, JOIN T日的日
    得到结果part2
    这些人应该新增的行

– 建表:

-- Activity-range table: one row per (user, continuous activity range).
--   guid      : user identifier
--   first_dt  : date the user first appeared
--   rng_start : first day of the range
--   rng_end   : last day of the range; '9999-12-31' marks a range that is still open
-- One partition per day (dt); stored as Parquet.
CREATE TABLE dws.app_user_act_rng (
    guid      STRING,
    first_dt  STRING,
    rng_start STRING,
    rng_end   STRING
)
PARTITIONED BY (dt STRING)
STORED AS PARQUET
;

– ETL 计算

– 源表:dws.app_user_act_rng的T-1日分区, dws.app_trf_agr_user 流量用户聚合表
– 目标:dws.app_user_act_rng的T日分区

/*
准备测试数据和测试表
*/

-- Test copy of the activity-range table. Same columns as dws.app_user_act_rng,
-- but declared as comma-delimited text so a local CSV fixture can be loaded
-- into it directly.
CREATE TABLE dws.app_user_act_rng_test (
    guid      STRING,
    first_dt  STRING,
    rng_start STRING,
    rng_end   STRING
)
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
;

-- Load the local fixture file with the activity ranges as of 2020-09-04 into
-- the matching partition. INTO (without OVERWRITE) appends, so re-running this
-- statement duplicates the rows.
LOAD DATA LOCAL INPATH '/root/rng.4'
INTO TABLE dws.app_user_act_rng_test
PARTITION (dt = '2020-09-04')
;

4日的区间记录表
g01,2020-09-01,2020-09-01,2020-09-01
g01,2020-09-01,2020-09-03,2020-09-03
g02,2020-09-01,2020-09-01,2020-09-02
g03,2020-09-01,2020-09-01,9999-12-31
g04,2020-09-02,2020-09-02,2020-09-02
g05,2020-09-02,2020-09-02,2020-09-03
g06,2020-09-02,2020-09-02,9999-12-31
g07,2020-09-03,2020-09-03,2020-09-03
g08,2020-09-03,2020-09-03,9999-12-31

-- Test copy of the daily-active-user table: a single guid column, one
-- partition per day (dt), declared as comma-delimited text so a local fixture
-- file can be loaded into it directly.
CREATE TABLE dws.app_trf_agr_user_test (
    guid STRING
)
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
;

-- Load the local fixture file with the users active on 2020-09-05 into the
-- matching partition. INTO (without OVERWRITE) appends, so re-running this
-- statement duplicates the rows.
LOAD DATA LOCAL INPATH '/root/dau.5'
INTO TABLE dws.app_trf_agr_user_test
PARTITION (dt = '2020-09-05')
;

日活表:5号
g01,2020-09-05
g03,2020-09-05
g11,2020-09-05
g12,2020-09-05

– 结果part1的计算如下:

-- Part 1, raw view: the T-1 (2020-09-04) activity ranges FULL JOINed with the
-- T (2020-09-05) active users, before any column is derived.
--
-- Each side is restricted to its partition in a subquery BEFORE the join.
-- A partition predicate placed in the ON clause of a FULL JOIN only decides
-- which rows match; rows from every other partition would still be returned
-- as unmatched rows.
SELECT
  a.guid,
  a.first_dt,
  a.rng_start,
  a.rng_end,
  a.dt,
  b.guid,
  b.dt
FROM
  (
    SELECT guid, first_dt, rng_start, rng_end, dt
    FROM dws.app_user_act_rng_test
    WHERE dt = '2020-09-04'
  ) a
FULL JOIN
  (
    SELECT guid, dt
    FROM dws.app_trf_agr_user_test
    WHERE dt = '2020-09-05'
  ) b
ON a.guid = b.guid
;

±--------±------------±-------------±------------±--------±------------+
| a.guid | a.first_dt | a.rng_start | a.rng_end | b.guid | b.dt |
±--------±------------±-------------±------------±--------±------------+
| g01 | 2020-09-01 | 2020-09-01 | 2020-09-01 | g01 | 2020-09-05 |
| g01 | 2020-09-01 | 2020-09-03 | 2020-09-03 | g01 | 2020-09-05 |
| g02 | 2020-09-01 | 2020-09-01 | 2020-09-02 | NULL | NULL |
| g03 | 2020-09-01 | 2020-09-01 | 9999-12-31 | g03 | 2020-09-05 |
| g04 | 2020-09-02 | 2020-09-02 | 2020-09-02 | NULL | NULL |
| g05 | 2020-09-02 | 2020-09-02 | 2020-09-03 | NULL | NULL |
| g06 | 2020-09-02 | 2020-09-02 | 9999-12-31 | NULL | NULL |
| g07 | 2020-09-03 | 2020-09-03 | 2020-09-03 | NULL | NULL |
| g08 | 2020-09-03 | 2020-09-03 | 9999-12-31 | NULL | NULL |
| NULL | NULL | NULL | NULL | g11 | 2020-09-05 |
| NULL | NULL | NULL | NULL | g12 | 2020-09-05 |
±--------±------------±-------------±------------±--------±------------+

-- Part 1: carry forward every T-1 (2020-09-04) range row, close the ranges
-- that were still open for users who are not active on T (2020-09-05), and
-- add one open range for each user who appears for the first time on T.
--
-- Each side is restricted to its partition in a subquery BEFORE the join.
-- A partition predicate placed in the ON clause of a FULL JOIN only decides
-- which rows match; rows from every other partition would still be returned
-- as unmatched rows, and their open ranges would be wrongly closed with
-- their own dt.
SELECT
  COALESCE(a.guid, b.guid)    AS guid,
  -- a.* is NULL only for brand-new users: first seen and range start are both T
  COALESCE(a.first_dt, b.dt)  AS first_dt,
  COALESCE(a.rng_start, b.dt) AS rng_start,
  CASE
    -- open range, user absent today: the range ended on T-1 (a.dt)
    WHEN a.rng_end = '9999-12-31' AND b.guid IS NULL THEN a.dt
    -- no history row at all: new user, range stays open
    WHEN a.rng_end IS NULL THEN '9999-12-31'
    -- already-closed range, or open range of a user active today: unchanged
    ELSE a.rng_end
  END AS rng_end
FROM
  (
    SELECT guid, first_dt, rng_start, rng_end, dt
    FROM dws.app_user_act_rng_test
    WHERE dt = '2020-09-04'
  ) a
FULL JOIN
  (
    SELECT guid, dt
    FROM dws.app_trf_agr_user_test
    WHERE dt = '2020-09-05'
  ) b
ON a.guid = b.guid
;

±------±------------±------------±------------+
| guid | first_dt | rng_start | rng_end |
±------±------------±------------±------------+
| g01 | 2020-09-01 | 2020-09-03 | 2020-09-03 |
| g01 | 2020-09-01 | 2020-09-01 | 2020-09-01 |
| g02 | 2020-09-01 | 2020-09-01 | 2020-09-02 |
| g03 | 2020-09-01 | 2020-09-01 | 9999-12-31 |
| g04 | 2020-09-02 | 2020-09-02 | 2020-09-02 |
| g05 | 2020-09-02 | 2020-09-02 | 2020-09-03 |
| g06 | 2020-09-02 | 2020-09-02 | 2020-09-04 |
| g07 | 2020-09-03 | 2020-09-03 | 2020-09-03 |
| g08 | 2020-09-03 | 2020-09-03 | 2020-09-04 |
| g11 | 2020-09-05 | 2020-09-05 | 9999-12-31 |
| g12 | 2020-09-05 | 2020-09-05 | 9999-12-31 |
±------±------------±------------±------------+

– 结果part2的计算如下:

-- Part 2: returning users. Anyone whose latest range in the T-1 (2020-09-04)
-- partition is already closed and who is active on T (2020-09-05) gets a
-- new open range starting on T.
SELECT
  a.guid       AS guid,
  a.first_dt   AS first_dt,
  b.dt         AS rng_start,
  '9999-12-31' AS rng_end
FROM
  (
    -- One row per user. MAX(rng_end) is the end of the user's latest range;
    -- it equals the '9999-12-31' sentinel only while that range is still open.
    SELECT
      guid,
      first_dt
    FROM dws.app_user_act_rng_test
    WHERE dt = '2020-09-04'
    GROUP BY guid, first_dt
    HAVING MAX(rng_end) <> '9999-12-31'
  ) a
INNER JOIN
  dws.app_trf_agr_user_test b
ON a.guid = b.guid AND b.dt = '2020-09-05'
;

±------±------------±------------±------------+
| guid | first_dt | rng_start | rng_end |
±------±------------±------------±------------+
| g01 | 2020-09-01 | 2020-09-05 | 9999-12-31 |
±------±------------±------------±------------+

– 最终整合完整语句:

-- Full statement: builds the T (2020-09-05) content of the activity-range
-- table from the T-1 (2020-09-04) partition and the T active users.
--
-- Branch 1 carries forward every T-1 row, closes open ranges of users who are
-- absent on T, and adds an open range for brand-new users.
-- Branch 2 adds a new open range for returning users (latest range already
-- closed, active again on T).
-- The two branches produce disjoint rows, so UNION ALL is sufficient.

-- Branch 1.
-- Each side is restricted to its partition in a subquery BEFORE the join.
-- A partition predicate placed in the ON clause of a FULL JOIN only decides
-- which rows match; rows from every other partition would still be returned
-- as unmatched rows, and their open ranges would be wrongly closed with
-- their own dt.
SELECT
  COALESCE(a.guid, b.guid)    AS guid,
  -- a.* is NULL only for brand-new users: first seen and range start are both T
  COALESCE(a.first_dt, b.dt)  AS first_dt,
  COALESCE(a.rng_start, b.dt) AS rng_start,
  CASE
    -- open range, user absent today: the range ended on T-1 (a.dt)
    WHEN a.rng_end = '9999-12-31' AND b.guid IS NULL THEN a.dt
    -- no history row at all: new user, range stays open
    WHEN a.rng_end IS NULL THEN '9999-12-31'
    -- already-closed range, or open range of a user active today: unchanged
    ELSE a.rng_end
  END AS rng_end
FROM
  (
    SELECT guid, first_dt, rng_start, rng_end, dt
    FROM dws.app_user_act_rng_test
    WHERE dt = '2020-09-04'
  ) a
FULL JOIN
  (
    SELECT guid, dt
    FROM dws.app_trf_agr_user_test
    WHERE dt = '2020-09-05'
  ) b
ON a.guid = b.guid

UNION ALL

-- Branch 2.
SELECT
  a.guid       AS guid,
  a.first_dt   AS first_dt,
  b.dt         AS rng_start,
  '9999-12-31' AS rng_end
FROM
  (
    -- One row per user. MAX(rng_end) is the end of the user's latest range;
    -- it equals the '9999-12-31' sentinel only while that range is still open.
    SELECT
      guid,
      first_dt
    FROM dws.app_user_act_rng_test
    WHERE dt = '2020-09-04'
    GROUP BY guid, first_dt
    HAVING MAX(rng_end) <> '9999-12-31'
  ) a
INNER JOIN
  dws.app_trf_agr_user_test b
ON a.guid = b.guid AND b.dt = '2020-09-05'
;

±----------±--------------±---------------±-------------+
| _u1.guid | _u1.first_dt | _u1.rng_start | _u1.rng_end |
±----------±--------------±---------------±-------------+
| g01 | 2020-09-01 | 2020-09-03 | 2020-09-03 |
| g01 | 2020-09-01 | 2020-09-01 | 2020-09-01 |
| g02 | 2020-09-01 | 2020-09-01 | 2020-09-02 |
| g03 | 2020-09-01 | 2020-09-01 | 9999-12-31 |
| g04 | 2020-09-02 | 2020-09-02 | 2020-09-02 |
| g05 | 2020-09-02 | 2020-09-02 | 2020-09-03 |
| g06 | 2020-09-02 | 2020-09-02 | 2020-09-04 |
| g07 | 2020-09-03 | 2020-09-03 | 2020-09-03 |
| g08 | 2020-09-03 | 2020-09-03 | 2020-09-04 |
| g11 | 2020-09-05 | 2020-09-05 | 9999-12-31 |
| g12 | 2020-09-05 | 2020-09-05 | 9999-12-31 |
| g01 | 2020-09-01 | 2020-09-05 | 9999-12-31 |
±----------±--------------±---------------±-------------+

猜你喜欢

转载自blog.csdn.net/weixin_43648241/article/details/108882884