- full join(横向 join):不能使用 map join,必须走 shuffle
- row_number() over (partition by 主键 order by $flag desc) rank ... where rank = 1:同样要走 shuffle
-- Merge the base table with its delta and keep exactly one row per id,
-- preferring the delta row when an id exists in both sources.
-- b_flag_i tags the source of each row (0 = base table, 1 = delta); ordering
-- by it descending places the delta row first inside each id partition.
-- NOTE(review): if an id can occur more than once within a single source, the
-- rows tie on b_flag_i and row_number() picks one arbitrarily -- confirm id is
-- unique per source or add a tiebreaker column to the order by.
select
    id,
    order_datekey,
    f_procurement_order
from
    (
        select
            id,
            order_datekey,
            f_procurement_order,
            -- Aliased as rn rather than rank to avoid clashing with the
            -- built-in rank() window function name.
            row_number() over (
                partition by id
                order by b_flag_i desc
            ) as rn
        from
            (
                -- Base rows created or updated at/after the cutoff.
                -- 1479916800 is presumably a Unix-epoch-seconds cutoff
                -- (2016-11-23 16:00:00 UTC) -- TODO confirm the unit of c_t / u_t.
                select
                    id,
                    order_datekey,
                    f_procurement_order,
                    0 as b_flag_i
                from
                    ods_pms_procurement_order_item_hm
                where
                    c_t >= 1479916800
                    or u_t >= 1479916800
                union all
                -- All rows from the delta table.
                select
                    id,
                    order_datekey,
                    f_procurement_order,
                    1 as b_flag_i
                from
                    ods_pms_procurement_order_item_hm_delta_64124FEADBFA9720
            ) t
    ) st
where
    rn = 1;
- 差集 + 并集方式:效率最高,但前提是增量数据较少,否则同样要走 shuffle
-- Merge template: rewrite the target table as (old rows NOT matched by the
-- delta) UNION ALL (every delta row). $target.table, $delta, $stream.format
-- and $stream.unique_keys are placeholders filled in before execution.
-- The aliases old/new must stay as-is: the join predicate substituted for
-- $stream.unique_keys is expected to reference them (e.g. old.id = new.id).
-- NOTE(review): LEFT ANTI JOIN is Spark SQL syntax; confirm the target engine
-- supports it before running this on plain Hive.
SET hive.mapred.mode=nonstrict;
INSERT OVERWRITE TABLE $target.table
-- Difference: existing rows whose key does not appear in the delta.
SELECT
    $stream.format
FROM
    $target.table old
    LEFT ANTI JOIN ($delta) new ON $stream.unique_keys
UNION ALL
-- Union: all delta rows (new and updated records). Selecting from the target
-- table here instead would duplicate every old row and never apply the delta.
SELECT
    $stream.format
FROM
    ($delta) new;
# Build an equality join predicate between two table aliases from a
# comma-separated list of key columns, e.g.
#   'id,name' -> "old.id = new.id AND old.name = new.name"
# Presumably this is the text substituted for $stream.unique_keys in the merge
# template -- confirm against the caller.
fields = 'id,name'
new = 'new'
old = 'old'
and_str = ' AND '
cmd = []
for field in fields.split(','):
    # Tolerate 'id, name' style input and stray/trailing commas.
    field = field.strip()
    if not field:
        continue
    # Named `condition` rather than `str` so the builtin is not shadowed.
    condition = old + '.' + field + ' = ' + new + '.' + field
    cmd.append(condition)
# Parenthesised single-argument form prints identically on Python 2 and 3.
print(and_str.join(cmd))