INSERT OVERWRITE TABLE pv_users
SELECT
pv.pageid,
u.age
FROM page_view p
JOIN user u ON (pv.userid = u.userid)
JOIN newuser x ON (u.userid = x.userid);
场景:有一张 user 表,为卖家每天收到表,user_id,ds(日期)为 key,属性有主营类目,指标有交易金额,交易笔数。每天要取前10天的总收入,总笔数,和最近一天的主营类目。
常用方法
# 第一步:利用分析函数,取每个 user_id 最近一天的主营类目,存入临时表 t1。
CREATE TABLE t1 AS
SELECT
user_id,
substr(MAX(CONCAT(ds,cat),9) AS main_cat)
FROM users
WHERE ds=20120329 // 20120329 为日期列的值,实际代码中可以用函数表示出当天日期 GROUP BY user_id;
# 第二步:汇总 10 天的总交易金额,交易笔数,存入临时表 t2
CREATE TABLE t2 AS
SELECT
user_id,
sum(qty) AS qty,SUM(amt) AS amt
FROM users
WHERE ds BETWEEN 20120301 AND 20120329
GROUP BY user_id
# 第三步:关联 t1,t2,得到最终的结果。
SELECT
t1.user_id,
t1.main_cat,
t2.qty,t2.amt
FROM t1
JOIN t2 ON t1.user_id=t2.user_id
优化方法
SELECT
user_id,
substr(MAX(CONCAT(ds,cat)),9) AS main_cat,
SUM(qty),
SUM(amt)
FROM users
WHERE ds BETWEEN 20120301 AND 20120329
GROUP BY user_id
解决方法 1:user_id 为空的不参与关联,子查询过滤 null
SELECT
*
FROM log a
JOIN
bmw_users b
ON a.user_id IS NOT NULL AND a.user_id=b.user_id
UNION All
SELECT
*
FROM log a
WHERE a.user_id IS NULL
解决方法 2 如下所示:函数过滤 null
SELECT
*
FROM log a
LEFT OUTER JOIN
bmw_users b
ON
CASE WHEN a.user_id IS NULL THEN CONCAT('dp_hive',RAND()) ELSE a.user_id END =b.user_id; // 这句话写的好骚气啊,还有这种操作,我没有试过
解决方法:union all
SELECT
*
FROM effect a
JOIN
(
SELECT
auction_id AS auction_id
FROM auctions
UNION All
SELECT
auction_string_id AS auction_id
FROM auctions
)b
ON a.auction_id=b.auction_id
多表 union all 会优化成一个 job。比分别过滤数字 id,字符串 id 然后分别和商品表关联性能要好。
解决Hive对UNION ALL优化的短板
Hive 对 union all 的优化的特性:对 union all 优化只局限于非嵌套查询。
消灭子查询内的 group by
示例 1:子查询内有 group by
SELECT
*
FROM
(
SELECT
*
FROM t1
GROUP BY c1,c2,c3
UNION ALL
SELECT
*
FROM t2
GROUP BY c1,c2,c3
)t3
GROUP BY c1,c2,c3
从业务逻辑上说,子查询内的 GROUP BY 怎么都看显得多余(功能上的多余,除非有 COUNT(DISTINCT)),如果不是因为 Hive Bug 或者性能上的考量(曾经出现如果不执行子查询 GROUP BY,数据得不到正确的结果的 Hive Bug)。所以这个 Hive 按经验转换成如下所示:
SELECT
*
FROM
(
SELECT
*
FROM t1
UNION ALL
SELECT
*
FROM t2
)t3
GROUP BY c1,c2,c3
调优结果:经过测试,并未出现 union all 的 Hive Bug,数据是一致的。MapReduce 的 作业数由 3 减少到 1。
消灭子查询内的 COUNT(DISTINCT),MAX,MIN。
SELECT
*
FROM
(
SELECT
*
FROM t1
UNION ALL
SELECT
c1,
c2,
c3,
COUNT(DISTINCT c4)
FROM t2
GROUP BY c1,c2,c3
)t3
GROUP BY c1,c2,c3;
由于子查询里头有 COUNT(DISTINCT)操作,直接去 GROUP BY 将达不到业务目标。这时采用临时表消灭 COUNT(DISTINCT)作业不但能解决倾斜问题,还能有效减少 jobs。
INSERT t4 SELECT c1,c2,c3,c4 FROM t2 GROUP BY c1,c2,c3;
SELECT
c1,
c2,
c3,
SUM(income),
SUM(uv)
FROM
(
SELECT
c1,
c2,
c3,
income,
0 AS uv
FROM t1
UNION ALL
SELECT
c1,
c2,
c3,
0 AS income,
1 AS uv
FROM t2
)t3
GROUP BY c1,c2,c3;
GROUP BY替代COUNT(DISTINCT)达到优化效果
计算 uv 的时候,经常会用到 COUNT(DISTINCT),但在数据比较倾斜的时候 COUNT(DISTINCT) 会比较慢。这时可以尝试用 GROUP BY 改写代码计算 uv。
原有代码
ALTER TABLE s_dw_tanx_adzone_uv ADD PARTITION (ds=20120329)
SELECT
20120329 AS thedate,
adzoneid,
COUNT(DISTINCT acookie) AS uv
FROM s_ods_log_tanx_pv t
WHERE t.ds=20120329
GROUP BY adzoneid
#统计每日IP
CREATE TABLE ip_2014_12_29 AS
SELECT
COUNT(DISTINCT ip) AS IP
FROM logdfs
WHERE logdate='2014_12_29';
耗时:24.805 seconds
#统计每日IP(改造)
CREATE TABLE ip_2014_12_29 AS
SELECT
COUNT(1) AS IP
FROM
(
SELECT
DISTINCT ip
from logdfs
WHERE logdate='2014_12_29'
)tmp;
耗时:46.833 seconds