- 1.写出下面的结果
- 2. 查询每门课都大于80分的同学的姓名
- 3.同时最大在线人数问题
- 问题:
- SQL解答:
- 4.打折日期交叉问题
- 问题:
- SQL解答:
- 第一种方式:
- 第二种方式:
- 5.奖金瓜分问题(拼多多)
- 问题:
- SQL解答:
- 6.找出使用相同ip的用户
- 问题:
- SQL解答:
- 7.找出恶意购买用户
- 问题:
- SQL解答:
- 8.互相关注的人
- 问题:
- SQL解答:
- 9.间隔连续问题(连续的升级版)
- 问题:
- SQL解答:
- 10.有效值追溯
- 问题:
- SQL解答:
- 补充last_value函数及first_value函数
- 11.累计占比
- 问题:
- SQL解答:
- 12.访问会话切割
- 问题:
- SQL解答:
- 13.计算部门的平均工资
- 问题:
- SQL解答:
- 14.分组topN
- 问题:
- SQL解答:
- 15.每年的在校人数
- 问题:
- SQL解答:
- 16.sql实现分钟级的趋势图
- 问题
- sql解答
- 17.补全缺失日的月销售累计
- 问题:
- SQL解决:
1.写出下面的结果
select * from (
select 1 as num
union all
select 2 as num
union all
select 3 as num
) t1 left join
(
select 1 as num
union all
select 2 as num
)t2 on t1.num=t2.num and t1.num=2
答案:结果为
t1 | t2 |
---|---|
1 | NULL |
2 | 2 |
3 | NULL |
2. 查询每门课都大于80分的同学的姓名
name | course | grad |
---|---|---|
张三 | 语文 | 88 |
张三 | 数学 | 99 |
张三 | 英语 | 88 |
李四 | 语文 | 70 |
李四 | 数学 | 90 |
王五 | 语文 | 88 |
王五 | 英语 | 90 |
答案:
select distinct name from table where
name not in (select distinct name from table where grad<=80)
3.同时最大在线人数问题
问题:
问题:如下为某直播平台各主播的开播及关播时间明细数据,现在需要计算出该平台最高峰期同时在线的主播人数。
user_id start_date end_date
1001 2021-06-14 12:12:12 2021-06-14 18:12:12
1003 2021-06-14 13:12:12 2021-06-14 16:12:12
1004 2021-06-14 13:15:12 2021-06-14 20:12:12
1002 2021-06-14 15:12:12 2021-06-14 16:12:12
1005 2021-06-14 15:18:12 2021-06-14 20:12:12
1001 2021-06-14 20:12:12 2021-06-14 23:12:12
1006 2021-06-14 21:12:12 2021-06-14 23:15:12
1007 2021-06-14 22:12:12 2021-06-14 23:10:12
SQL解答:
这是非常经典的一个面试题,不管大厂小厂都有问到过。解题思路也比较固定:就是用1代表开播(此时用开播时间),-1代表关播(此时用关播时间),可以理解1代表主播开播加入增1,-1代表主播关播离开减1,然后开窗可以计算出到每个时间点时有多少主播同时在线,最后求最大值即可。
with tmp as
(
select 1001 as user_id, '2021-06-14 12:12:12' as start_date , '2021-06-14 18:12:12' as end_date
union all
select 1003 as user_id, '2021-06-14 13:12:12' as start_date , '2021-06-14 16:12:12' as end_date
union all
select 1004 as user_id, '2021-06-14 13:15:12' as start_date , '2021-06-14 20:12:12' as end_date
union all
select 1002 as user_id, '2021-06-14 15:12:12' as start_date , '2021-06-14 16:12:12' as end_date
union all
select 1005 as user_id, '2021-06-14 15:18:12' as start_date , '2021-06-14 20:12:12' as end_date
union all
select 1001 as user_id, '2021-06-14 20:12:12' as start_date , '2021-06-14 23:12:12' as end_date
union all
select 1006 as user_id, '2021-06-14 21:12:12' as start_date , '2021-06-14 23:15:12' as end_date
union all
select 1007 as user_id, '2021-06-14 22:12:12' as start_date , '2021-06-14 23:10:12' as end_date
)
select
max(online_nums) as max_online_nums
from
(
select
user_id
,dt
,sum(flag) over(order by dt) as online_nums
from
(
select
user_id
,start_date as dt
,1 as flag --开播记为1
from tmp
union all
select
user_id
,end_date as dt
,-1 as flag --关播记为-1
from tmp
)t1
)t1
;
4.打折日期交叉问题
问题:
如下为某平台的商品促销数据,字段含义分别为品牌名称、打折开始日期、打折结束日期,现在要计算每个品牌的打折销售天数(注意其中的交叉日期)。比如vivo的打折销售天数就为17天。
brand start_date end_date
xiaomi 2021-06-05 2021-06-09
xiaomi 2021-06-11 2021-06-21
vivo 2021-06-05 2021-06-15
vivo 2021-06-09 2021-06-21
honor 2021-06-05 2021-06-21
honor 2021-06-09 2021-06-15
redmi 2021-06-17 2021-06-26
huawei 2021-06-05 2021-06-26
huawei 2021-06-09 2021-06-15
huawei 2021-06-17 2021-06-21
SQL解答:
第一种方式:
根据每个品牌的促销开始时间和结束时间可以得到品牌每天促销的明细数据,然后,按品牌分组,日期去重就可以得到每个品牌打折销售天数。但此种方式适合数据量不大的情况,因为该方法会让数据膨胀的很厉害。
with temp as (
select 'xiaomi' as brand ,'2021-06-05' as start_date,'2021-06-09' as end_date
union all
select 'xiaomi' as brand ,'2021-06-11' as start_date,'2021-06-21' as end_date
union all
select 'vivo' as brand ,'2021-06-05' as start_date,'2021-06-15' as end_date
union all
select 'vivo' as brand ,'2021-06-09' as start_date,'2021-06-21' as end_date
union all
select 'honor' as brand ,'2021-06-05' as start_date,'2021-06-21' as end_date
union all
select 'honor' as brand ,'2021-06-09' as start_date,'2021-06-15' as end_date
union all
select 'honor' as brand ,'2021-06-17' as start_date,'2021-06-26' as end_date
union all
select 'huawei' as brand ,'2021-06-05' as start_date,'2021-06-26' as end_date
union all
select 'huawei' as brand ,'2021-06-09' as start_date,'2021-06-15' as end_date
union all
select 'huawei' as brand ,'2021-06-17' as start_date,'2021-06-21' as end_date
)
select
brand
,start_date
,end_date
,date_add(start_date,tmp.col_idx) as dt,
col_idx,
col_val
from temp
lateral VIEW posexplode(split(repeat("#,",datediff(date(end_date), date(start_date))),'#')) tmp AS col_idx,col_val
;
备注:补充repeat函数
select repeat("#,",datediff('2023-12-18','2023-12-01'))
#,#,#,#,#,#,#,#,#,#,#,#,#,#,#,#,#,
select split(repeat("#,",datediff('2023-12-18','2023-12-01')),'#')
["",",",",",",",",",",",",",",",",",",",",",",",",",",",",",",",",",","]
第二种方式:
第二种方式规避数据膨胀的情况,经过适当的处理,消除日期交叉的情况
with temp as (
select 'xiaomi' as brand ,'2021-06-05' as start_date,'2021-06-09' as end_date
union all
select 'xiaomi' as brand ,'2021-06-11' as start_date,'2021-06-21' as end_date
union all
select 'vivo' as brand ,'2021-06-05' as start_date,'2021-06-15' as end_date
union all
select 'vivo' as brand ,'2021-06-09' as start_date,'2021-06-21' as end_date
union all
select 'honor' as brand ,'2021-06-05' as start_date,'2021-06-21' as end_date
union all
select 'honor' as brand ,'2021-06-09' as start_date,'2021-06-15' as end_date
union all
select 'honor' as brand ,'2021-06-17' as start_date,'2021-06-26' as end_date
union all
select 'huawei' as brand ,'2021-06-05' as start_date,'2021-06-26' as end_date
union all
select 'huawei' as brand ,'2021-06-09' as start_date,'2021-06-15' as end_date
union all
select 'huawei' as brand ,'2021-06-17' as start_date,'2021-06-21' as end_date
)
select
brand
,sum(datediff(date(end_date),date(start_date))+1)
from
(
select
brand
,case
when start_date<=max_date then date_add(date(max_date),1)
else start_date end
as start_date
,end_date
from(
select
brand
,start_date
,end_date
,max(end_date) over(partition by brand order by start_date rows between UNBOUNDED PRECEDING and 1 PRECEDING ) as max_date --获取同一品牌内按开始日期排序后,取第一行到前一行的最大结束时间
from temp
)t1
)t1
where end_date>=start_date
group by brand
;
补充:rows 和range的区别
在 SQL 中,rows 和 range 是两种不同的窗口帧(window frame)类型,它们定义了窗口函数的计算范围。
rows 窗口帧是基于行的,它使用一组相对于当前行的行号来定义窗口函数的计算范围。rows 窗口帧可以指定 UNBOUNDED PRECEDING、n PRECEDING、CURRENT ROW、n FOLLOWING 和 UNBOUNDED FOLLOWING 五种窗口帧范围。
range 窗口帧是基于值的,它使用一组相对于当前行的数值范围来定义窗口函数的计算范围。range 窗口帧可以指定 UNBOUNDED PRECEDING、n PRECEDING、CURRENT ROW、n FOLLOWING 和 UNBOUNDED FOLLOWING 五种窗口帧范围。
注释:
PRECEDING:往前
FOLLOWING:往后
CURRENT ROW:当前行
UNBOUNDED:起点
UNBOUNDED PRECEDING 表示从前面的起点
UNBOUNDED FOLLOWING:表示到后面的终点
一般来说,rows 和 range 窗口帧都可以用于定义窗口函数的计算范围,但是它们有一些不同的特点:rows 窗口帧是基于行的,它使用一组相对于当前行的行号来定义窗口函数的计算范围。因此,rows 窗口帧适用于基于行号的计算,例如计算排名、移动平均等。range 窗口帧是基于值的,它使用一组相对于当前行的数值范围来定义窗口函数的计算范围。因此,range 窗口帧适用于基于数值范围的计算,例如计算累计和、百分比等。
一般情况下,rows 窗口帧比 range 窗口帧更常用,因为基于行号的计算更加常见。但是在某些特殊情况下,range 窗口帧也可以使用。
例如:当窗口函数的计算范围基于连续的数值范围时,可以使用 range 窗口帧。例如,计算累计和、计算百分比等。当窗口函数的计算范围包含重复的值时,可以使用 range 窗口帧来避免重复计算。例如,计算连续相同值的最大长度、计算某个值在窗口中的出现次数等。
需要注意的是,对于一些特殊的窗口函数,可能只能使用 rows 窗口帧,例如计算排名、计算移动平均等。因此,在使用 range 窗口帧时,需要根据具体的需求和窗口函数的特性选择合适的窗口帧类型。
5.奖金瓜分问题(拼多多)
问题:
在活动大促中,有玩游戏瓜分奖金环节。现有奖金池为10000元,代表奖金池中的初始额度。用户的分数信息如下:
user_id score
100 60
101 45
102 40
103 35
104 30
105 25
106 15
107 10
108 5
表中的数据代表每一个用户和其对应的得分,user_id和score都不会有重复值。瓜分奖金的规则如下:按照score从高到低依次瓜分,每个人都能分走当前奖金池里面剩余奖金的一半,当奖金池里面剩余的奖金少于250时(不含),则停止瓜分奖金。
现在需要查询出所有分到奖金的user_id和其对应的奖金。
SQL解答:
这是拼多多的一个面试题,需要先进行一点数学层面的分析,把整个瓜分逻辑捋清楚之后不难。这里给出一种思考逻辑:假设奖金池的初始总奖金为n,那么第一名分到的奖金为n/2,第二名分到奖金n/4,第三名分到的奖金为n/8,依次类推第x名分到的奖金为n/2^x,然后计算即可。
with temp as (
select '100' as user_id ,'60' as score
union all
select '101' as user_id ,'45' as score
union all
select '102' as user_id ,'40' as score
union all
select '103' as user_id ,'35' as score
union all
select '104' as user_id ,'25' as score
union all
select '105' as user_id ,'15' as score
union all
select '106' as user_id ,'10' as score
union all
select '107' as user_id ,'5' as score
union all
select '108' as user_id ,'90' as score
union all
select '109' as user_id ,'80' as score
)
select user_id,score,power(0.5,rn)*10000 from
(select user_id,score,row_number() over(order by score desc) as rn
from temp )t
where power(0.5,rn)*10000>=250
6.找出使用相同ip的用户
问题:
现在有一张用户登陆日志表,该表包括user_id,ip,log_time三个字段,现在需要找出共同使用ip数量超过3个(含)的所有用户对。比如下面的示例数据,101和102用户共同使用的ip为4个,101和103用户共同使用的ip为3个,102和103用户共同使用的ip为3个。
(101,'192.168.10.101','2022-05-10 11:00:00'),
(101,'192.168.10.101','2022-05-10 11:01:00'),
(101,'192.168.10.102','2022-05-10 11:02:00'),
(101,'192.168.10.103','2022-05-10 11:03:00'),
(101,'192.168.10.104','2022-05-10 11:04:00'),
(102,'192.168.10.101','2022-05-10 11:04:30'),
(102,'192.168.10.102','2022-05-10 11:05:00'),
(102,'192.168.10.103','2022-05-10 11:06:00'),
(102,'192.168.10.104','2022-05-10 11:07:00'),
(103,'192.168.10.102','2022-05-10 11:08:00'),
(103,'192.168.10.103','2022-05-10 11:08:00'),
(103,'192.168.10.104','2022-05-10 11:10:00'),
(104,'192.168.10.103','2022-05-10 11:11:00'),
(104,'192.168.10.104','2022-05-10 11:12:00'),
(105,'192.168.10.105','2022-05-10 11:13:00')
SQL解答:
问题的关键点是使用自连接,先按用户和ip去重之后进行自关联。因为如果公共使用ip达到3个及以上的话,那么同一个用户对至少会出现3条数据,筛选一下就行。
with user_login as (
select 101 as user_id ,'192.168.10.101' as ip ,'2022-05-10 11:00:00' as log_time
union all
select 101 as user_id ,'192.168.10.101' as ip ,'2022-05-10 11:01:00' as log_time
union all
select 101 as user_id ,'192.168.10.102' as ip ,'2022-05-10 11:02:00' as log_time
union all
select 101 as user_id ,'192.168.10.103' as ip ,'2022-05-10 11:03:00' as log_time
union all
select 101 as user_id ,'192.168.10.104' as ip ,'2022-05-10 11:04:00' as log_time
union all
select 102 as user_id ,'192.168.10.101' as ip ,'2022-05-10 11:04:30' as log_time
union all
select 102 as user_id ,'192.168.10.102' as ip ,'2022-05-10 11:05:00' as log_time
union all
select 102 as user_id ,'192.168.10.103' as ip ,'2022-05-10 11:06:00' as log_time
union all
select 102 as user_id ,'192.168.10.104' as ip ,'2022-05-10 11:07:00' as log_time
union all
select 103 as user_id ,'192.168.10.102' as ip ,'2022-05-10 11:08:00' as log_time
union all
select 103 as user_id ,'192.168.10.103' as ip ,'2022-05-10 11:08:00' as log_time
union all
select 103 as user_id ,'192.168.10.104' as ip ,'2022-05-10 11:10:00' as log_time
union all
select 104 as user_id ,'192.168.10.103' as ip ,'2022-05-10 11:11:00' as log_time
union all
select 104 as user_id ,'192.168.10.104' as ip ,'2022-05-10 11:12:00' as log_time
union all
select 105 as user_id ,'192.168.10.105' as ip ,'2022-05-10 11:13:00' as log_time
),
tmp as
(
select
user_id
,ip
from user_login --实际换成自己的表或上面的样例数据
group by user_id,ip --同一个ip同一用户可能多次登录,先去重
)
select
t1.user_id
,t2.user_id
,count(t1.ip) as ip_cnt
from tmp t1
inner join tmp t2
on t1.ip=t2.ip --通过ip自关联
where t1.user_id<t2.user_id --因为存在101对102,102对101的情况,保留一种即可
group by t1.user_id,t2.user_id
having ip_cnt>=3 --保留用户对ip数量超过3个的(含)
7.找出恶意购买用户
问题:
下面是某电商网站的订单数据,包括order_id,user_id,order_status和operate_time四个字段,我们需要找出所有恶意购买的用户。恶意购买的用户定义是:同一个用户,在任意半小时内(含),取消订单次数>=2次的就被视为恶意买家。比如该样例数据中c用户就是恶意买家。
order_id user_id order_status operate_time
1101 a 已支付 2023-01-01 10:00:00
1102 a 已取消 2023-01-01 10:10:00
1103 a 待支付 2023-01-01 10:20:00
1104 b 已取消 2023-01-01 10:30:00
1105 a 待确认 2023-01-01 10:50:00
1106 a 已取消 2023-01-01 11:00:00
1107 b 已取消 2023-01-01 11:40:00
1108 b 已取消 2023-01-01 11:50:00
1109 b 已支付 2023-01-01 12:00:00
1110 b 已取消 2023-01-01 12:11:00
1111 c 已取消 2023-01-01 12:20:00
1112 c 已取消 2023-01-01 12:30:00
1113 c 已取消 2023-01-01 12:55:00
1114 c 已取消 2023-01-01 13:00:00
SQL解答:
典型的滑动窗口的场景。Hive中也是有滑动窗口的功能的(按数据范围开窗,range between and )。针对这个例子,窗口大小就是半小时,然后按每条数据进行滑动,在窗口内判断该条数据对应的用户是否是恶意用户。
with temp as (
select 1101 as order_id ,'a' as user_id ,"已支付" as order_status, "2023-01-01 10:00:00" as operate_time
union all
select 1102 as order_id ,'a' as user_id ,"已取消" as order_status, "2023-01-01 10:10:00" as operate_time
union all
select 1103 as order_id ,'a' as user_id ,"待支付" as order_status, "2023-01-01 10:20:00" as operate_time
union all
select 1104 as order_id ,'b' as user_id ,"已取消" as order_status, "2023-01-01 10:30:00" as operate_time
union all
select 1105 as order_id ,'a' as user_id ,"待确认" as order_status, "2023-01-01 10:50:00" as operate_time
union all
select 1106 as order_id ,'a' as user_id ,"已取消" as order_status, "2023-01-01 11:00:00" as operate_time
union all
select 1107 as order_id ,'b' as user_id ,"已取消" as order_status, "2023-01-01 11:40:00" as operate_time
union all
select 1108 as order_id ,'b' as user_id ,"已取消" as order_status, "2023-01-01 11:50:00" as operate_time
union all
select 1109 as order_id ,'b' as user_id ,"已支付" as order_status, "2023-01-01 12:00:00" as operate_time
union all
select 1110 as order_id ,'b' as user_id ,"已取消" as order_status, "2023-01-01 12:11:00" as operate_time
union all
select 1111 as order_id ,'c' as user_id ,"已取消" as order_status, "2023-01-01 12:20:00" as operate_time
union all
select 1112 as order_id ,'c' as user_id ,"已取消" as order_status, "2023-01-01 12:30:00" as operate_time
union all
select 1113 as order_id ,'c' as user_id ,"已取消" as order_status, "2023-01-01 12:55:00" as operate_time
union all
select 1114 as order_id ,'c' as user_id ,"已取消" as order_status, "2023-01-01 13:00:00" as operate_time
)
select
distinct user_id
from
(select
user_id
,order_id
,count(order_id) over (partition by user_id order by operate_time range between 1800 preceding and current row) as cancel_order_cnt
--通过range between以当前行为锚点,圈定数据范围为operate_time为近30分钟内,然后算该范围内的取消订单数
from ( select
order_id
,user_id
,unix_timestamp(operate_time) as operate_time from temp ---由于range...between只能整数比较,这里先转换为秒 where order_status='已取消'
)t1
)t2
where cancel_order_cnt>=2
--结果
"user_id"
"b"
"c"
8.互相关注的人
问题:
现在有一张relation表,里面只有两个字段:from_user和to_user,代表关注关系从from指向to,即from_user关注了to_user。现在要找出互相关注的所有人。
from_user to_user
孙悟空 唐僧
唐僧 如来佛祖
唐僧 观音菩萨
观音菩萨 如来佛祖
唐僧 孙悟空
孙悟空 玉皇大帝
玉皇大帝 如来佛祖
如来佛祖 观音菩萨
如来佛祖 玉皇大帝
如来佛祖 唐僧
孙悟空 猪八戒
猪八戒 嫦娥
猪八戒 孙悟空
猪八戒 唐僧
猪八戒 沙僧
沙僧 猪八戒
沙僧 玉皇大帝
沙僧 孙悟空
沙僧 唐僧
SQL解答:
解答思路一:使用自关联即可,这种方式简单也最易理解。适合数据量不是很大的情况,因为会导致数据膨胀。
with tmp as
(
select '孙悟空' as from_user , '唐僧' as to_user
union all
select '唐僧' as from_user , '如来佛祖' as to_user
union all
select '唐僧' as from_user , '观音菩萨' as to_user
union all
select '观音菩萨' as from_user , '如来佛祖' as to_user
union all
select '唐僧' as from_user , '孙悟空' as to_user
union all
select '孙悟空' as from_user , '玉皇大帝' as to_user
union all
select '玉皇大帝' as from_user , '如来佛祖' as to_user
union all
select '如来佛祖' as from_user , '观音菩萨' as to_user
union all
select '如来佛祖' as from_user , '玉皇大帝' as to_user
union all
select '如来佛祖' as from_user , '唐僧' as to_user
union all
select '孙悟空' as from_user , '猪八戒' as to_user
union all
select '猪八戒' as from_user , '嫦娥' as to_user
union all
select '猪八戒' as from_user , '孙悟空' as to_user
union all
select '猪八戒' as from_user , '唐僧' as to_user
union all
select '猪八戒' as from_user , '沙僧' as to_user
union all
select '沙僧' as from_user , '猪八戒' as to_user
union all
select '沙僧' as from_user , '玉皇大帝' as to_user
union all
select '沙僧' as from_user , '孙悟空' as to_user
union all
select '沙僧' as from_user , '唐僧' as to_user
)
select
a.from_user,
a.to_user,
if(b.from_user is not null, 1, 0) as is_friend -- 1:互相关注
from tmp a
left join tmp b
on a.from_user=b.to_user and a.to_user=b.from_user where if(b.from_user is not null, 1, 0) =1
;
9.间隔连续问题(连续的升级版)
问题:
下面是某游戏公司记录的用户每日登录数据, 计算每个用户最大的连续登录天数,定义连续登录时可以间隔一天。举例:如果一个用户在 1,3,5,6,9 登录了游戏,则视为连续 6 天登录。
user_id dt
1001 2021-12-12
1002 2021-12-12
1001 2021-12-13
1001 2021-12-14
1001 2021-12-16
1002 2021-12-16
1001 2021-12-19
1002 2021-12-17
1001 2021-12-20
SQL解答:
这是个连续问题的升级版,当满足某种要求时我们也是算作连续的,所以不能使用传统的连续编号,然后做差值的解法了。核心思路解析如下
| 登录日期 | 第一步:上一个日期 | 第二步:判断登录日期与上一个日期差值是否在2之内 | 第三步:然后根据标记开窗做sum |
| ——– | —————— | ———————————————– | :—————————- |
| 1 | 1 | 0 | 0 |
| 3 | 1 | 0 | 0 |
| 5 | 3 | 0 | 0 |
| 6 | 5 | 0 | 0 |
| 9 | 6 | 1 | 1 |
| 11 | 9 | 0 | 1 |
这种解法是比较常见的,很多场景都可以这样使用。还有比如计算用户的会话数,当两次会话时间超过1分钟时就算做不同的会话,也可以这样做。
with temp as (
select '1001' as user_id , '2021-12-12' as dt
union all
select '1002' as user_id , '2021-12-12' as dt
union all
select '1001' as user_id , '2021-12-13' as dt
union all
select '1001' as user_id , '2021-12-14' as dt
union all
select '1001' as user_id , '2021-12-16' as dt
union all
select '1002' as user_id , '2021-12-16' as dt
union all
select '1001' as user_id , '2021-12-19' as dt
union all
select '1002' as user_id , '2021-12-17' as dt
union all
select '1001' as user_id , '2021-12-20' as dt
)
select
user_id
,max(diff) as max_login_days
from
( select
user_id
,user_group
,datediff(max(dt),min(dt))+1 as diff --拿到每个用户下,连续时间里面最大日期与最小日期的差值加1就得到来连续天数
from
(
select
user_id
,dt
-- 如果当前日期与上一个日期的差值在2之内,那么就给0,否则给1
,sum(if(datediff(dt,last_dt)<=2,0,1)) over(partition by user_id order by dt) as user_group
from
(
select
user_id
,dt
,lag(dt,1,dt) over(partition by user_id order by dt) as last_dt --根据user_id分组,拿到当前行的上一个日期,没有上一个就给自己本身的值
from temp
)t1
)t1
group by user_id ,user_group
)t1
group by user_id
;
或者
with temp as (
select '1001' as user_id , '2021-12-12' as dt
union all
select '1002' as user_id , '2021-12-12' as dt
union all
select '1001' as user_id , '2021-12-13' as dt
union all
select '1001' as user_id , '2021-12-14' as dt
union all
select '1001' as user_id , '2021-12-16' as dt
union all
select '1002' as user_id , '2021-12-16' as dt
union all
select '1001' as user_id , '2021-12-19' as dt
union all
select '1002' as user_id , '2021-12-17' as dt
union all
select '1001' as user_id , '2021-12-20' as dt
)
select
user_id
,max(diff) as max_login_days
from
( select
user_id
,user_group
,datediff(max(dt),min(dt))+1 as diff --拿到每个用户下,连续时间里面最大日期与最小日期的差值加1就得到来连续天数
from
(
select
user_id
,dt
-- 如果当前日期与上一个日期的差值在2之内,那么就给0,否则给1
,sum(if(datediff(dt,last_dt)<=2,0,1)) over(partition by user_id order by dt) as user_group
from
(
select user_id,dt,min(dt) over (partition by user_id order by dt rows between 1 preceding and current row ) as last_dt from temp
)t1
)t1
group by user_id ,user_group
)t1
group by user_id
;
补充:hive登录相关传送门
hive连续登录问题:hive连续登录问题
hive之连续登录问题(2):hive之连续登录问题(2)
Hive之连续登录问题(补充):Hive之连续登录问题(补充)
10.有效值追溯
问题:
现在有一张商品入库表,包括商品id、商品成本和入库日期3个字段,由于某些原因,导致部分商品的成本缺失(为0或者没有值都是缺失),这样不利于我们计算成本。所以现在要把缺失的商品进价补充完整,补充的依据是使用相同商品的最近一次有效成本作为当前商品的成本。比如2023-11-04号101商品的cost就需要用300.39填充。
product_id cost date
101 300.39 2023-11-01
102 500 2023-11-02
101 0 2023-11-03
101 2023-11-04
102 600 2023-11-04
102 2023-11-05
103 983 2023-11-06
123456789
SQL解答:
该场景核心考察的就是开窗函数(last_value)的熟练使用。在同一个窗口内取最后一条有效值作为当前行的值。题不难,但是是一种很好的解题思路。
with temp as
(
select 101 as product_id,300.39 as cost,'2023-11-01' as date
union all
select 102 as product_id,500 as cost,'2023-11-02' as date
union all
select 101 as product_id,0 as cost,'2023-11-03' as date
union all
select 101 as product_id,null as cost,'2023-11-04' as date
union all
select 102 as product_id,600 as cost,'2023-11-04' as date
union all
select 102 as product_id,null as cost,'2023-11-05' as date
union all
select 103 as product_id,983 as cost,'2023-11-06' as date
)
select
product_id
-- last_value第二个参数设置为true,表示取最后一个值时跳过null值。默认不写为false
,last_value(cost,true) over(partition by product_id order by date) as valid_last_value
,date
from(
select
product_id
,if(cost=0 or cost is null,null,cost) as cost --当cost为0或null时就给null值
,date
from temp
) t1
12345678910111213141516171819202122232425262728
补充last_value函数及first_value函数
1、last_value函数
在 Hive 中,LAST_VALUE() 是一种窗口函数,用于获取指定列的最后一个值。它的语法如下:
LAST_VALUE(expr,ignore_nulls) OVER (
[PARTITION BY partition_expression, ... ]
ORDER BY sort_expression [ASC|DESC], ...
[ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | range_frame]
)
12345
其中,expr 表示要获取最后一个值的列名或表达式,ignore_nulls 表示是否忽略 NULL 值,如果设置为 true,则 LAST_VALUE() 函数会忽略 NULL 值并返回最后一个非 NULL 值,否则返回 NULL。PARTITION BY 子句用于指定分区字段,ORDER BY 子句用于指定排序规则,ROWS 子句用于指定窗口范围。
SELECT content_id, create_time, value, LAST_VALUE(value, true) OVER (PARTITION BY content_id ORDER BY create_time) as last_value
FROM my_table;
12
2、first_value函数
在 Hive 中,FIRST_VALUE() 是一种窗口函数,用于获取指定列的第一个值。它的语法如下:
FIRST_VALUE(expr) OVER (
[PARTITION BY partition_expression, ... ]
ORDER BY sort_expression [ASC|DESC], ...
[ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | range_frame]
)
12345
其中,expr 表示要获取第一个值的列名或表达式,PARTITION BY 子句用于指定分区字段,ORDER BY 子句用于指定排序规则,ROWS 子句用于指定窗口范围。FIRST_VALUE() 函数的括号中除了参数 expr 之外,没有其他可选参数。FIRST_VALUE() 函数的主要作用是返回指定列的第一个值,并且不支持忽略 NULL 值。
SELECT content_id, create_time, value, FIRST_VALUE(value) OVER (PARTITION BY content_id ORDER BY create_time) as first_value
FROM my_table;
12
11.累计占比
问题:
现在有一张每个年份的每个部门的收入表。现在需要算每个部门的收入占同类型部门的收入的占比和当年整个公司的收入占比。要求一条SQL计算出来。比如研发部和产品部属于同类型的,都是产研;财务部和人事部都属于职能。
year dept income
2023 研发部 5000
2023 产品部 6000
2023 财务部 7000
2023 人事部 8000
2022 研发部 10000
2022 产品部 8000
2022 财务部 9000
2022 人事部 8000
SQL解答:
考察sum() over 开窗函数的使用。
with temp as
(
select '2023' as year,"研发部" as dept,5000 as income
union all
select '2023' as year,"产品部" as dept,6000 as income
union all
select '2023' as year,"财务部" as dept,7000 as income
union all
select '2023' as year,"人事部" as dept,8000 as income
union all
select '2022' as year,"研发部" as dept,10000 as income
union all
select '2022' as year,"产品部" as dept,8000 as income
union all
select '2022' as year,"财务部" as dept,9000 as income
union all
select '2022' as year,"人事部" as dept,8000 as income
)
select
year
,dept
,income
,round(income/similar_dept_income,2) as similar_dept_income_rate
,round(income/year_income,2) as year_income_rate
from
(
select
year
,dept
,income
,sum(income) over(partition by year,case
when dept in("研发部","产品部") then "产研"
when dept in("财务部","人事部") then "职能"
end
) as similar_dept_income --同类型部门收入
,sum(income) over(partition by year) as year_income
from temp
) t1
;
----结果为:
序号 year dept income similar_dept_income_rate year_income_rate
1 2022 研发部 10000 0.56 0.29
2 2022 产品部 8000 0.44 0.23
3 2022 财务部 9000 0.53 0.26
4 2022 人事部 8000 0.47 0.23
5 2023 研发部 5000 0.45 0.19
6 2023 产品部 6000 0.55 0.23
7 2023 财务部 7000 0.47 0.27
8 2023 人事部 8000 0.53 0.31
12.访问会话切割
问题:
如下为某电商公司用户访问网站的数据,包括用户id和访问时间两个字段。现有如下规则:如果某个用户的连续的访问记录时间间隔小于60秒,则属于同一个会话,现在需要计算每个用户有多少个会话。比如A用户在第1秒,60秒,200秒,230秒有三次访问记录,则该用户有2个会话,其中第一个会话是第1秒和第60秒的记录,第二个会话是第200秒和230秒的记录。
user_id ts
1001 16920000000
1001 16920000050
1002 16920000065
1002 16920000080
1001 16920000150
1002 16920000160
SQL解答:
先按用户分组、时间排序后取每行数据的前一行的时间,然后判断当前行的时间与前一行时间的差值,看是否在给定的范围内,然后再做开窗累加就可以得到每个用户不同的会话编号了。思路如下图:
user_id | ts | 判断与上一行差值是否小于60 | 开窗累加当做会话编号 |
---|---|---|---|
A | 1 | 0 | 0 |
A | 60 | 0 | 0 |
A | 200 | 1 | 1 |
A | 230 | 0 | 1 |
with tmp as (
select 1001 as user_id,16920000000 as ts
union all
select 1001 as user_id,16920000050 as ts
union all
select 1002 as user_id,16920000065 as ts
union all
select 1002 as user_id,16920000080 as ts
union all
select 1001 as user_id,16920000150 as ts
union all
select 1002 as user_id,16920000160 as ts
)
select
user_id
,count(distinct user_group) as user_group_cnt
from
(
select
user_id
,ts
-- 开窗做累加
,sum(flag) over(partition by user_id order by ts) as user_group
from
(
select
user_id
,ts
-- 判断当前行的时间与上一行的差值
,if(ts-last_ts<60,0,1) as flag
from
(
select
user_id
,ts
-- 取当前行的上一个时间,没有上一行就给自身的时间
,lag(ts,1,ts) over(partition by user_id order by ts) as last_ts
from tmp
)t1
)t1
)t1
group by user_id;
13.计算部门的平均工资
问题:
现在要计算每个部门的平均工资(工资和/员工数),但是要去掉部门的最高工资和最低工资(如果一个部门最高或最低工资有并列的,去掉一个最高的和一个最低的)后,计算部门的平均工资。
101 ,"研发部" ,50000
102 ,"研发部" ,50000
103 ,"研发部" ,10000
104 ,"研发部" ,20000
105 ,"研发部" ,30000
106 ,"市场部" ,20000
107 ,"市场部" ,30000
108 ,"产品部" ,20000
109 ,"产品部" ,30000
110 ,"产品部" ,25000
111 ,"市场部" ,14000
112 ,"市场部" ,8000
113 ,"产品部" ,20000
1234567891011121314
SQL解答:
使用开窗函数找出该部门的最高工资和最低工资,并按最高到最低和最低到最高分别给排名,最后排除掉满足工资要求且排名在最前面的员工即可。
with temp as
(
select 101 as user_id,"研发部" as dept,50000 as salary
union all
select 102 as user_id,"研发部" as dept,50000 as salary
union all
select 103 as user_id,"研发部" as dept,10000 as salary
union all
select 104 as user_id,"研发部" as dept,20000 as salary
union all
select 105 as user_id,"研发部" as dept,30000 as salary
union all
select 106 as user_id,"市场部" as dept,20000 as salary
union all
select 107 as user_id,"市场部" as dept,30000 as salary
union all
select 108 as user_id,"产品部" as dept,20000 as salary
union all
select 109 as user_id,"产品部" as dept,30000 as salary
union all
select 110 as user_id,"产品部" as dept,25000 as salary
union all
select 111 as user_id,"市场部" as dept,14000 as salary
union all
select 112 as user_id,"市场部" as dept,8000 as salary
union all
select 113 as user_id,"产品部" as dept,20000 as salary
)
select
dept
,dept_total_salary
,dept_employee_cnt
,round(dept_total_salary/dept_employee_cnt,2) as dept_avg_salary
from
(
select
dept
,sum(if((salary=min_salary and min_salary_rn=1) or (salary=max_salary and max_salary_rn=1),0,salary)) as dept_total_salary --去掉一个最高工资和一个最低工资后剩下的部门总工资
,sum(if((salary=min_salary and min_salary_rn=1) or (salary=max_salary and max_salary_rn=1),0,1)) as dept_employee_cnt --去掉一个最高分和一个最低分后的剩余人数
from
(
select
user_id
,dept
,salary
,min(salary) over(partition by dept) as min_salary --部门的最高工资
,max(salary) over(partition by dept) as max_salary --部门的最低工资
,row_number() over(partition by dept order by salary desc) as max_salary_rn --为了使用编号去掉其中一个最高分
,row_number() over(partition by dept order by salary) as min_salary_rn --为了使用编号去掉其中一个最低分
from temp
)t1
group by dept
)t1
;
----结果为:
dept dept_total_salary dept_employee_cnt dept_avg_salary
产品部 45000 2 22500.0
市场部 34000 2 17000.0
研发部 100000 3 33333.33
14.分组topN
问题:
下面是某个班级的成绩表,需要筛选出每个科目前2名的学生信息。如果分数一样,名次是并列的,后面的同学名次就不连续。比如有2个同学是第一名,那么下一个同学的名次就是第3名,呈现1,1,3的名次排列。
stu_id stu_name subject score
1001 张三 语文 90
1001 张三 数学 80
1001 张三 英语 70
1002 李四 语文 90
1002 李四 数学 75
1002 李四 英语 80
1003 王五 语文 80
1003 王五 数学 70
1003 王五 英语 60
1234567891011
SQL解答:
非常典型的分组topN的问题,是面试经常被问到的。直接使用开窗函数排名即可,注意下row_number、rank、dense_rank三个开窗函数不同场景下的使用。
with temp as
(
select 1001 as stu_id,'张三' as stu_name,'语文' as subject,90 as score
union all
select 1001 as stu_id,'张三' as stu_name,'数学' as subject,80 as score
union all
select 1001 as stu_id,'张三' as stu_name,'英语' as subject,70 as score
union all
select 1002 as stu_id,'李四' as stu_name,'语文' as subject,90 as score
union all
select 1002 as stu_id,'李四' as stu_name,'数学' as subject,75 as score
union all
select 1002 as stu_id,'李四' as stu_name,'英语' as subject,80 as score
union all
select 1003 as stu_id,'王五' as stu_name,'语文' as subject,80 as score
union all
select 1003 as stu_id,'王五' as stu_name,'数学' as subject,70 as score
union all
select 1003 as stu_id,'王五' as stu_name,'英语' as subject,60 as score
)
select
stu_id
,stu_name
,subject
,score
from
(
select
stu_id
,stu_name
,subject
,score
,rank() over(partition by subject order by score desc) as rk
from temp
) t1
where rk<=2;
---结果:
stu_id stu_name subject score
1001 张三 数学 80
1002 李四 数学 75
1002 李四 英语 80
1001 张三 英语 70
1001 张三 语文 90
1002 李四 语文 90
15.每年的在校人数
问题:
year表示学生入学年度,num表示对应年度录取学生人数,stu_len表示录取学生的学制;说明:例如录取年度2018学制是3年,表示该批学生在校年份为20182019、20192020、2020-2021,在算每年的在校人数时,2018/2019/2020/2021年份都需要算上。
以下是示例数据:
id year num stu_len
1 2018 2000 3
2 2019 2000 3
3 2020 1000 4
3 2020 2000 3
12345
根据以上示例计算出每年的在校人数
SQL解答:
由于需要计算每年的在校人数,所以先要造出连续的年份。然后与源表进行关联,关联条件保证年份在入学年份和结束年份之间即可。
with temp as
(
select 2018 as year,3 as stu_len,2000 as num
union all
select 2019 as year,3 as stu_len,2000 as num
union all
select 2020 as year,4 as stu_len,1000 as num
union all
select 2020 as year,3 as stu_len,2000 as num
)
select
t1.year
,sum(t2.num) as stu_num
from
(
select
t1.min_year+tab.pos as year
from
(
select
min(year) as min_year
,max(year+stu_len) as max_year
from temp
)t1
lateral view posexplode(split(repeat(',',max_year-min_year),',')) tab as pos,val
)t1
inner join temp t2
on t1.year between t2.year AND t2.year + t2.stu_len
group by t1.year
16.sql实现分钟级的趋势图
问题
在Hive或者ODPS中,怎么用sql实现分钟级的趋势图?比如从交易表中,如何统计0点到每分钟的交易趋势图?
原表:trade_A(trade_id,pay_time(格式是2020-08-05 10:30:28),pay_gmv)。希望用sql实现分钟级的0点到当前分钟的GMV。
结果表:result_A(minute_rn(分钟顺序),pay_gmv_td(每分钟的交易额,都是0点到当前分钟的累加值))。
以下是示例数据:
with tmp as (
select 101 as trade_id,'2020-08-05 00:30:28' as pay_time,100 as pay_gmv
union all
select 102 as trade_id,'2020-08-05 00:30:58' as pay_time,200 as pay_gmv
union all
select 103 as trade_id,'2020-08-05 00:35:28' as pay_time,300 as pay_gmv
union all
select 104 as trade_id,'2020-08-05 01:36:28' as pay_time,400 as pay_gmv
union all
select 105 as trade_id,'2020-08-06 00:20:28' as pay_time,500 as pay_gmv
union all
select 106 as trade_id,'2020-08-06 00:21:28' as pay_time,600 as pay_gmv
)
123456789101112131415
sql解答
其实这个题的核心就是使用sql生成分钟级的序列,与之前SQL面试题挑战08:补全缺失日的月销售累计的场景类似。然后关联原表,做开窗累加即可。而且生成分钟级的这段代码是可以收藏以后作为工具代码使用的。
select
minute_rn
-- 开窗累加即可
,sum(pay_gmv) over(partition by date_format(minute_rn,'yyyy-MM-dd') order by minute_rn) as pay_gmv_td -- 按当前时间按天开窗累加,得到每分钟的累计销售额
from
(
-- 根据数据中的日期生成连续的分钟序列。从最小日期的0点到最大日期的23:59分
select
t1.minute_rn
,nvl(t2.pay_gmv,0) as pay_gmv
from
(
select
from_unixtime(unix_timestamp(t1.min_pay_time)+tab.pos*60,'yyyy-MM-dd HH:mm') as minute_rn
from
(
select
min(date_format(pay_time,'yyyy-MM-dd 00:00:00')) as min_pay_time --取最小日期
,max(date_format(pay_time,'yyyy-MM-dd 23:59:59')) as max_pay_time --取最大日期
from tmp
)t1
lateral view posexplode(split(repeat(',',(unix_timestamp(max_pay_time)-unix_timestamp(min_pay_time))/60),',')) tab as pos,val
)t1
left join
(
-- 先按同分钟的数据进行一次聚合
select
date_format(pay_time,'yyyy-MM-dd HH:mm') as pay_time
,sum(pay_gmv) as pay_gmv
from tmp
group by date_format(pay_time,'yyyy-MM-dd HH:mm')
)t2
on t1.minute_rn=t2.pay_time
)t1
;
17.补全缺失日的月销售累计
问题:
现有一张员工的销售记录表,表样式如下。现在需要统计每个员工在2023年10月份,截止到每天的月累计销售额。注意:存在有的员工在某几天是没有销售记录的。
sale_date emp_id emp_name sale_amount
'2023-10-02' ,'101' ,'张三' , 1000
'2023-10-03' ,'101' ,'张三' , 3000
'2023-10-05' ,'101' ,'张三' , 4000
'2023-10-10' ,'101' ,'张三' , 2000
'2023-10-13' ,'101' ,'张三' , 5000
'2023-10-15' ,'101' ,'张三' , 4000
'2023-10-27' ,'101' ,'张三' , 2000
'2023-10-01' ,'102' ,'李四' , 1111
'2023-10-03' ,'102' ,'李四' , 2222
'2023-10-08' ,'102' ,'李四' , 3333
'2023-10-11' ,'102' ,'李四' , 111
'2023-10-23' ,'102' ,'李四' , 4550
'2023-10-28' ,'102' ,'李四' , 6666
1234567891011121314
SQL解决:
如果是每个员工在每天都有销售记录,那么直接开窗就可以计算出来当月截至到每天的累计销售额。现在由于销售记录有缺失,他虽然在某天没有销售记录,但是他是有当月累计销售额的,且当月累计销售额与前一天的累计销售额一样,这种依然需要统计出来。所以我们首先考虑将每个人每天的销售记录补齐,当天没有销售记录的那么销售额给0,然后就可以开窗计算当月截至到每天的累计销售额了。
with temp as (
select '2023-10-02' as sale_date,'101' as emp_id,'张三' as emp_name, 1000 as sale_amount
union all
select '2023-10-03' as sale_date,'101' as emp_id,'张三' as emp_name, 3000 as sale_amount
union all
select '2023-10-05' as sale_date,'101' as emp_id,'张三' as emp_name, 4000 as sale_amount
union all
select '2023-10-10' as sale_date,'101' as emp_id,'张三' as emp_name, 2000 as sale_amount
union all
select '2023-10-13' as sale_date,'101' as emp_id,'张三' as emp_name, 5000 as sale_amount
union all
select '2023-10-15' as sale_date,'101' as emp_id,'张三' as emp_name, 4000 as sale_amount
union all
select '2023-10-27' as sale_date,'101' as emp_id,'张三' as emp_name, 2000 as sale_amount
union all
select '2023-10-01' as sale_date,'102' as emp_id,'李四' as emp_name, 1111 as sale_amount
union all
select '2023-10-03' as sale_date,'102' as emp_id,'李四' as emp_name, 2222 as sale_amount
union all
select '2023-10-08' as sale_date,'102' as emp_id,'李四' as emp_name, 3333 as sale_amount
union all
select '2023-10-11' as sale_date,'102' as emp_id,'李四' as emp_name, 111 as sale_amount
union all
select '2023-10-23' as sale_date,'102' as emp_id,'李四' as emp_name, 4550 as sale_amount
union all
select '2023-10-28' as sale_date,'102' as emp_id,'李四' as emp_name, 6666 as sale_amount
)
select
dt
,t1.emp_id
,t1.emp_name
,nvl(t2.sale_amount,0) as sale_amount
,sum(if(t2.sale_amount is null ,0,t2.sale_amount))over(partition by t1.emp_id order by t1.dt) as total_sale_amount
from
(select
date_add(t.start_date,tab.pos) as dt
,t.emp_id
,t.emp_name
from
(
select
emp_id
,emp_name
,'2023-10-01' as start_date
,'2023-10-31' as end_date
from temp
group by emp_id,emp_name
)t
lateral view posexplode(split(repeat(',',datediff(end_date,start_date)),',')) tab as pos,val
) t1
left join
(
select
sale_date
,emp_id
,emp_name
,sale_amount
from temp
)t2
on t1.dt=t2.sale_date and t1.emp_id=t2.emp_id and t1.emp_name=t2.emp_name
;
https://ykg8hl7h33.feishu.cn/docx/VNo7dJLJfoDwT1xNNVpcLg8Snlc
最后编辑:admin 更新时间:2024-04-07 15:40