1.写出下面的结果

select * from (
   select 1 as num 
   union all
   select 2 as num 
   union all 
   select 3 as num
) t1 left join 
(
    select 1 as num 
    union all 
    select 2 as num
)t2 on t1.num=t2.num and t1.num=2

答案:结果为

t1 t2
1 NULL
2 2
3 NULL

2. 查询每门课都大于80分的同学的姓名

name course grad
张三 语文 88
张三 数学 99
张三 英语 88
李四 语文 70
李四 数学 90
王五 语文 88
王五 英语 90

答案:

select distinct name from table where 
name not in (select distinct name from table where grad<=80)

3.同时最大在线人数问题

问题:

问题:如下为某直播平台各主播的开播及关播时间明细数据,现在需要计算出该平台最高峰期同时在线的主播人数。

user_id     start_date              end_date
1001    2021-06-14 12:12:12     2021-06-14 18:12:12
1003    2021-06-14 13:12:12     2021-06-14 16:12:12
1004    2021-06-14 13:15:12     2021-06-14 20:12:12
1002    2021-06-14 15:12:12     2021-06-14 16:12:12
1005    2021-06-14 15:18:12     2021-06-14 20:12:12
1001    2021-06-14 20:12:12     2021-06-14 23:12:12
1006    2021-06-14 21:12:12     2021-06-14 23:15:12
1007    2021-06-14 22:12:12     2021-06-14 23:10:12

SQL解答:

这是非常经典的一个面试题,不管大厂小厂都有问到过。解题思路也比较固定:就是用1代表开播(此时用开播时间),-1代表关播(此时用关播时间),可以理解1代表主播开播加入增1,-1代表主播关播离开减1,然后开窗可以计算出到每个时间点时有多少主播同时在线,最后求最大值即可。

with tmp as
(
    select 1001 as user_id, '2021-06-14 12:12:12' as start_date , '2021-06-14 18:12:12' as end_date
    union all
    select 1003 as user_id, '2021-06-14 13:12:12' as start_date , '2021-06-14 16:12:12' as end_date
    union all
    select 1004 as user_id, '2021-06-14 13:15:12' as start_date , '2021-06-14 20:12:12' as end_date
    union all
    select 1002 as user_id, '2021-06-14 15:12:12' as start_date , '2021-06-14 16:12:12' as end_date
    union all
    select 1005 as user_id, '2021-06-14 15:18:12' as start_date , '2021-06-14 20:12:12' as end_date
    union all
    select 1001 as user_id, '2021-06-14 20:12:12' as start_date , '2021-06-14 23:12:12' as end_date
    union all
    select 1006 as user_id, '2021-06-14 21:12:12' as start_date , '2021-06-14 23:15:12' as end_date
    union all
    select 1007 as user_id, '2021-06-14 22:12:12' as start_date , '2021-06-14 23:10:12' as end_date
)
select
max(online_nums) as max_online_nums
from
(
    select
    user_id
    ,dt
    ,sum(flag) over(order by dt) as online_nums
    from
    (
        select
        user_id
        ,start_date as dt
        ,1 as flag  --开播记为1
        from tmp
        union all
        select
        user_id
        ,end_date as dt
        ,-1 as flag --关播记为-1
        from tmp
    )t1
)t1
;

4.打折日期交叉问题

问题:

如下为某平台的商品促销数据,字段含义分别为品牌名称、打折开始日期、打折结束日期,现在要计算每个品牌的打折销售天数(注意其中的交叉日期)。比如vivo的打折销售天数就为17天。

brand   start_date  end_date
xiaomi  2021-06-05  2021-06-09
xiaomi  2021-06-11  2021-06-21
vivo    2021-06-05  2021-06-15
vivo    2021-06-09  2021-06-21 
honor   2021-06-05  2021-06-21 
honor   2021-06-09  2021-06-15
redmi   2021-06-17  2021-06-26
huawei  2021-06-05  2021-06-26
huawei  2021-06-09  2021-06-15
huawei  2021-06-17  2021-06-21

SQL解答:

第一种方式:

根据每个品牌的促销开始时间和结束时间可以得到品牌每天促销的明细数据,然后,按品牌分组,日期去重就可以得到每个品牌打折销售天数。但此种方式适合数据量不大的情况,因为该方法会让数据膨胀的很厉害。

with temp as (
        select 'xiaomi' as brand   ,'2021-06-05' as start_date,'2021-06-09' as end_date
        union all
        select 'xiaomi' as brand   ,'2021-06-11' as start_date,'2021-06-21' as end_date
        union all
        select 'vivo' as brand   ,'2021-06-05' as start_date,'2021-06-15' as end_date
        union all
        select 'vivo' as brand   ,'2021-06-09' as start_date,'2021-06-21' as end_date
        union all 
        select 'honor' as brand  ,'2021-06-05' as start_date,'2021-06-21' as end_date
        union all 
        select 'honor' as brand  ,'2021-06-09' as start_date,'2021-06-15' as end_date
        union all
        select 'honor' as brand  ,'2021-06-17' as start_date,'2021-06-26' as end_date
        union all
        select 'huawei' as brand ,'2021-06-05' as start_date,'2021-06-26' as end_date
        union all
        select 'huawei' as brand ,'2021-06-09' as start_date,'2021-06-15' as end_date
        union all
        select 'huawei' as brand ,'2021-06-17' as start_date,'2021-06-21' as end_date
)
select
    brand
    ,start_date
    ,end_date
    ,date_add(start_date,tmp.col_idx) as dt,
    col_idx,
    col_val
from temp
lateral VIEW posexplode(split(repeat("#,",datediff(date(end_date), date(start_date))),'#')) tmp AS col_idx,col_val
;

备注:补充repeat函数

select  repeat("#,",datediff('2023-12-18','2023-12-01'))    
#,#,#,#,#,#,#,#,#,#,#,#,#,#,#,#,#,

select  split(repeat("#,",datediff('2023-12-18','2023-12-01')),'#')
["",",",",",",",",",",",",",",",",",",",",",",",",",",",",",",",",",","]

第二种方式:

第二种方式规避数据膨胀的情况,经过适当的处理,消除日期交叉的情况

with temp as (
        select 'xiaomi' as brand   ,'2021-06-05' as start_date,'2021-06-09' as end_date
        union all
        select 'xiaomi' as brand   ,'2021-06-11' as start_date,'2021-06-21' as end_date
        union all
        select 'vivo' as brand   ,'2021-06-05' as start_date,'2021-06-15' as end_date
        union all
        select 'vivo' as brand   ,'2021-06-09' as start_date,'2021-06-21' as end_date
        union all 
        select 'honor' as brand  ,'2021-06-05' as start_date,'2021-06-21' as end_date
        union all 
        select 'honor' as brand  ,'2021-06-09' as start_date,'2021-06-15' as end_date
        union all
        select 'honor' as brand  ,'2021-06-17' as start_date,'2021-06-26' as end_date
        union all
        select 'huawei' as brand ,'2021-06-05' as start_date,'2021-06-26' as end_date
        union all
        select 'huawei' as brand ,'2021-06-09' as start_date,'2021-06-15' as end_date
        union all
        select 'huawei' as brand ,'2021-06-17' as start_date,'2021-06-21' as end_date
)

select
brand
,sum(datediff(date(end_date),date(start_date))+1)
from
(
select
    brand
    ,case
    when start_date<=max_date then date_add(date(max_date),1)
    else start_date end
    as start_date
    ,end_date
    from(
        select
        brand
        ,start_date
        ,end_date
        ,max(end_date) over(partition by brand order by start_date rows between UNBOUNDED PRECEDING and 1 PRECEDING ) as max_date  --获取同一品牌内按开始日期排序后,取第一行到前一行的最大结束时间
        from temp
    )t1
    )t1
where end_date>=start_date
group by brand
;

补充:rows 和range的区别
在 SQL 中,rows 和 range 是两种不同的窗口帧(window frame)类型,它们定义了窗口函数的计算范围。
rows 窗口帧是基于行的,它使用一组相对于当前行的行号来定义窗口函数的计算范围。rows 窗口帧可以指定 UNBOUNDED PRECEDING、n PRECEDING、CURRENT ROW、n FOLLOWING 和 UNBOUNDED FOLLOWING 五种窗口帧范围。
range 窗口帧是基于值的,它使用一组相对于当前行的数值范围来定义窗口函数的计算范围。range 窗口帧可以指定 UNBOUNDED PRECEDING、n PRECEDING、CURRENT ROW、n FOLLOWING 和 UNBOUNDED FOLLOWING 五种窗口帧范围。

注释:
PRECEDING:往前
FOLLOWING:往后
CURRENT ROW:当前行
UNBOUNDED:起点
UNBOUNDED PRECEDING 表示从前面的起点
UNBOUNDED FOLLOWING:表示到后面的终点

一般来说,rows 和 range 窗口帧都可以用于定义窗口函数的计算范围,但是它们有一些不同的特点:rows 窗口帧是基于行的,它使用一组相对于当前行的行号来定义窗口函数的计算范围。因此,rows 窗口帧适用于基于行号的计算,例如计算排名、移动平均等。range 窗口帧是基于值的,它使用一组相对于当前行的数值范围来定义窗口函数的计算范围。因此,range 窗口帧适用于基于数值范围的计算,例如计算累计和、百分比等。

一般情况下,rows 窗口帧比 range 窗口帧更常用,因为基于行号的计算更加常见。但是在某些特殊情况下,range 窗口帧也可以使用。
例如:当窗口函数的计算范围基于连续的数值范围时,可以使用 range 窗口帧。例如,计算累计和、计算百分比等。当窗口函数的计算范围包含重复的值时,可以使用 range 窗口帧来避免重复计算。例如,计算连续相同值的最大长度、计算某个值在窗口中的出现次数等。
需要注意的是,对于一些特殊的窗口函数,可能只能使用 rows 窗口帧,例如计算排名、计算移动平均等。因此,在使用 range 窗口帧时,需要根据具体的需求和窗口函数的特性选择合适的窗口帧类型。

5.奖金瓜分问题(拼多多)

问题:

在活动大促中,有玩游戏瓜分奖金环节。现有奖金池为10000元,代表奖金池中的初始额度。用户的分数信息如下:

user_id  score
100       60
101       45
102       40
103       35
104       30
105       25
106       15
107       10
108       5

表中的数据代表每一个用户和其对应的得分,user_id和score都不会有重复值。瓜分奖金的规则如下:按照score从高到低依次瓜分,每个人都能分走当前奖金池里面剩余奖金的一半,当奖金池里面剩余的奖金少于250时(不含),则停止瓜分奖金。

现在需要查询出所有分到奖金的user_id和其对应的奖金。

SQL解答:

这是拼多多的一个面试题,需要先进行一点数学层面的分析,把整个瓜分逻辑捋清楚之后不难。这里给出一种思考逻辑:假设奖金池的初始总奖金为n,那么第一名分到的奖金为n/2,第二名分到奖金n/4,第三名分到的奖金为n/8,依次类推第x名分到的奖金为n/2^x,然后计算即可。

with temp as (
        select '100' as user_id   ,'60' as score
        union all
        select '101' as user_id   ,'45' as score
        union all
        select '102' as user_id   ,'40' as score
        union all
        select '103' as user_id   ,'35' as score
        union all 
        select '104' as user_id  ,'25' as score
        union all 
        select '105' as user_id  ,'15' as score
        union all
        select '106' as user_id  ,'10' as score
        union all
        select '107' as user_id ,'5' as score
        union all
        select '108' as user_id ,'90' as score
        union all
        select '109' as user_id ,'80' as score
        )

select user_id,score,power(0.5,rn)*10000 from
(select user_id,score,row_number() over(order by score desc) as rn
from temp )t
where power(0.5,rn)*10000>=250

6.找出使用相同ip的用户

问题:

现在有一张用户登陆日志表,该表包括user_id,ip,log_time三个字段,现在需要找出共同使用ip数量超过3个(含)的所有用户对。比如下面的示例数据,101和102用户共同使用的ip为4个,101和103用户共同使用的ip为3个,102和103用户共同使用的ip为3个。

(101,'192.168.10.101','2022-05-10 11:00:00'),
(101,'192.168.10.101','2022-05-10 11:01:00'),
(101,'192.168.10.102','2022-05-10 11:02:00'),
(101,'192.168.10.103','2022-05-10 11:03:00'),
(101,'192.168.10.104','2022-05-10 11:04:00'),

(102,'192.168.10.101','2022-05-10 11:04:30'),
(102,'192.168.10.102','2022-05-10 11:05:00'),
(102,'192.168.10.103','2022-05-10 11:06:00'),
(102,'192.168.10.104','2022-05-10 11:07:00'),

(103,'192.168.10.102','2022-05-10 11:08:00'),
(103,'192.168.10.103','2022-05-10 11:08:00'),
(103,'192.168.10.104','2022-05-10 11:10:00'),

(104,'192.168.10.103','2022-05-10 11:11:00'),
(104,'192.168.10.104','2022-05-10 11:12:00'),

(105,'192.168.10.105','2022-05-10 11:13:00')

SQL解答:

问题的关键点是使用自连接,先按用户和ip去重之后进行自关联。因为如果公共使用ip达到3个及以上的话,那么同一个用户对至少会出现3条数据,筛选一下就行。

with user_login as (
select 101 as user_id ,'192.168.10.101' as ip ,'2022-05-10 11:00:00' as log_time
union all                                                            
select 101 as user_id ,'192.168.10.101' as ip ,'2022-05-10 11:01:00' as log_time
union all                                                            
select 101 as user_id ,'192.168.10.102' as ip ,'2022-05-10 11:02:00' as log_time
union all                                                            
select 101 as user_id ,'192.168.10.103' as ip ,'2022-05-10 11:03:00' as log_time
union all                                                            
select 101 as user_id ,'192.168.10.104' as ip ,'2022-05-10 11:04:00' as log_time
union all                                                            
select 102 as user_id ,'192.168.10.101' as ip ,'2022-05-10 11:04:30' as log_time
union all                                                            
select 102 as user_id ,'192.168.10.102' as ip ,'2022-05-10 11:05:00' as log_time
union all                                                            
select 102 as user_id ,'192.168.10.103' as ip ,'2022-05-10 11:06:00' as log_time
union all                                                            
select 102 as user_id ,'192.168.10.104' as ip ,'2022-05-10 11:07:00' as log_time
union all                                                            
select 103 as user_id ,'192.168.10.102' as ip ,'2022-05-10 11:08:00' as log_time
union all                                                            
select 103 as user_id ,'192.168.10.103' as ip ,'2022-05-10 11:08:00' as log_time
union all                                                            
select 103 as user_id ,'192.168.10.104' as ip ,'2022-05-10 11:10:00' as log_time
union all                                                            
select 104 as user_id ,'192.168.10.103' as ip ,'2022-05-10 11:11:00' as log_time
union all                                                            
select 104 as user_id ,'192.168.10.104' as ip ,'2022-05-10 11:12:00' as log_time
union all                                                            
select 105 as user_id ,'192.168.10.105' as ip ,'2022-05-10 11:13:00' as log_time
),
tmp as
(
    select 
    user_id
    ,ip
    from user_login --实际换成自己的表或上面的样例数据
    group by user_id,ip  --同一个ip同一用户可能多次登录,先去重
)

select
t1.user_id
,t2.user_id
,count(t1.ip) as ip_cnt
from tmp t1
inner join tmp t2
on t1.ip=t2.ip  --通过ip自关联
where t1.user_id<t2.user_id  --因为存在101对102,102对101的情况,保留一种即可
group by t1.user_id,t2.user_id
having ip_cnt>=3  --保留用户对ip数量超过3个的(含)

7.找出恶意购买用户

问题:

下面是某电商网站的订单数据,包括order_id,user_id,order_status和operate_time四个字段,我们需要找出所有恶意购买的用户。恶意购买的用户定义是:同一个用户,在任意半小时内(含),取消订单次数>=2次的就被视为恶意买家。比如该样例数据中c用户就是恶意买家。

order_id    user_id    order_status     operate_time
1101         a         已支付        2023-01-01 10:00:00
1102         a         已取消        2023-01-01 10:10:00
1103         a         待支付        2023-01-01 10:20:00
1104         b         已取消        2023-01-01 10:30:00
1105         a         待确认        2023-01-01 10:50:00
1106         a         已取消        2023-01-01 11:00:00
1107         b         已取消        2023-01-01 11:40:00
1108         b         已取消        2023-01-01 11:50:00
1109         b         已支付        2023-01-01 12:00:00
1110         b         已取消        2023-01-01 12:11:00
1111         c         已取消        2023-01-01 12:20:00
1112         c         已取消        2023-01-01 12:30:00
1113         c         已取消        2023-01-01 12:55:00
1114         c         已取消        2023-01-01 13:00:00

SQL解答:

典型的滑动窗口的场景。Hive中也是有滑动窗口的功能的(按数据范围开窗,range between and )。针对这个例子,窗口大小就是半小时,然后按每条数据进行滑动,在窗口内判断该条数据对应的用户是否是恶意用户。

with temp  as (
select  1101 as order_id  ,'a'  as user_id  ,"已支付" as order_status,  "2023-01-01 10:00:00"  as operate_time
union all 
select  1102 as order_id  ,'a'  as user_id  ,"已取消" as order_status,  "2023-01-01 10:10:00"  as operate_time
union all 
select  1103 as order_id  ,'a'  as user_id  ,"待支付" as order_status,  "2023-01-01 10:20:00"  as operate_time
union all 
select  1104 as order_id  ,'b'  as user_id  ,"已取消" as order_status,  "2023-01-01 10:30:00"  as operate_time
union all 
select  1105 as order_id  ,'a'  as user_id  ,"待确认" as order_status,  "2023-01-01 10:50:00"  as operate_time
union all 
select  1106 as order_id  ,'a'  as user_id  ,"已取消" as order_status,  "2023-01-01 11:00:00"  as operate_time
union all 
select  1107 as order_id  ,'b'  as user_id  ,"已取消" as order_status,  "2023-01-01 11:40:00"  as operate_time
union all 
select  1108 as order_id  ,'b'  as user_id  ,"已取消" as order_status,  "2023-01-01 11:50:00"  as operate_time
union all 
select  1109 as order_id  ,'b'  as user_id  ,"已支付" as order_status,  "2023-01-01 12:00:00"  as operate_time
union all 
select  1110 as order_id  ,'b'  as user_id  ,"已取消" as order_status,  "2023-01-01 12:11:00"  as operate_time
union all 
select  1111 as order_id  ,'c'  as user_id  ,"已取消" as order_status,  "2023-01-01 12:20:00"  as operate_time
union all 
select  1112 as order_id  ,'c'  as user_id  ,"已取消" as order_status,  "2023-01-01 12:30:00"  as operate_time
union all 
select  1113 as order_id  ,'c'  as user_id  ,"已取消" as order_status,  "2023-01-01 12:55:00"  as operate_time
union all 
select  1114 as order_id  ,'c'  as user_id  ,"已取消" as order_status,  "2023-01-01 13:00:00"  as operate_time
)

select 
distinct user_id
from 
(select 
user_id
,order_id
,count(order_id) over (partition by user_id order by operate_time range between 1800 preceding and current row) as cancel_order_cnt 
--通过range between以当前行为锚点,圈定数据范围为operate_time为近30分钟内,然后算该范围内的取消订单数
from ( select 
    order_id
    ,user_id
    ,unix_timestamp(operate_time) as operate_time from temp ---由于range...between只能整数比较,这里先转换为秒 where order_status='已取消' 
)t1 
)t2 
where cancel_order_cnt>=2

--结果
"user_id"
"b"
"c"

8.互相关注的人

问题:

现在有一张relation表,里面只有两个字段:from_user和to_user,代表关注关系从from指向to,即from_user关注了to_user。现在要找出互相关注的所有人。

from_user    to_user
孙悟空          唐僧
唐僧            如来佛祖
唐僧            观音菩萨
观音菩萨         如来佛祖
唐僧            孙悟空
孙悟空          玉皇大帝
玉皇大帝        如来佛祖
如来佛祖         观音菩萨
如来佛祖         玉皇大帝
如来佛祖         唐僧
孙悟空          猪八戒
猪八戒            嫦娥
猪八戒           孙悟空
猪八戒           唐僧
猪八戒          沙僧
沙僧            猪八戒
沙僧            玉皇大帝
沙僧            孙悟空
沙僧            唐僧

SQL解答:

解答思路一:使用自关联即可,这种方式简单也最易理解。适合数据量不是很大的情况,因为会导致数据膨胀。

with tmp as
(
select '孙悟空' as from_user ,     '唐僧'   as to_user
union all
select '唐僧' as from_user ,     '如来佛祖'   as to_user
union all
select '唐僧' as from_user ,     '观音菩萨'   as to_user
union all
select '观音菩萨' as from_user ,     '如来佛祖'   as to_user
union all
select '唐僧' as from_user ,     '孙悟空'   as to_user
union all
select '孙悟空' as from_user ,     '玉皇大帝'   as to_user
union all
select '玉皇大帝' as from_user ,     '如来佛祖'   as to_user
union all
select '如来佛祖' as from_user ,     '观音菩萨'   as to_user
union all
select '如来佛祖' as from_user ,     '玉皇大帝'   as to_user
union all
select '如来佛祖' as from_user ,     '唐僧'   as to_user
union all
select '孙悟空' as from_user ,     '猪八戒'   as to_user
union all
select '猪八戒' as from_user ,     '嫦娥'   as to_user
union all
select '猪八戒' as from_user ,     '孙悟空'   as to_user
union all
select '猪八戒' as from_user ,     '唐僧'   as to_user
union all
select '猪八戒' as from_user ,     '沙僧'   as to_user
union all
select '沙僧' as from_user ,     '猪八戒'   as to_user
union all
select '沙僧' as from_user ,     '玉皇大帝'   as to_user
union all
select '沙僧' as from_user ,     '孙悟空'   as to_user
union all
select '沙僧' as from_user ,     '唐僧'   as to_user
)
select
a.from_user,
a.to_user,
if(b.from_user is not null, 1, 0) as is_friend -- 1:互相关注 
from tmp a
left join tmp b
on a.from_user=b.to_user and a.to_user=b.from_user where if(b.from_user is not null, 1, 0) =1
;

9.间隔连续问题(连续的升级版)

问题:

下面是某游戏公司记录的用户每日登录数据, 计算每个用户最大的连续登录天数,定义连续登录时可以间隔一天。举例:如果一个用户在 1,3,5,6,9 登录了游戏,则视为连续 6 天登录。

user_id          dt
1001    2021-12-12
1002    2021-12-12
1001    2021-12-13
1001    2021-12-14
1001    2021-12-16
1002    2021-12-16
1001    2021-12-19
1002    2021-12-17
1001    2021-12-20

SQL解答:

这是个连续问题的升级版,当满足某种要求时我们也是算作连续的,所以不能使用传统的连续编号,然后做差值的解法了。核心思路解析如下
| 登录日期 | 第一步:上一个日期 | 第二步:判断登录日期与上一个日期差值是否在2之内 | 第三步:然后根据标记开窗做sum |
| ——– | —————— | ———————————————– | :—————————- |
| 1 | 1 | 0 | 0 |
| 3 | 1 | 0 | 0 |
| 5 | 3 | 0 | 0 |
| 6 | 5 | 0 | 0 |
| 9 | 6 | 1 | 1 |
| 11 | 9 | 0 | 1 |

这种解法是比较常见的,很多场景都可以这样使用。还有比如计算用户的会话数,当两次会话时间超过1分钟时就算做不同的会话,也可以这样做。

with temp as (       
select '1001' as user_id , '2021-12-12' as  dt
union all 
select '1002' as user_id , '2021-12-12' as  dt
union all
select '1001' as user_id , '2021-12-13' as  dt
union all
select '1001' as user_id , '2021-12-14' as  dt
union all
select '1001' as user_id , '2021-12-16' as  dt
union all
select '1002' as user_id , '2021-12-16' as  dt
union all
select '1001' as user_id , '2021-12-19' as  dt
union all
select '1002' as user_id , '2021-12-17' as  dt
union all
select '1001' as user_id , '2021-12-20' as  dt
)

select
user_id
,max(diff) as max_login_days
from
( select
    user_id 
    ,user_group
    ,datediff(max(dt),min(dt))+1 as diff  --拿到每个用户下,连续时间里面最大日期与最小日期的差值加1就得到来连续天数
    from
    (
select
        user_id
        ,dt
        -- 如果当前日期与上一个日期的差值在2之内,那么就给0,否则给1
        ,sum(if(datediff(dt,last_dt)<=2,0,1)) over(partition by user_id order by dt) as user_group
        from
        (
select
            user_id
            ,dt
            ,lag(dt,1,dt) over(partition by user_id order by dt) as last_dt --根据user_id分组,拿到当前行的上一个日期,没有上一个就给自己本身的值
            from temp
)t1
)t1
group by user_id ,user_group
)t1
group by user_id 
;

或者

with temp as (       
select '1001' as user_id , '2021-12-12' as  dt
union all 
select '1002' as user_id , '2021-12-12' as  dt
union all
select '1001' as user_id , '2021-12-13' as  dt
union all
select '1001' as user_id , '2021-12-14' as  dt
union all
select '1001' as user_id , '2021-12-16' as  dt
union all
select '1002' as user_id , '2021-12-16' as  dt
union all
select '1001' as user_id , '2021-12-19' as  dt
union all
select '1002' as user_id , '2021-12-17' as  dt
union all
select '1001' as user_id , '2021-12-20' as  dt
)
select
user_id
,max(diff) as max_login_days
from
( select
    user_id 
    ,user_group
    ,datediff(max(dt),min(dt))+1 as diff  --拿到每个用户下,连续时间里面最大日期与最小日期的差值加1就得到来连续天数
    from
    (
select
        user_id
        ,dt
        -- 如果当前日期与上一个日期的差值在2之内,那么就给0,否则给1
        ,sum(if(datediff(dt,last_dt)<=2,0,1)) over(partition by user_id order by dt) as user_group
        from
        (
select user_id,dt,min(dt) over (partition by user_id order by dt rows between 1 preceding and current row ) as last_dt from temp 
)t1
)t1
group by user_id ,user_group
)t1
group by user_id 
;

补充:hive登录相关传送门
hive连续登录问题:hive连续登录问题
hive之连续登录问题(2):hive之连续登录问题(2)
Hive之连续登录问题(补充):Hive之连续登录问题(补充)

10.有效值追溯

问题:

现在有一张商品入库表,包括商品id、商品成本和入库日期3个字段,由于某些原因,导致部分商品的成本缺失(为0或者没有值都是缺失),这样不利于我们计算成本。所以现在要把缺失的商品进价补充完整,补充的依据是使用相同商品的最近一次有效成本作为当前商品的成本。比如2023-11-04号101商品的cost就需要用300.39填充。

product_id   cost       date    
101         300.39    2023-11-01  
102          500      2023-11-02
101           0       2023-11-03
101                   2023-11-04
102          600      2023-11-04
102                   2023-11-05
103          983      2023-11-06 
123456789

SQL解答:

该场景核心考察的就是开窗函数(last_value)的熟练使用。在同一个窗口内取最后一条有效值作为当前行的值。题不难,但是是一种很好的解题思路。

with temp as
(
    select 101 as product_id,300.39 as cost,'2023-11-01' as date
    union all
    select 102 as product_id,500 as cost,'2023-11-02' as date
    union all
    select 101 as product_id,0 as cost,'2023-11-03' as date
    union all
    select 101 as product_id,null as cost,'2023-11-04' as date
    union all
    select 102 as product_id,600 as cost,'2023-11-04' as date
    union all
    select 102 as product_id,null as cost,'2023-11-05' as date
    union all
    select 103 as product_id,983 as cost,'2023-11-06' as date
)
select
product_id
-- last_value第二个参数设置为true,表示取最后一个值时跳过null值。默认不写为false
,last_value(cost,true) over(partition by product_id order by date) as valid_last_value
,date
from(
   select
    product_id
    ,if(cost=0 or cost is null,null,cost) as cost  --当cost为0或null时就给null值
    ,date
    from temp
) t1
12345678910111213141516171819202122232425262728

补充last_value函数及first_value函数

1、last_value函数
Hive 中,LAST_VALUE() 是一种窗口函数,用于获取指定列的最后一个值。它的语法如下:

LAST_VALUE(expr,ignore_nulls) OVER (
  [PARTITION BY partition_expression, ... ]
  ORDER BY sort_expression [ASC|DESC], ...
  [ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | range_frame]
)
12345

其中,expr 表示要获取最后一个值的列名或表达式,ignore_nulls 表示是否忽略 NULL 值,如果设置为 true,则 LAST_VALUE() 函数会忽略 NULL 值并返回最后一个非 NULL 值,否则返回 NULL。PARTITION BY 子句用于指定分区字段,ORDER BY 子句用于指定排序规则,ROWS 子句用于指定窗口范围。

SELECT content_id, create_time, value, LAST_VALUE(value, true) OVER (PARTITION BY content_id ORDER BY create_time) as last_value
FROM my_table;
12

2、first_value函数
在 Hive 中,FIRST_VALUE() 是一种窗口函数,用于获取指定列的第一个值。它的语法如下:

FIRST_VALUE(expr) OVER (
  [PARTITION BY partition_expression, ... ]
  ORDER BY sort_expression [ASC|DESC], ...
  [ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | range_frame]
)
12345

其中,expr 表示要获取第一个值的列名或表达式,PARTITION BY 子句用于指定分区字段,ORDER BY 子句用于指定排序规则,ROWS 子句用于指定窗口范围。FIRST_VALUE() 函数的括号中除了参数 expr 之外,没有其他可选参数。FIRST_VALUE() 函数的主要作用是返回指定列的第一个值,并且不支持忽略 NULL 值。

SELECT content_id, create_time, value, FIRST_VALUE(value) OVER (PARTITION BY content_id ORDER BY create_time) as first_value
FROM my_table;
12

11.累计占比

问题:

现在有一张每个年份的每个部门的收入表。现在需要算每个部门的收入占同类型部门的收入的占比和当年整个公司的收入占比。要求一条SQL计算出来。比如研发部和产品部属于同类型的,都是产研;财务部和人事部都属于职能。

year    dept    income
2023    研发部    5000
2023    产品部    6000
2023    财务部    7000
2023    人事部    8000
2022    研发部    10000
2022    产品部    8000
2022    财务部    9000
2022   人事部    8000

SQL解答:

考察sum() over 开窗函数的使用。

with temp as
(
    select '2023' as year,"研发部" as dept,5000 as income
    union all
    select '2023' as year,"产品部" as dept,6000 as income
    union all
    select '2023' as year,"财务部" as dept,7000 as income
    union all
    select '2023' as year,"人事部" as dept,8000 as income
    union all 
    select '2022' as year,"研发部" as dept,10000 as income
    union all
    select '2022' as year,"产品部" as dept,8000 as income
    union all
    select '2022' as year,"财务部" as dept,9000 as income
    union all
    select '2022' as year,"人事部" as dept,8000 as income
)
select
year
,dept
,income
,round(income/similar_dept_income,2) as similar_dept_income_rate
,round(income/year_income,2) as year_income_rate
from
(
    select
    year
    ,dept
    ,income
    ,sum(income) over(partition by year,case 
                        when dept in("研发部","产品部") then "产研"
                        when dept in("财务部","人事部") then "职能"
                        end
    ) as similar_dept_income  --同类型部门收入
    ,sum(income) over(partition by year) as year_income
    from temp
) t1
;
----结果为:
序号       year      dept       income     similar_dept_income_rate     year_income_rate
1           2022     研发部      10000              0.56                      0.29
2           2022     产品部      8000               0.44                      0.23
3           2022     财务部      9000               0.53                      0.26
4           2022     人事部      8000               0.47                      0.23
5           2023     研发部      5000               0.45                      0.19
6           2023     产品部      6000               0.55                      0.23
7           2023     财务部      7000               0.47                      0.27
8          2023     人事部      8000               0.53                      0.31

12.访问会话切割

问题:

如下为某电商公司用户访问网站的数据,包括用户id和访问时间两个字段。现有如下规则:如果某个用户的连续的访问记录时间间隔小于60秒,则属于同一个会话,现在需要计算每个用户有多少个会话。比如A用户在第1秒,60秒,200秒,230秒有三次访问记录,则该用户有2个会话,其中第一个会话是第1秒和第60秒的记录,第二个会话是第200秒和230秒的记录。

user_id     ts
1001    16920000000
1001    16920000050
1002    16920000065
1002    16920000080
1001    16920000150
1002    16920000160

SQL解答:

先按用户分组、时间排序后取每行数据的前一行的时间,然后判断当前行的时间与前一行时间的差值,看是否在给定的范围内,然后再做开窗累加就可以得到每个用户不同的会话编号了。思路如下图:

user_id ts 判断与上一行差值是否小于60 开窗累加当做会话编号
A 1 0 0
A 60 0 0
A 200 1 1
A 230 0 1
with tmp as (
    select 1001 as user_id,16920000000 as ts
    union all
    select 1001 as user_id,16920000050 as ts
    union all
    select 1002 as user_id,16920000065 as ts
    union all
    select 1002 as user_id,16920000080 as ts
    union all
    select 1001 as user_id,16920000150 as ts
    union all
    select 1002 as user_id,16920000160 as ts
)

select
    user_id
    ,count(distinct user_group) as user_group_cnt
from
(
 select
    user_id
    ,ts
    -- 开窗做累加
    ,sum(flag) over(partition by user_id order by ts) as user_group
    from
    (
select
        user_id
        ,ts
        -- 判断当前行的时间与上一行的差值
        ,if(ts-last_ts<60,0,1) as flag
        from
        (
select
 user_id
 ,ts
 -- 取当前行的上一个时间,没有上一行就给自身的时间
 ,lag(ts,1,ts) over(partition by user_id order by ts) as last_ts
 from tmp
)t1
)t1
)t1
group by user_id;

13.计算部门的平均工资

问题:

现在要计算每个部门的平均工资(工资和/员工数),但是要去掉部门的最高工资和最低工资(如果一个部门最高或最低工资有并列的,去掉一个最高的和一个最低的)后,计算部门的平均工资。

     101     ,"研发部"     ,50000         
     102     ,"研发部"     ,50000         
     103     ,"研发部"     ,10000     
     104     ,"研发部"     ,20000     
     105     ,"研发部"     ,30000     
     106     ,"市场部"     ,20000     
     107     ,"市场部"     ,30000     
     108     ,"产品部"     ,20000     
     109     ,"产品部"     ,30000     
     110     ,"产品部"     ,25000 
     111    ,"市场部"    ,14000
     112    ,"市场部"    ,8000
     113    ,"产品部"    ,20000 
1234567891011121314

SQL解答:

使用开窗函数找出该部门的最高工资和最低工资,并按最高到最低和最低到最高分别给排名,最后排除掉满足工资要求且排名在最前面的员工即可。

with temp as
(
    select 101 as user_id,"研发部" as dept,50000 as salary
    union all
    select 102 as user_id,"研发部" as dept,50000 as salary
    union all
    select 103 as user_id,"研发部" as dept,10000 as salary
    union all
    select 104 as user_id,"研发部" as dept,20000 as salary
    union all
    select 105 as user_id,"研发部" as dept,30000 as salary
    union all
    select 106 as user_id,"市场部" as dept,20000 as salary
    union all
    select 107 as user_id,"市场部" as dept,30000 as salary
    union all
    select 108 as user_id,"产品部" as dept,20000 as salary
    union all
    select 109 as user_id,"产品部" as dept,30000 as salary
    union all
    select 110 as user_id,"产品部" as dept,25000 as salary
    union all 
    select 111 as user_id,"市场部" as dept,14000 as salary
    union all
    select 112 as user_id,"市场部" as dept,8000 as salary
    union all
    select 113 as user_id,"产品部" as dept,20000 as salary
)
select
dept
,dept_total_salary
,dept_employee_cnt
,round(dept_total_salary/dept_employee_cnt,2) as dept_avg_salary
from
(
    select
    dept
    ,sum(if((salary=min_salary and min_salary_rn=1) or (salary=max_salary and max_salary_rn=1),0,salary)) as dept_total_salary --去掉一个最高工资和一个最低工资后剩下的部门总工资
    ,sum(if((salary=min_salary and min_salary_rn=1) or (salary=max_salary and max_salary_rn=1),0,1)) as dept_employee_cnt  --去掉一个最高分和一个最低分后的剩余人数
    from
    (
        select
        user_id
        ,dept
        ,salary
        ,min(salary) over(partition by dept) as min_salary --部门的最高工资
        ,max(salary) over(partition by dept) as max_salary --部门的最低工资
        ,row_number() over(partition by dept order by salary desc) as max_salary_rn  --为了使用编号去掉其中一个最高分
        ,row_number() over(partition by dept order by salary) as min_salary_rn  --为了使用编号去掉其中一个最低分
        from temp 
    )t1
    group by dept
)t1
;

----结果为:
dept        dept_total_salary        dept_employee_cnt        dept_avg_salary    
产品部            45000                        2                    22500.0
市场部          34000                       2                   17000.0
研发部          100000                      3                   33333.33

14.分组topN

问题:

下面是某个班级的成绩表,需要筛选出每个科目前2名的学生信息。如果分数一样,名次是并列的,后面的同学名次就不连续。比如有2个同学是第一名,那么下一个同学的名次就是第3名,呈现1,1,3的名次排列。

stu_id  stu_name    subject     score
1001    张三        语文           90
1001    张三        数学           80
1001    张三        英语           70
1002    李四        语文           90
1002    李四        数学           75
1002    李四        英语           80
1003    王五        语文           80
1003    王五        数学           70
1003    王五        英语           60
1234567891011

SQL解答:

非常典型的分组topN的问题,是面试经常被问到的。直接使用开窗函数排名即可,注意下row_number、rank、dense_rank三个开窗函数不同场景下的使用。

with temp as
(
    select 1001 as stu_id,'张三' as stu_name,'语文' as subject,90 as score
    union all
    select 1001 as stu_id,'张三' as stu_name,'数学' as subject,80 as score
    union all
    select 1001 as stu_id,'张三' as stu_name,'英语' as subject,70 as score
    union all
    select 1002 as stu_id,'李四' as stu_name,'语文' as subject,90 as score
    union all
    select 1002 as stu_id,'李四' as stu_name,'数学' as subject,75 as score
    union all
    select 1002 as stu_id,'李四' as stu_name,'英语' as subject,80 as score
    union all
    select 1003 as stu_id,'王五' as stu_name,'语文' as subject,80 as score
    union all
    select 1003 as stu_id,'王五' as stu_name,'数学' as subject,70 as score
    union all
    select 1003 as stu_id,'王五' as stu_name,'英语' as subject,60 as score
)
select
stu_id
,stu_name
,subject
,score
from
(
   select 
    stu_id
    ,stu_name
    ,subject
    ,score
    ,rank() over(partition by subject order by score desc) as rk
    from temp
) t1
where rk<=2;

---结果:
stu_id        stu_name        subject            score    
1001          张三            数学           80    
1002          李四            数学           75
1002          李四            英语           80    
1001          张三            英语           70    
1001          张三            语文           90
1002          李四            语文           90

15.每年的在校人数

问题:

year表示学生入学年度,num表示对应年度录取学生人数,stu_len表示录取学生的学制;说明:例如录取年度2018学制是3年,表示该批学生在校年份为20182019、20192020、2020-2021,在算每年的在校人数时,2018/2019/2020/2021年份都需要算上。
以下是示例数据:

id    year    num    stu_len
1    2018    2000        3
2    2019    2000        3
3    2020    1000        4
3    2020    2000        3
12345

根据以上示例计算出每年的在校人数

SQL解答:

由于需要计算每年的在校人数,所以先要造出连续的年份。然后与源表进行关联,关联条件保证年份在入学年份和结束年份之间即可。

with temp as
(
    select 2018 as year,3 as stu_len,2000 as num
    union all
    select 2019 as year,3 as stu_len,2000 as num
    union all
    select 2020 as year,4 as stu_len,1000 as num
    union all
    select 2020 as year,3 as stu_len,2000 as num
)

select
t1.year
,sum(t2.num) as stu_num
from
(
select
    t1.min_year+tab.pos as year
    from
    (
        select
        min(year) as min_year
        ,max(year+stu_len) as max_year
        from temp
    )t1
    lateral view posexplode(split(repeat(',',max_year-min_year),',')) tab as pos,val
)t1
inner join temp t2
on t1.year between t2.year AND t2.year + t2.stu_len
group by t1.year

16.sql实现分钟级的趋势图

问题

Hive或者ODPS中,怎么用sql实现分钟级的趋势图?比如从交易表中,如何统计0点到每分钟的交易趋势图?

原表:trade_A(trade_id,pay_time(格式是2020-08-05 10:30:28),pay_gmv)。希望用sql实现分钟级的0点到当前分钟的GMV。
结果表:result_A(minute_rn(分钟顺序),pay_gmv_td(每分钟的交易额,都是0点到当前分钟的累加值))。
以下是示例数据:

with tmp as (
    select 101 as trade_id,'2020-08-05 00:30:28' as pay_time,100 as pay_gmv
    union all
    select 102 as trade_id,'2020-08-05 00:30:58' as pay_time,200 as pay_gmv
    union all
    select 103 as trade_id,'2020-08-05 00:35:28' as pay_time,300 as pay_gmv
    union all
    select 104 as trade_id,'2020-08-05 01:36:28' as pay_time,400 as pay_gmv
    union all
    select 105 as trade_id,'2020-08-06 00:20:28' as pay_time,500 as pay_gmv
    union all
    select 106 as trade_id,'2020-08-06 00:21:28' as pay_time,600 as pay_gmv
)

123456789101112131415

sql解答

其实这个题的核心就是使用sql生成分钟级的序列,与之前SQL面试题挑战08:补全缺失日的月销售累计的场景类似。然后关联原表,做开窗累加即可。而且生成分钟级的这段代码是可以收藏以后作为工具代码使用的。

select
minute_rn
-- 开窗累加即可
,sum(pay_gmv) over(partition by date_format(minute_rn,'yyyy-MM-dd') order by minute_rn) as pay_gmv_td -- 按当前时间按天开窗累加,得到每分钟的累计销售额
from
(
    -- 根据数据中的日期生成连续的分钟序列。从最小日期的0点到最大日期的23:59分
    select
    t1.minute_rn
    ,nvl(t2.pay_gmv,0) as pay_gmv
    from
    (
        select
        from_unixtime(unix_timestamp(t1.min_pay_time)+tab.pos*60,'yyyy-MM-dd HH:mm') as minute_rn
        from
        (
            select
            min(date_format(pay_time,'yyyy-MM-dd 00:00:00')) as min_pay_time --取最小日期
            ,max(date_format(pay_time,'yyyy-MM-dd 23:59:59')) as max_pay_time --取最大日期
            from tmp
        )t1
        lateral view posexplode(split(repeat(',',(unix_timestamp(max_pay_time)-unix_timestamp(min_pay_time))/60),',')) tab as pos,val
    )t1
    left join
    (   
        -- 先按同分钟的数据进行一次聚合
        select
        date_format(pay_time,'yyyy-MM-dd HH:mm') as pay_time
        ,sum(pay_gmv) as pay_gmv
        from tmp
        group by date_format(pay_time,'yyyy-MM-dd HH:mm')
    )t2
    on t1.minute_rn=t2.pay_time
)t1
;

17.补全缺失日的月销售累计

问题:

现有一张员工的销售记录表,表样式如下。现在需要统计每个员工在2023年10月份,截止到每天的月累计销售额。注意:存在有的员工在某几天是没有销售记录的。

   sale_date   emp_id  emp_name sale_amount
 '2023-10-02' ,'101' ,'张三' , 1000 
 '2023-10-03' ,'101' ,'张三' , 3000 
 '2023-10-05' ,'101' ,'张三' , 4000 
 '2023-10-10' ,'101' ,'张三' , 2000 
 '2023-10-13' ,'101' ,'张三' , 5000 
 '2023-10-15' ,'101' ,'张三' , 4000 
 '2023-10-27' ,'101' ,'张三' , 2000  
 '2023-10-01' ,'102' ,'李四' , 1111 
 '2023-10-03' ,'102' ,'李四' , 2222 
 '2023-10-08' ,'102' ,'李四' , 3333 
 '2023-10-11' ,'102' ,'李四' , 111 
 '2023-10-23' ,'102' ,'李四' , 4550 
 '2023-10-28' ,'102' ,'李四' , 6666 
1234567891011121314

SQL解决:

如果是每个员工在每天都有销售记录,那么直接开窗就可以计算出来当月截至到每天的累计销售额。现在由于销售记录有缺失,他虽然在某天没有销售记录,但是他是有当月累计销售额的,且当月累计销售额与前一天的累计销售额一样,这种依然需要统计出来。所以我们首先考虑将每个人每天的销售记录补齐,当天没有销售记录的那么销售额给0,然后就可以开窗计算当月截至到每天的累计销售额了。

with temp as (
select '2023-10-02' as sale_date,'101' as emp_id,'张三' as emp_name, 1000 as sale_amount
union all
select '2023-10-03' as sale_date,'101' as emp_id,'张三' as emp_name, 3000 as sale_amount
union all
select '2023-10-05' as sale_date,'101' as emp_id,'张三' as emp_name, 4000 as sale_amount
union all
select '2023-10-10' as sale_date,'101' as emp_id,'张三' as emp_name, 2000 as sale_amount
union all
select '2023-10-13' as sale_date,'101' as emp_id,'张三' as emp_name, 5000 as sale_amount
union all
select '2023-10-15' as sale_date,'101' as emp_id,'张三' as emp_name, 4000 as sale_amount
union all
select '2023-10-27' as sale_date,'101' as emp_id,'张三' as emp_name, 2000 as sale_amount
union all 
select '2023-10-01' as sale_date,'102' as emp_id,'李四' as emp_name, 1111 as sale_amount
union all
select '2023-10-03' as sale_date,'102' as emp_id,'李四' as emp_name, 2222 as sale_amount
union all
select '2023-10-08' as sale_date,'102' as emp_id,'李四' as emp_name, 3333 as sale_amount
union all 
select '2023-10-11' as sale_date,'102' as emp_id,'李四' as emp_name, 111 as sale_amount
union all
select '2023-10-23' as sale_date,'102' as emp_id,'李四' as emp_name, 4550 as sale_amount
union all
select '2023-10-28' as sale_date,'102' as emp_id,'李四' as emp_name, 6666 as sale_amount
)


select 
    dt
    ,t1.emp_id
    ,t1.emp_name
    ,nvl(t2.sale_amount,0) as sale_amount
    ,sum(if(t2.sale_amount is null ,0,t2.sale_amount))over(partition by t1.emp_id order by t1.dt) as total_sale_amount
from  
(select
    date_add(t.start_date,tab.pos) as dt
    ,t.emp_id
    ,t.emp_name
    from
    (
        select
        emp_id
        ,emp_name
        ,'2023-10-01' as start_date
        ,'2023-10-31' as end_date
        from temp
        group by emp_id,emp_name
    )t
    lateral view posexplode(split(repeat(',',datediff(end_date,start_date)),',')) tab as pos,val
) t1
left join
(
    select
    sale_date
    ,emp_id
    ,emp_name
    ,sale_amount
    from temp
)t2
on t1.dt=t2.sale_date and t1.emp_id=t2.emp_id and t1.emp_name=t2.emp_name
;

https://ykg8hl7h33.feishu.cn/docx/VNo7dJLJfoDwT1xNNVpcLg8Snlc

作者:admin  创建时间:2024-03-13 19:47
最后编辑:admin  更新时间:2024-04-07 15:40