1.举例数据
id stt edt
1001 2022-05-20 12:12:12 2022-05-20 15:18:16
1002 2022-05-20 12:13:14 2022-05-20 16:18:17
1003 2022-05-20 13:18:12 2022-05-20 16:18:16
1004 2022-05-20 14:12:12 2022-05-20 18:18:19
1005 2022-05-20 15:12:12 2022-05-20 17:20:16
1001 2022-05-20 15:50:12 2022-05-20 16:18:16
1006 2022-05-20 16:12:12 2022-05-20 19:18:16
2:整体思路:采用流式处理的方法,将上下线时间分别打上flag标记,上线为1,下线为-1,并union all到一起(t1)。然后开窗按时间字段排序,计算第一行到当前行flag的和,最大的即为高峰期在线人数(t2)。然后取出最大值,可能不止一个(t3)。在t2表的基础上,求持续时间,因为t2表中上线下线时间点在同一列,所以将当前行下一行的时间(下线时间)移上来,就可以得到最高峰的时间段t4。然后用t3得出的在线人数高峰值union all上t4的在线人数高峰值,即可得到高峰期在线人数以及高峰期的持续时间。
3.具体实现:
3.1 按照上下闲时间将数据打上标记(拼接一个字段),取名t1表
select id,stt dt,1 flag from test1
union all
select id,edt dt,-1 flag from test1;t1
运行结果:
+---------+----------------------+-----------+
| _u1.id | _u1.dt | _u1.flag |
+---------+----------------------+-----------+
| 1001 | 2022-05-20 15:18:16 | -1 |
| 1002 | 2022-05-20 16:18:17 | -1 |
| 1003 | 2022-05-20 16:18:16 | -1 |
| 1004 | 2022-05-20 18:18:19 | -1 |
| 1005 | 2022-05-20 17:20:16 | -1 |
| 1001 | 2022-05-20 16:18:16 | -1 |
| 1006 | 2022-05-20 19:18:16 | -1 |
| 1001 | 2022-05-20 12:12:12 | 1 |
| 1002 | 2022-05-20 12:13:14 | 1 |
| 1003 | 2022-05-20 13:18:12 | 1 |
| 1004 | 2022-05-20 14:12:12 | 1 |
| 1005 | 2022-05-20 15:12:12 | 1 |
| 1001 | 2022-05-20 15:50:12 | 1 |
| 1006 | 2022-05-20 16:12:12 | 1 |
+---------+----------------------+-----------3.2 按dt字段排序开窗,计算flag到当前行的和,即为高峰期同时在线人数
select
dt,
sum(flag) over(order by dt) ps
from
(
select id,stt dt,1 flag from test1
union all
select id,edt dt,-1 flag from test1
)t1;运行结果:
+----------------------+-----+
| dt | ps |
+----------------------+-----+
| 2022-05-20 12:12:12 | 1 |
| 2022-05-20 12:13:14 | 2 |
| 2022-05-20 13:18:12 | 3 |
| 2022-05-20 14:12:12 | 4 |
| 2022-05-20 15:12:12 | 5 |
| 2022-05-20 15:18:16 | 4 |
| 2022-05-20 15:50:12 | 5 |
| 2022-05-20 16:12:12 | 6 |
| 2022-05-20 16:18:16 | 4 |
| 2022-05-20 16:18:16 | 4 |
| 2022-05-20 16:18:17 | 3 |
| 2022-05-20 17:20:16 | 2 |
| 2022-05-20 18:18:19 | 1 |
| 2022-05-20 19:18:16 | 0 |
+----------------------+-----+3.3 取出高峰时期最高人数,就是6
select
max(ps) ps
from
(
select
dt,
sum(flag) over(order by dt) ps
from
(
select id,stt dt,1 flag from test1
union all
select id,edt dt,-1 flag from test1
)t1
)t2;3.4 因为要获取高峰期的持续事件段,所以需要得到高峰期下降之后的第一个事件段,
也就是高峰期事件段的的下一行的时间,对每一行取下一行的时间进行拼接
select
ps,
dt,
lead(dt,1) over(order by dt) lead_dt
from
(
select
dt,
sum(flag) over(order by dt) ps
from
(
select id,stt dt,1 flag from test1
union all
select id,edt dt,-1 flag from test1
)t1
)t3;运行结果:
+-----+----------------------+----------------------+
| ps | dt | lead_dt |
+-----+----------------------+----------------------+
| 1 | 2022-05-20 12:12:12 | 2022-05-20 12:13:14 |
| 2 | 2022-05-20 12:13:14 | 2022-05-20 13:18:12 |
| 3 | 2022-05-20 13:18:12 | 2022-05-20 14:12:12 |
| 4 | 2022-05-20 14:12:12 | 2022-05-20 15:12:12 |
| 5 | 2022-05-20 15:12:12 | 2022-05-20 15:18:16 |
| 4 | 2022-05-20 15:18:16 | 2022-05-20 15:50:12 |
| 5 | 2022-05-20 15:50:12 | 2022-05-20 16:12:12 |
| 6 | 2022-05-20 16:12:12 | 2022-05-20 16:18:16 |
| 4 | 2022-05-20 16:18:16 | 2022-05-20 16:18:16 |
| 4 | 2022-05-20 16:18:16 | 2022-05-20 16:18:17 |
| 3 | 2022-05-20 16:18:17 | 2022-05-20 17:20:16 |
| 2 | 2022-05-20 17:20:16 | 2022-05-20 18:18:19 |
| 1 | 2022-05-20 18:18:19 | 2022-05-20 19:18:16 |
| 0 | 2022-05-20 19:18:16 | NULL |
+-----+----------------------+----------------------+3.5 用t3查到的最高人数人数与t4进行join 即可得到高峰期人数及持续时间段
select
t3.ps,
t4.dt dtt,
t4.lead_dt edt
from
(
select
max(ps) ps
from
(
select
dt,
sum(flag) over(order by dt) ps
from
(
select id,stt dt,1 flag from test1
union all
select id,edt dt,-1 flag from test1
)t1
)t2
)t3
join
(
select
ps,
dt,
lead(dt,1) over(order by dt) lead_dt
from
(
select
dt,
sum(flag) over(order by dt) ps
from
(
select id,stt dt,1 flag from test1
union all
select id,edt dt,-1 flag from test1
)t1
)t3
)t4
on t3.ps=t4.ps;最终结果:
+--------+----------------------+----------------------+
| t3.ps | dtt | edt |
+--------+----------------------+----------------------+
| 6 | 2022-05-20 16:12:12 | 2022-05-20 16:18:16 |
+--------+----------------------+----------------------+总结:我个人认为难点在于第一步,因为要求的事件段内同时在线人数高峰,不是仅限于从每个人计算,所以不必盯着每个人的上下线时间思考,而应该整体去思考。将所有人的上下线时间放在一起排个序,上线一个就+1,下线一个就-1,最终计算第一行到当前行在线人数的和,即可得到答案。
版权声明:本文为qq_50408152原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。