SQL之存在性问题分析-HQL面试题39_石榴姐yyds-CSDN博客_sql 石榴姐
存在性问题。求截止当前月退费总人数
退费人数:上月存在,这月不存在的学生个数。
表stu中字段day日期,stu_id学生id。
求每个月缴费名单,每个月分别合并stu_id。
按月份排序,获取下一月合并后的值。
当前的stu_id是否在下一月array数组中,如果存在标记0,不存在标记1。
对flag按月求和即为下一个月退费人数。
再将上一个月的退费人数获取到当前行,为每月的退费人数。
进行累加计算截止到当前月的退费总人数。
--
select
month,
sum(lag_month_cnt) over(order by month)
from(
select
month,
lag(next_month_cnt, 1, 0) over(order by month) as lag_month_cnt
from(
select
distinct t0.month as month,
sum(if(!array_contains(t1.lead_stu_id_arr,t0.stu_id),1,0)) over(partition by t0.month) as next_month_cnt
from(
select
substr(day,1,7) as month,
stu_id
from
stu
) t0
left join(
select
month,
lead(stu_id_arr, 1) over(order by month) as lead_stu_id_arr
from(
select
substr(day, 1, 7) as month,
collect_list(stu_id) as stu_id_arr
from
stu
group by
substr(day, 1, 7)
) m
) t1 on t0.month = t1.month
) n
) o21、获取积分最多的人
SQL之获取积分最多的人--HQL面试题37_石榴姐yyds-CSDN博客_获得积分最多的人
查找积分最高的用户的id、名字、总积分。
用户表user:id,name
积分表grade_info:name,grade_num,type
type为add,积分是增加grade_num。type为reduce,积分是减少grade_num。
select t0.user_id as user_id,
t1.name as name,
t0.grade as grade
from(
select user_id,grade
from(
select user_id,
grade,
dense_rank() over(order by grade desc) as rn
from(
select user_id,sum(if(type='add',grade_num,-1*grade_num)) as grade
from grade_info
group by user_id
) m
) n
where rn=1
) t0
left join user t1
on t0.user_id = t1.id22、用户中两人一定认识的组合数
SQL之用户中两人一定认识的组合数--HQL面试题36【快手数仓面试题】_石榴姐yyds-CSDN博客
城市网吧访问数据,字段:网吧wid,访客uid,上线时间,下线时间。
规则1,如果有两个用户在一家网吧的前后上下线时间在10分钟以内,则两人可能认识。
规则2,如果这两个用户在三家以上网吧出现【规则1】的情况,则两人一定认识。
统计该城市上网用户中两人一定认识的组合数。
select sum(flag) as cnt
from(
select uuid,if(count(wid)>=3,1,0) as flag
from(
select t0.wid as wid,concat_ws('',t0.uid,t1.uid) as uuid
from table0 as t0
join table0 as t1
where t0.wid=t1.wid
and (abs(unix_timestamp(t0.ontime,'yyyy-MM-dd HH:mm:ss')-unix_timestamp(t1.ontime,'yyyy-MM-dd HH:mm:ss'))<600 or abs(unix_timestamp(t0.offtime,'yyyy-MM-dd HH:mm:ss')-unix_timestamp(t1.offtime,'yyyy-MM-dd HH:mm:ss'))<600 )
and t0.uid > t1.uid
) t
group by uuid
) m23、N天窗口内趋势异常分析
一种基于批处理的N天窗口内趋势异常分析【物联网大数据应用】_石榴姐yyds-CSDN博客
24、连续增长问题
SQL连续增长问题--HQL面试题35_石榴姐yyds-CSDN博客_sql连续增长问题
假设我们有一张订单表shop_order,字段shop_id,order_time,order_amt。
查找至少连续3天销售金额连续增长的商户shop_id。
with tmp as(
select
shop_id,to_date(order_time) as order_date,sum(order_amt) as order_amt
from
shop_order
group by
shop_id,to_date(order_time)
)
select shop_id
from(
select *
,sum(case when order_amt_diff<=0 then 1 else 0 end) over(partition by shop_id,flag1 order by order_date) flag2 --增长性分组条件
from(
select *
,sum(if(order_date_diff>1,1,0)) over(partition by shop_id order by order_date) as flag1 --连续性分组条件
from(
select shop_id
,order_date
,order_amt
,datediff(order_date,lag(order_date,1,date_sub(order_date,1)) over(partition by shop_id order by order_date)) as order_date_diff --用于判断是否按天连续
,order_amt - lag(order_amt,1,0) over(partition by shop_id order by order_date) as order_amt_diff --判断是否增长
from tmp
) t
) m
) n
group by shop_id,flag1,flag2
having count(1) >=325、连续性问题
经典连续性问题--数学思维在SQL编程中的应用【HiveSQL面试题3】_石榴姐yyds-CSDN博客
求连续3天登录用户。
表login:字段user_id,字段login_date。
select t.user_id
from(
select user_id
,datediff(login_date,lag(login_date,2,-1) over(partition by user_id order by login_date)) as diff
from login
) t
where diff = 2
group by t.user_id26、间隔连续问题【重分组思想】
SQL之间隔连续问题【重分组思想】--HiveSQL面试题32_石榴姐yyds-CSDN博客
计算每个用户最大的连续登录天数,可以间隔一天。
如果一个用户在 1,3,5,6 登录游戏,则视为连续 6 天登录。
select id
,max(con_diff)
from(
select id
,groupid
,datediff(max(dt),min(dt))+1 as con_diff
from(
select
id,
dt,
sum(if(dt_diff>2,1,0)) over(partition by id order by dt) groupid
from(
select
id,
dt,
datediff(dt,lag(dt,1,dt) over(partition by id order by dt)) as dt_diff
from login
) t
) m
group by id,groupid
) n
group by id27、重分组算法实践【累加器思想】
SQL之重分组算法实践【累加器思想】--Hivesql面试题31_石榴姐yyds-CSDN博客
某个用户相邻的访问记录如果时间间隔小于 60 秒,则分为同一个组
select id,
ts,
sum(if(ts_diff>=60,1,0)) over(partition by id order by ts) as groupid
from(
select
id,
ts,
ts-lag(ts,1,0) over(partition by id order by ts) as ts_diff
from visit
)t28、出场排名top3的英雄出场次数及出场率
SQL英雄出场排名top3的出场次数及出场率问题分析--HiveSQL面试题26_石榴姐yyds-CSDN博客
表heros:id出场编号,names每场出场的一些英雄名字如aa,bb,cc
select name
,cnt
,ratio
from(
select name
,cnt
,cast(cnt/sum(cnt) over() as decimal(3,1)) as ratio
,dense_rank() over(order by cnt desc) as dr
from(
select name
,count(1) as cnt
from heros lateral view explode(split(names,',')) t as name
group by name
) m
) n
where dr<=329、嵌套window子句的应用
SQL条件判断语句嵌套window子句的应用【易错点】--HiveSql面试题25_石榴姐yyds-CSDN博客
表字段:用户user_id,商品good_name,商品类型goods_type,自然排序rk。其中goods_type中值为26代表广告。
获取每个用户非广告类型的商品位置自然排序。
case when是在窗口函数之后执行的。以下语句不是正确结果。
select
user_id,
goods_name,
goods_type,
rk,
case when goods_type!=26 then row_number() over(partition by user_id order by rk) else null end as naturl_rank
from window_goods_test窗口函数partition by子句中可以嵌套条件判断语句。
select user_id
,goods_name
,goods_type
,rk
,if(goods_type!=26,row_number() over(partition by if(goods_type!=26,user_id,rand()) order by rk),null) naturl_rank
from window_goods_test
order by rk
--
此处partition by分组中if 语句中的else后不置为NULL而是随机数,是因为如果置为NULL,goods_type=26的数较多的情况下会被分到一组造成数据倾斜,因此采用了rand()函数。30、用户行为分析join的应用
HiveSQL面试题21--用户行为分析join的应用【小红书面试题】_石榴姐yyds-CSDN博客
用户订单表:用户id、购买的商品id、下单时间。
用户收藏商品表:用户id、收藏的商品id、收藏时间。
问题:查询所有用户的商品行为特征状况:已购买、购买未收藏、收藏未购买、收藏且购买。
select
distinct
case when m.user_id is not null then m.user_id else n.user_id end as user_id,
case when m.goods_id is not null then m.goods_id else n.goods_id end as goods_id,
case when m.user_id is not null and m.goods_id is not null then 1 else 0 end as alreadybuy,
case when (m.user_id is not null and m.goods_id is not null) and (n.user_id is null or n.goods_id is null) then 1 else 0 end as buy_not_collect,
case when (m.user_id is null or m.goods_id is null) and (n.user_id is not null and n.goods_id is not null) then 1 else 0 end as collect_not_buy,
case when (m.user_id is not null and m.goods_id is not null) and (n.user_id is not null and n.goods_id is not null) then 1 else 0 end as collect_and_buy
from userorder as m full join usercollect as n
on m.user_id = n.user_id and m.goods_id=n.goods_id31、平台圈主与用户映射关系问题
HiveSql面试题6--小打卡圈主与用户映射关系问题(SQL之join思维典型案例代表)_石榴姐yyds-CSDN博客
join思维典型案例
计算平台的圈主建了多少圈子,各有多少用户
tb_habit圈子表:master_id圈主,habit_id圈子
user_habit_relation用户与圈子关系表:habit_id圈子,user_id用户
select c.master_id
,count(distinct c.a_habit_id)
,count(distinct c.user_id)
from(
select a.master_id as master_id
,a.habit_id as a_habit_id
,b.habit_id as b_habit_id
,b.user_id as user_id
from tb_habit a left join user_habit_relation b
on a.habit_id = b.habit_id
) c
group by c.master_id留存率
SQL实现次日、三日及七日用户留存率的计算_蕾欧娜与26的博客-CSDN博客_sql 次日留存率
select
log_day '日期',
count(user_id_d0) '新增数量',
count(user_id_d1) / count(user_id_d0) '次日留存率',
count(user_id_d3) / count(user_id_d0) '3日留存率',
count(user_id_d7) / count(user_id_d0) '7日留存率',
from (
select
distinct log_day,
a.user_id_d0,
b.device_id as user_id_d1,
c.device_id as user_id_d3,
d.device_id as user_id_d7
from (
select
distinct date(time) as log_day,
user_id as user_id_d0
from login
group by user_id
order by log_day) a
left join login b
on a.user_id_d0 = b.user_id
and datediff(date(b.time),a.log_day) = 1
left join login c
on a.user_id_d0 = c.user_id
and datediff(date(c.time),a.log_day) = 2
left join login d
on a.user_id_d0 = d.user_id
and datediff(date(d.time),a.log_day) = 6
) t
group by log_day;