Hive中inline(array(struct))直接解析struct结构中的内容

从struct 类型的 named_struct_1字段 解析其中内容:page_id, page_name, log_date

create table tmp_user_scan_info_struct_mid stored as orc as
select 
    uid,
    named_struct('page_id', page_id, 
                 'page_name', page_name, 
                 'log_date', log_date
                 ) as named_struct_1
from tmp_user_scan_info;

select * from tmp_user_scan_info_struct_mid;
+------+---------------------------------------------------------+--+
| uid  |                     named_struct_1                      |
+------+---------------------------------------------------------+--+
| 0    | {"page_id":10,"page_name":"A","log_date":"2015-01-01"}  |
| 0    | {"page_id":20,"page_name":"B","log_date":"2015-01-02"}  |
| 1    | {"page_id":10,"page_name":"A","log_date":"2016-02-01"}  |
| 1    | {"page_id":20,"page_name":"B","log_date":"2016-02-02"}  |
+------+---------------------------------------------------------+--+
select
    uid,
    page_id,
    page_name,
    log_date
from tmp_user_scan_info_struct_mid t
lateral view outer inline(array(named_struct_1)) tmp as page_id, page_name, log_date;

select inline(array(named_struct_1)) from tmp_user_scan_info_struct_mid;
+------------+----------+-------------+--+
| page_name  | page_id  |  log_date   |
+------------+----------+-------------+--+
| A          | 10       | 2015-01-01  |
| B          | 20       | 2016-02-02  |
+------------+----------+-------------+--+

-------------------- 支持多列别名(多字段名)---------------
select inline( array(struct('A',10,date '2015-01-01'),
                     struct('B',20,date '2016-02-02')
                     ) 
             ) as (page_name, page_id, log_date);
+------------+----------+-------------+--+
| page_name  | page_id  |  log_date   |
+------------+----------+-------------+--+
| A          | 10       | 2015-01-01  |
| B          | 20       | 2016-02-02  |
+------------+----------+-------------+--+
--数据准备 ----------------------------------------------
create table tmp_user_scan_info stored as orc as
select 0 as uid, 10 as page_id, 'A' as page_name, '2015-01-01' as log_date union all
select 0 as uid, 20 as page_id, 'B' as page_name, '2015-01-02' as log_date union all
select 1 as uid, 10 as page_id, 'A' as page_name, '2016-02-01' as log_date union all
select 1 as uid, 20 as page_id, 'B' as page_name, '2016-02-02' as log_date;

select * from tmp_user_scan_info;
+------+----------+------------+-------------+--+
| uid  | page_id  | page_name  |  log_date   |
+------+----------+------------+-------------+--+
| 0    | 10       | A          | 2015-01-01  |
| 0    | 20       | B          | 2015-01-02  |
| 1    | 10       | A          | 2016-02-01  |
| 1    | 20       | B          | 2016-02-02  |
+------+----------+------------+-------------+--+

----- 多行转多列 ---------
select
 uid,
 concat_ws(',', 
           collect_list(concat_ws('|', array(page_id, page_name, log_date)))
           ) as extend_info
from tmp_user_scan_info
group by uid;
+------+----------------------------------+--+
| uid  |           extend_info            |
+------+----------------------------------+--+
| 0    | 10|A|2015-01-01,20|B|2015-01-02  |
| 1    | 10|A|2016-02-01,20|B|2016-02-02  |
+------+----------------------------------+--+

select
    uid,
    concat_ws(',', collect_set( concat_ws('|',
                                          cast(page_id as string), 
                                          page_name, 
                                          log_date
                                         )
                               )
             ) as pageId_vs_pageName_vs_logDate
from tmp_user_scan_info_struct_mid t
lateral view outer inline(array(named_struct_1)) tmp as page_id, page_name, log_date
group by uid;
+------+----------------------------------+--+
| uid  |  pageid_vs_pagename_vs_logdate   |
+------+----------------------------------+--+
| 0    | 10|A|2015-01-01,20|B|2015-01-02  |
| 1    | 10|A|2016-02-01,20|B|2016-02-02  |
+------+----------------------------------+--+

版权声明:本文为JnYoung原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。