Hive DML数据操作 – 源码巴士

1.向表中装载数据

//创建表
create table student(id string, name string) row format delimited fields terminated by '\t';

//1.加载本地文件（linux本地）到hive,（本质是拷贝一份到hive上）,put可以实现文件上传的功能,上传后的文件执行count(*),put上传的文件会出错,而load上传的数据不会出错
load data local inpath '/opt/module/hive/datas/student.txt' into table student;


//2.从HDFS加载文件到hive（本质上也就是在hdfs上从执行mv操作）
//首先将文件从本地上传到hdfs,有两种方式:
在shell命令下:hadoop fs -put /opt/module/hive/data/student.txt  
/user/zhang/hive;
进入hive:hive (default)> dfs -put /opt/module/hive/student.txt /temp;
//其次在hdfs上将文件添加进hive表
load  data inpath '/user/zhang/hive/student.txt' into table student;//相当于在hdfs上执行mv操作

//3.加载数据覆盖表中已有的数据
hadoop fs -put /opt/module/hive/data/student.txt  /user/zhang/hive;
load data inpath '/user/zhang/hive/student.txt' overwrite into table student;

第二部分重点:
2. 通过查询语句向表中插入数据（Insert)

//创建一张表
create table student_par(id int, name string) row format delimited fields terminated by '\t';
//基本插入数据,追加的方式不会覆盖原表中数据
insert into table student values (1001,'chenweiting'),(1002,'wulei');
//覆盖原表中的数据
insert overwrite table student2 select id,name from student；

//案例
//9追加到末尾)
insert into table student2 select * from student;
//查询一张表中所需要的数据并将其添加到另一张表中(覆盖原文件)
insert overwrite table student2 select * from student;
//创建相同表导入相同数据
create table if not exists student2 as select id,name from student;
//创建相同的表结构,表内没有数据
create table if not exists student2 like student;

3.数据导出

//将hive表中的查询数据格式化导出到linux本地
insert overwrite local directory '/opt/module/hive/student' 
row format delimited fields terminated by ','
select * from student;

//将查询结果导出到HDFS上(没有local)其实,本质操作相当于在hdfs上执行cp操作,
//cp不走mapreduce执行速度更快,因此该方法并不常用,可以直接用cp操作代替
insert overwrite directory '/student'
row format delimited fields terminated by ','
select * from student;

//hadoop命令导入到本地
hive (default)> dfs -get /user/hive/warehouse/student2/* /opt/module/hive/student;
//hive shell命令导出,根据查询的结果将其写入到指定的文件
[zhang@hadoop102 hive]$ bin/hive -e 'select * from student' > /opt/module/hive/student2.txt
//复习一个小知识点 :> 代表覆盖, >>代表追加

//export/import 操作 
//export导出到HDFS上,export完之后,去studentlike目录下查看,发现连元数据也一起导出
export table student to '/studentlike';
//使用import的时候,只能导入export导出的表,要导入的表不能存在
import table studentimport from '/studentlike';
//清楚表中的数据,但是表还存在(只能删除管理表,不能删除外部表中的数据),
truncate table student;

3.查询
在/opt/module目录下新建一个目录,datas.创建以下两张部门文件dept.txt和员工文件emp.txt.
(1)原始数据:

dept:
10	ACCOUNTING	1700 
20	RESEARCH	1800 
30	SALES	1900
40	OPERATIONS	1700

emp:

7369	MITH	CLERK	7902	1980-12-17	800.00		20
7499	ALLEN	SALESMAN	7698	1981-2-20	1600.00	300.00	30
7521	WARD	SALESMAN	7698	1981-2-22	1250.00	500.00	30
7566	JONES	MANAGER	7839	1981-4-2	2975.00		20
7654	MARTIN	SALESMAN	7698	1981-9-28	1250.00	1400.00	30
7698	BLAKE	MANAGER	7839	1981-5-1	2850.00		30
7782	CLARK	MANAGER	7839	1981-6-9	2450.00		10
7788	SCOTT	ANALYST	7566	1987-4-19	3000.00		20
7839	KING	PRESIDENT		1981-11-17	5000.00		10
7844	TURNER	SALESMAN	7698	1981-9-8	1500.00		30
7876	ADAMS	CLERK	7788	1987-5-23	1100.00		20
7900	JAMES	CLERK	7698	1981-12-3	950.00		30
7902	FORD	ANALYST	7566	1981-12-3	3000.00		20
7934	MILLER	CLERK	7782	1982-1-23	1300.00		10

(1) 创建部门表

create table if not exists dept( deptno int,
dname string, loc int
)
row format delimited fields terminated by '\t';

(2)创建员工表

create table if not exists emp( empno int,
ename string, job string, mgr int,
hiredate string, sal double,
comm double, deptno int)
row format delimited fields terminated by '\t';

(3)导入数据

load data local inpath '/opt/module/datas/dept.txt' into table dept;
load data local inpath '/opt/module/datas/emp.txt' into table emp;

之后就就可以执行常见查询

hive (default)> select * from emp;
hive (default)> select empno,ename,job,mgr,hiredate,sal,comm,deptno from emp ;

注意:
（1） SQL 语言大小写不敏感。

（2） SQL 可以写在一行或者多行

（3）关键字不能被缩写也不能分行

（4）各子句一般要分行写。

1）	求总行数（count）
hive (default)> select count(*) cnt from emp;	
2）	求工资的最大值（max）
hive (default)> select max(sal) max_sal from emp;	
3）	求工资的最小值（min）
hive (default)> select min(sal) min_sal from emp;	
4）	求工资的总和（sum）
hive (default)> select sum(sal) sum_sal from emp;	
5）	求工资的平均值（avg）
hive (default)> select avg(sal) avg_sal from emp;

Limit语句:

典型的查询会返回多行数据。LIMIT 子句用于限制返回的行数。

hive (default)> select * from emp limit 5;
hive (default)> select * from emp limit 2;

一组实操的案例:

（1）	查询出薪水等于 5000 的所有员工
hive (default)> select * from emp where sal =5000;
（2）	查询工资在 500 到 1000 的员工信息(包括边界值500和1000)
hive (default)> select * from emp where sal between 500 and 1000;
（3）	查询 comm 为空的所有员工信息
hive (default)> select * from emp where comm is null;
（4）	查询工资是 1500 或 5000 的员工信息(这里只有1500,5000两个值)
hive (default)> select * from emp where sal IN (1500, 5000);

Like 和 RLike

使用 LIKE 运算选择类似的值

（1）	查找名字以A 开头的员工信息
hive (default)> select * from emp where ename LIKE 'A%';
（2）	查找名字中第二个字母为A 的员工信息
hive (default)> select * from emp where ename LIKE '_A%';
（3）	查找名字中带有A 的员工信息
hive (default)> select * from emp where ename RLIKE '[A]';

逻辑运算符（And/Or/Not）

（1）	查询薪水大于 1000，部门是 30
hive (default)> select * from emp where sal>1000 and deptno=30;
（2）	查询薪水大于 1000，或者部门是 30
hive (default)> select * from emp where sal>1000 or deptno=30;	
（3）	查询除了 20 部门和 30 部门以外的员工信息
hive (default)> select * from emp where deptno not IN(30, 20);

Group By 语句

（1）计算 emp 表每个部门的平均工资

hive (default)> select deptno,avg(sal) from emp  group by deptno;

（2）计算 emp 每个部门中每个岗位的最高薪水

hive (default)> select deptno,job,max(sal) from emp group by deptno,job;

原文链接：https://blog.csdn.net/weixin_43586713/article/details/116158354