hive 参数优化 – 源码巴士

运行速度快的参数设置

-- Hive session settings aimed at faster job execution.

-- Dynamic partitioning.
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;

-- Target database.
use adm;

-- Caps on how many tasks a single job may run concurrently
-- (0 or a negative value means no limit).
SET mapreduce.job.running.reduce.limit = 800;
SET mapreduce.job.running.map.limit = 1000;

-- Fraction of map tasks that must complete before resources are requested
-- for reduce tasks; 1.0 waits for all of them.
SET mapreduce.job.reduce.slowstart.completedmaps = 1.0;

-- Compress the final job output with the LZO (lzop) codec.
SET mapred.output.compress = true;
SET hive.exec.compress.output = true;
SET mapred.output.compression.codec = com.hadoop.compression.lzo.LzopCodec;
-- NOTE(review): this assigns a single codec class to io.compression.codecs;
-- confirm no other codec is needed by jobs in this session.
SET io.compression.codecs = com.hadoop.compression.lzo.LzopCodec;

-- Merge small output files: triggered when the average output file size is
-- below the threshold (134217728 = 128 * 1024 * 1024).
SET hive.merge.smallfiles.avgsize = 134217728;
SET hive.merge.mapfiles = true;
SET hive.merge.mapredfiles = true;

-- Automatic map-join conversion and its small-table size threshold.
SET hive.auto.convert.join = true;
SET hive.mapjoin.smalltable.filesize = 250000000;

-- Let independent jobs of the same query run at the same time.
SET hive.exec.parallel = true;

-- Amount of input handled per reducer; drives the reducer count.
SET hive.exec.reducers.bytes.per.reducer = 100000000;

-- GROUP BY optimization.
SET hive.groupby.skewindata = true;

-- Map task resources.
-- JVM flags must be written without a space after the dash ("-Xmx...", not "- Xmx...").
-- NOTE(review): the map heap (-Xmx6144M) equals the 6144 MB container size, which
-- leaves no headroom for non-heap memory; confirm this is intended.
SET mapreduce.map.memory.mb = 6144;
SET mapreduce.map.java.opts = -Xmx6144M;
SET mapreduce.map.cpu.vcores = 4;

-- Reduce task resources.
SET mapreduce.reduce.memory.mb = 8192;
SET mapreduce.reduce.java.opts = -Xmx6144M;
SET mapreduce.reduce.cpu.vcores = 8;

-- MapReduce ApplicationMaster resources.
SET yarn.app.mapreduce.am.resource.cpu-vcores = 6;
SET yarn.app.mapreduce.am.resource.mb = 8192;
SET yarn.app.mapreduce.am.command-opts = -Xmx6144m;

-- Sort buffer size; larger values avoid or reduce spills.
SET mapreduce.task.io.sort.mb = 1024;

讲解：

-- Enable dynamic partitioning.
-- NOTE(review): nonstrict mode is understood to lift the requirement for at least
-- one static partition column — confirm against the Hive documentation.

SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
-- Maximum number of reduce tasks a single job may run concurrently;
-- 0 or a negative value means no limit.
SET mapreduce.job.running.reduce.limit = 800;

-- Maximum number of map tasks a single job may run concurrently;
-- 0 or a negative value means no limit.
SET mapreduce.job.running.map.limit = 1000;

-- reduceSlowStart is read from mapreduce.job.reduce.slowstart.completedmaps
-- (default 0.05, per the original notes). It is the fraction of map tasks that
-- must have completed before resources are requested for reduce tasks, so 1.0
-- waits for every map task to finish first.
SET mapreduce.job.reduce.slowstart.completedmaps = 1.0;

1、当输出数据量较大时，可以使用Hadoop提供的压缩机制对数据进行压缩，减少网络传输带宽以及存储消耗。

2、可以指定对map的输出结果进行压缩，也就是整个mapreduce中间过程进行压缩，也可以指定对reduce的输出结果也就是最终结果进行压缩。

3、其中对map输出进行压缩主要是为了减少shuffle过程中网络传输的数据量，而对reduce输出进行压缩主要是为了减少输出结果占用的HDFS存储。

-- Compress the final (reduce-stage) job output.
SET mapred.output.compress = true;
SET hive.exec.compress.output = true;
-- Codec used for the compressed output: LZO (lzop).
SET mapred.output.compression.codec = com.hadoop.compression.lzo.LzopCodec ;
-- NOTE(review): this assigns a single codec class to io.compression.codecs;
-- confirm no other codec is needed by jobs in this session.
SET io.compression.codecs = com.hadoop.compression.lzo.LzopCodec ;

案例：

# NOTE(review): this is a fragment of a longer command line (the launching command
# is not shown). The comments and blank lines between the options would break the
# backslash continuations if the snippet were pasted verbatim.
-jobconf  "mapred.compress.map.output=true" \

# Compress the data produced by the map stage.

-jobconf  "mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec" \

# Compression format for the map-stage output; Gzip is a common choice.

-jobconf  "mapred.output.compress=true" \

# Compress the final output of the reduce stage.

-jobconf  "mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec" \

# The final output is likewise compressed as Gzip.

-- If the average size of the files originally produced is below this value,
-- small-file merging is turned on. Example: 100 output files totalling 1 GB
-- average about 10 MB each; with this parameter at 16 MB, merging would kick in.
-- The value set here is 134217728 (= 128 * 1024 * 1024, i.e. 128 MB if the unit is bytes).

SET hive.merge.smallfiles.avgsize = 134217728;

--Hive内置提供的优化机制之一就包括MapJoin。
在Hive v0.7之前，需要给出MapJoin的指示（hint），Hive才会提供MapJoin的优化。Hive v0.7之后的版本已经不需要给出MapJoin的指示就能进行优化，它是通过如下配置参数来控制的：hive> set hive.auto.convert.join=true; Hive 0.11之后，在表的大小符合设置时（hive.auto.convert.join.noconditionaltask=true，hive.auto.convert.join.noconditionaltask.size=10000，hive.mapjoin.smalltable.filesize=25000000），默认会把join转换为map join（默认 hive.ignore.mapjoin.hint 为true，hive.auto.convert.join 为true）。不过Hive 0.11的map join bug比较多，可以默认关闭map join的自动转换，在需要时再通过hint指定：hive.auto.convert.join=false，hive.ignore.mapjoin.hint=false。Hive v0.12.0版本，缺省状况下MapJoin优化是打开的，也就是hive.auto.convert.join=true。Hive还提供另外一个参数——表文件的大小，作为开启和关闭MapJoin的阈值：hive.mapjoin.smalltable.filesize=25000000

-- Let Hive convert joins to map joins automatically.
SET hive.auto.convert.join = true;
-- Table file size threshold for switching map join on or off. The accompanying
-- notes quote 25000000; the value set here is 250000000 (ten times that).
SET hive.mapjoin.smalltable.filesize = 250000000;

-- hive.merge.mapfiles: when true, map output is merged.
-- hive.merge.mapredfiles: when true, reduce output is merged.
SET hive.merge.mapfiles = true;
SET hive.merge.mapredfiles = true;

-- hive.exec.parallel controls whether different jobs within the same SQL
-- statement may run at the same time; the default is false.
SET hive.exec.parallel = true ;

-- hive.exec.reducers.bytes.per.reducer:

-- the amount of data handled by each reducer.

-- Per the original notes the default is 1G, so a 10G input would start 10 reducers.
-- The value set here (100000000) is one tenth of 1,000,000,000.

SET hive.exec.reducers.bytes.per.reducer = 100000000 ;

-- hive.groupby.skewindata=true: an optimization aimed at GROUP BY.
-- NOTE(review): presumably intended for skewed group-by keys, as the name suggests — confirm.
SET hive.groupby.skewindata = true;

-- Frequent GC (heading from the original notes; presumably the memory settings
-- in this group are meant to address it).

-- MapReduce ApplicationMaster resource settings.
SET yarn.app.mapreduce.am.resource.cpu-vcores = 6;
SET yarn.app.mapreduce.am.resource.mb = 8192;
-- The JVM flag must be written without a space after the dash:
-- "- Xmx6144m" is not a valid heap option.
SET yarn.app.mapreduce.am.command-opts = -Xmx6144m;

-- Avoid or reduce the number of buffer spills.
SET mapreduce.task.io.sort.mb = 1024;

原文链接：https://blog.csdn.net/zhuiqiuuuu/article/details/118032454