UDTF介绍
UDTF(User-Defined Table-Generating Functions) 用来解决 输入一行输出多行(one-to-many mapping) 的需求。
编写UDTF步骤
继承org.apache.hadoop.hive.ql.udf.generic.GenericUDTF,实现initialize, process, close三个方法。
- UDTF首先会调用initialize方法,此方法返回UDTF的返回行的信息(返回个数,类型)。
- 初始化完成后,会调用process方法,真正的处理过程在process函数中,在process中,每一次forward()调用产生一行;如果产生多列可以将多个列的值放在一个数组中,然后将该数组传入到forward()函数。
- 最后调用close()方法,对需要清理的资源进行清理。
注:在hive-0.13.0之后,initialize(ObjectInspector[])这个重载方法已被标记为弃用,推荐改为重写initialize(StructObjectInspector)。
UDTF代码实现实例
package UDTF;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
 * Hive UDTF that splits a house-listing description on a caller-supplied separator and
 * emits it as a single row of seven string columns.
 *
 * <p>Two input layouts are recognized, distinguished by the number of tokens:
 * <ul>
 *   <li>3 tokens, e.g. {@code 1室 整租20㎡ 共2层}</li>
 *   <li>6 tokens, e.g. {@code 4室2厅2卫 次卧153㎡ 向北 低层/共6层 中等装修 普通住宅}</li>
 * </ul>
 *
 * <p>Output columns, in order: {@code hou_model} (layout), {@code size} (area),
 * {@code direction} (orientation), {@code floor}, {@code fitm_status} (decoration status),
 * {@code house_class} (housing type), {@code rent} (rental mode). Columns that a layout
 * does not provide are emitted as empty strings.
 *
 * @author leen 2017-02-16
 */
public class UdtfSplitStringToStrings extends GenericUDTF {

    /** Names of the output columns, in the order they are emitted. */
    private static final String[] FIELD_NAMES = {
        "hou_model", "size", "direction", "floor", "fitm_status", "house_class", "rent"
    };

    /**
     * Matches a run of CJK characters (captured as the rental mode) immediately followed by
     * digits (captured as the area). Compiled once instead of once per processed row.
     */
    private static final Pattern RENT_AND_SIZE = Pattern.compile("([\\u4e00-\\u9fa5]+)(\\d+)");

    /**
     * Validates the call arguments and declares the seven string output columns.
     *
     * <p>The article this class ships with notes that this overload is deprecated since
     * Hive 0.13.0; it is kept so existing registrations of the function continue to work.
     *
     * @param arg0 inspectors for the call arguments: the input string and the separator
     * @return a struct inspector describing the seven string output columns
     * @throws UDFArgumentException if the argument count is not two or an argument is not
     *         of a primitive category
     */
    @Override
    public StructObjectInspector initialize(ObjectInspector[] arg0) throws UDFArgumentException {
        // The function needs both the text to split and the separator to split it on.
        if (arg0.length != 2) {
            throw new UDFArgumentLengthException(
                "UdtfSplitStringToStrings takes exactly two arguments: the input string and the separator");
        }
        // Both arguments are consumed via toString() in process(), so both must be primitives.
        for (ObjectInspector argument : arg0) {
            if (argument.getCategory() != ObjectInspector.Category.PRIMITIVE) {
                throw new UDFArgumentException(
                    "UdtfSplitStringToStrings takes primitive (string) parameters");
            }
        }

        ArrayList<String> fieldNames = new ArrayList<String>(FIELD_NAMES.length);
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(FIELD_NAMES.length);
        for (String fieldName : FIELD_NAMES) {
            fieldNames.add(fieldName);
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        }
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Splits one input value and forwards at most one output row.
     *
     * <p>A row of seven empty strings is forwarded when the input or separator is null, when
     * the input splits into fewer than two tokens, or when the second token does not contain
     * CJK characters followed by digits. When the second token matches but the token count
     * is neither 3 nor 6, no row is forwarded.
     *
     * @param arg0 {@code arg0[0]} is the text to split; {@code arg0[1]} is the separator,
     *        which {@link String#split(String)} interprets as a regular expression
     * @throws HiveException if forwarding the row fails
     */
    @Override
    public void process(Object[] arg0) throws HiveException {
        // SQL NULLs arrive as Java nulls; treat them like any other unparseable input.
        if (arg0 == null || arg0.length < 2 || arg0[0] == null || arg0[1] == null) {
            forward(emptyRow());
            return;
        }

        String[] words = arg0[0].toString().split(arg0[1].toString());
        // Without a second token there is nothing to match the rent/size pattern against.
        if (words.length < 2) {
            forward(emptyRow());
            return;
        }

        Matcher m = RENT_AND_SIZE.matcher(words[1]);
        if (!m.find()) {
            forward(emptyRow());
            return;
        }

        String rent = m.group(1);
        String size = m.group(2) + "m²";
        if (words.length == 6) {
            forward(new String[] {words[0], size, words[2], words[3], words[4], words[5], rent});
        } else if (words.length == 3) {
            // The short layout carries no direction, decoration status or housing type.
            forward(new String[] {words[0], size, "", words[2], "", "", rent});
        }
        // Any other token count is an unrecognized layout: no row is emitted for it.
    }

    /**
     * Called once after the last row has been processed; this UDTF holds no resources.
     *
     * @throws HiveException never thrown by this implementation
     */
    @Override
    public void close() throws HiveException {
        // Nothing to release.
    }

    /** Builds the all-empty row forwarded for input that cannot be parsed. */
    private static String[] emptyRow() {
        return new String[] {"", "", "", "", "", "", ""};
    }
}
使用方式
UDTF有两种使用方法,一种直接放到select后面,一种和lateral view一起使用。
1.直接放到select后面
add jar /home/leen/etl/jars/UdtfSplitStringToStrings.jar;
CREATE TEMPORARY FUNCTION split_string AS 'UDTF.UdtfSplitStringToStrings';
> select split_string(str, ' ') as (col1, col2, col3, col4, col5, col6, col7) from test_tbl;
// 不可以添加其他字段使用;
// 不可以嵌套调用;
// 不可以和group by/cluster by/distribute by/sort by一起使用
2.和lateral view一起使用
add jar /home/leen/etl/jars/UdtfSplitStringToStrings.jar;
CREATE TEMPORARY FUNCTION split_string AS 'UDTF.UdtfSplitStringToStrings';
> select a.id, b.col1, b.col2 from test_tbl a lateral view split_string(a.str, ' ') b as col1, col2, col3, col4, col5, col6, col7;
//此方法更为方便日常使用。执行过程相当于单独执行了两次抽取,然后union到一个表里。
参考
https://cwiki.apache.org/confluence/display/Hive/HivePlugins
https://cwiki.apache.org/confluence/display/Hive/DeveloperGuide+UDTF
版权声明:本文为leen0304原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。