mapreduce针对n列orc文件的读写

建300列的ORC表,可以用Excel简单建一个300列,10000行的数据,复制成以tab分割的txt文件(文件名需与下文写ORC程序中的输入路径一致,此处为300.txt)
hdfs dfs -put 300.txt hdfs://hadoop:9000/tmp/input/

-- 300-column, all-string test table stored as ORC.
-- "if not exists" makes the script safe to re-run; data is written into the
-- table's warehouse directory by the OrcWriterMR job below.
create table if not exists test_orc_300 (
    c1 string, c2 string, c3 string, c4 string, c5 string, c6 string, c7 string, c8 string, c9 string, c10 string,
    c11 string, c12 string, c13 string, c14 string, c15 string, c16 string, c17 string, c18 string, c19 string, c20 string,
    c21 string, c22 string, c23 string, c24 string, c25 string, c26 string, c27 string, c28 string, c29 string, c30 string,
    c31 string, c32 string, c33 string, c34 string, c35 string, c36 string, c37 string, c38 string, c39 string, c40 string,
    c41 string, c42 string, c43 string, c44 string, c45 string, c46 string, c47 string, c48 string, c49 string, c50 string,
    c51 string, c52 string, c53 string, c54 string, c55 string, c56 string, c57 string, c58 string, c59 string, c60 string,
    c61 string, c62 string, c63 string, c64 string, c65 string, c66 string, c67 string, c68 string, c69 string, c70 string,
    c71 string, c72 string, c73 string, c74 string, c75 string, c76 string, c77 string, c78 string, c79 string, c80 string,
    c81 string, c82 string, c83 string, c84 string, c85 string, c86 string, c87 string, c88 string, c89 string, c90 string,
    c91 string, c92 string, c93 string, c94 string, c95 string, c96 string, c97 string, c98 string, c99 string, c100 string,
    c101 string, c102 string, c103 string, c104 string, c105 string, c106 string, c107 string, c108 string, c109 string, c110 string,
    c111 string, c112 string, c113 string, c114 string, c115 string, c116 string, c117 string, c118 string, c119 string, c120 string,
    c121 string, c122 string, c123 string, c124 string, c125 string, c126 string, c127 string, c128 string, c129 string, c130 string,
    c131 string, c132 string, c133 string, c134 string, c135 string, c136 string, c137 string, c138 string, c139 string, c140 string,
    c141 string, c142 string, c143 string, c144 string, c145 string, c146 string, c147 string, c148 string, c149 string, c150 string,
    c151 string, c152 string, c153 string, c154 string, c155 string, c156 string, c157 string, c158 string, c159 string, c160 string,
    c161 string, c162 string, c163 string, c164 string, c165 string, c166 string, c167 string, c168 string, c169 string, c170 string,
    c171 string, c172 string, c173 string, c174 string, c175 string, c176 string, c177 string, c178 string, c179 string, c180 string,
    c181 string, c182 string, c183 string, c184 string, c185 string, c186 string, c187 string, c188 string, c189 string, c190 string,
    c191 string, c192 string, c193 string, c194 string, c195 string, c196 string, c197 string, c198 string, c199 string, c200 string,
    c201 string, c202 string, c203 string, c204 string, c205 string, c206 string, c207 string, c208 string, c209 string, c210 string,
    c211 string, c212 string, c213 string, c214 string, c215 string, c216 string, c217 string, c218 string, c219 string, c220 string,
    c221 string, c222 string, c223 string, c224 string, c225 string, c226 string, c227 string, c228 string, c229 string, c230 string,
    c231 string, c232 string, c233 string, c234 string, c235 string, c236 string, c237 string, c238 string, c239 string, c240 string,
    c241 string, c242 string, c243 string, c244 string, c245 string, c246 string, c247 string, c248 string, c249 string, c250 string,
    c251 string, c252 string, c253 string, c254 string, c255 string, c256 string, c257 string, c258 string, c259 string, c260 string,
    c261 string, c262 string, c263 string, c264 string, c265 string, c266 string, c267 string, c268 string, c269 string, c270 string,
    c271 string, c272 string, c273 string, c274 string, c275 string, c276 string, c277 string, c278 string, c279 string, c280 string,
    c281 string, c282 string, c283 string, c284 string, c285 string, c286 string, c287 string, c288 string, c289 string, c290 string,
    c291 string, c292 string, c293 string, c294 string, c295 string, c296 string, c297 string, c298 string, c299 string, c300 string
) stored as orc;

ORC读文件

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcInputFormat;
import java.io.IOException;


public class OrcReaderMR {
    public static class OrcMap extends Mapper<NullWritable,OrcStruct,NullWritable,Text>{
        private Text text = new Text();
        public void map(NullWritable key, OrcStruct value,
                        Context output) throws IOException, InterruptedException {
            StringBuffer bf = new StringBuffer();
            for(int i=0;i<value.getNumFields();i++){
                WritableComparable fieldValue = value.getFieldValue(i);
                bf.append(fieldValue.toString()).append("\t");
            }
           text.set(bf.toString());
            output.write(NullWritable.get(),text);
            }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        conf.set("mapreduce.application.classpath", System.getProperty("user.dir"));
        System.setProperty("HADOOP_USER_NAME", "root");
        //设置开发环境变量
        System.setProperty("hadoop.home.dir", "/opt/hadoop-2.7.3/");

        Job job = Job.getInstance(conf);
        job.setJarByClass(OrcReaderMR.class);
        job.setJobName("OrcReaderMR");
        job.setMapperClass(OrcMap.class);
        job.setInputFormatClass(OrcInputFormat.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);

        // 指定该mapreduce程序数据的输入路径
        Path inputPath = new Path("/user/hive/warehouse/test_orc_300");

        // 指定该mapreduce程序数据的输出路径
        Path outputPath = new Path("/user/hive/warehouse/test_orc_300_out");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        boolean waitForCompletion = job.waitForCompletion(true);
        System.exit(waitForCompletion ? 0 : 1);
    }
}

运行完后查看文件内容

hdfs dfs -cat hdfs://hadoop:9000/user/hive/warehouse/test_orc_300_out/part-m-00000

ORC写文件

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.orc.OrcConf;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcOutputFormat;
import parquet.filter2.predicate.Operators;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class OrcWriterMR {
    public static class OrcWriterMapper
            extends Mapper<LongWritable,Text,NullWritable,OrcStruct> {


        private TypeDescription schema =
                TypeDescription.fromString("struct<c1:string,c2:string,c3:string,c4:string,c5:string,c6:string,c7:string,c8:string,c9:string,c10:string,c11:string,c12:string,c13:string,c14:string,c15:string,c16:string,c17:string,c18:string,c19:string,c20:string,c21:string,c22:string,c23:string,c24:string,c25:string,c26:string,c27:string,c28:string,c29:string,c30:string,c31:string,c32:string,c33:string,c34:string,c35:string,c36:string,c37:string,c38:string,c39:string,c40:string,c41:string,c42:string,c43:string,c44:string,c45:string,c46:string,c47:string,c48:string,c49:string,c50:string,c51:string,c52:string,c53:string,c54:string,c55:string,c56:string,c57:string,c58:string,c59:string,c60:string,c61:string,c62:string,c63:string,c64:string,c65:string,c66:string,c67:string,c68:string,c69:string,c70:string,c71:string,c72:string,c73:string,c74:string,c75:string,c76:string,c77:string,c78:string,c79:string,c80:string,c81:string,c82:string,c83:string,c84:string,c85:string,c86:string,c87:string,c88:string,c89:string,c90:string,c91:string,c92:string,c93:string,c94:string,c95:string,c96:string,c97:string,c98:string,c99:string,c100:string,c101:string,c102:string,c103:string,c104:string,c105:string,c106:string,c107:string,c108:string,c109:string,c110:string,c111:string,c112:string,c113:string,c114:string,c115:string,c116:string,c117:string,c118:string,c119:string,c120:string,c121:string,c122:string,c123:string,c124:string,c125:string,c126:string,c127:string,c128:string,c129:string,c130:string,c131:string,c132:string,c133:string,c134:string,c135:string,c136:string,c137:string,c138:string,c139:string,c140:string,c141:string,c142:string,c143:string,c144:string,c145:string,c146:string,c147:string,c148:string,c149:string,c150:string,c151:string,c152:string,c153:string,c154:string,c155:string,c156:string,c157:string,c158:string,c159:string,c160:string,c161:string,c162:string,c163:string,c164:string,c165:string,c166:string,c167:string,c168:string,c169:string,c170:string,c171:string,c172:
string,c173:string,c174:string,c175:string,c176:string,c177:string,c178:string,c179:string,c180:string,c181:string,c182:string,c183:string,c184:string,c185:string,c186:string,c187:string,c188:string,c189:string,c190:string,c191:string,c192:string,c193:string,c194:string,c195:string,c196:string,c197:string,c198:string,c199:string,c200:string,c201:string,c202:string,c203:string,c204:string,c205:string,c206:string,c207:string,c208:string,c209:string,c210:string,c211:string,c212:string,c213:string,c214:string,c215:string,c216:string,c217:string,c218:string,c219:string,c220:string,c221:string,c222:string,c223:string,c224:string,c225:string,c226:string,c227:string,c228:string,c229:string,c230:string,c231:string,c232:string,c233:string,c234:string,c235:string,c236:string,c237:string,c238:string,c239:string,c240:string,c241:string,c242:string,c243:string,c244:string,c245:string,c246:string,c247:string,c248:string,c249:string,c250:string,c251:string,c252:string,c253:string,c254:string,c255:string,c256:string,c257:string,c258:string,c259:string,c260:string,c261:string,c262:string,c263:string,c264:string,c265:string,c266:string,c267:string,c268:string,c269:string,c270:string,c271:string,c272:string,c273:string,c274:string,c275:string,c276:string,c277:string,c278:string,c279:string,c280:string,c281:string,c282:string,c283:string,c284:string,c285:string,c286:string,c287:string,c288:string,c289:string,c290:string,c291:string,c292:string,c293:string,c294:string,c295:string,c296:string,c297:string,c298:string,c299:string,c300:string>");

        private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema);


        private final NullWritable nada = NullWritable.get();



       // private IntWritable age = new IntWritable();

        public void map(LongWritable key, Text value,
                        Context output
        ) throws IOException, InterruptedException {
            List<Text> list =new ArrayList<Text>();
            for(int i=0;i<300;i++){
                Text text = new Text();
                list.add(text);
            }

            if(!"".equals(value.toString())){
                String[] arr = value.toString().split("\t");
                for(int n=0;n<300;n++) {
                    list.get(n).set(arr[n]);
                    pair.setFieldValue(n, list.get(n));
                }
                output.write(nada, pair);
            }
        }
    }



    public static void main(String[] args) throws Exception {
        // 指定mapreduce运行的hdfs相关的参数
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        conf.set("mapreduce.application.classpath", System.getProperty("user.dir"));
        OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf,"struct<c1:string,c2:string,c3:string,c4:string,c5:string,c6:string,c7:string,c8:string,c9:string,c10:string,c11:string,c12:string,c13:string,c14:string,c15:string,c16:string,c17:string,c18:string,c19:string,c20:string,c21:string,c22:string,c23:string,c24:string,c25:string,c26:string,c27:string,c28:string,c29:string,c30:string,c31:string,c32:string,c33:string,c34:string,c35:string,c36:string,c37:string,c38:string,c39:string,c40:string,c41:string,c42:string,c43:string,c44:string,c45:string,c46:string,c47:string,c48:string,c49:string,c50:string,c51:string,c52:string,c53:string,c54:string,c55:string,c56:string,c57:string,c58:string,c59:string,c60:string,c61:string,c62:string,c63:string,c64:string,c65:string,c66:string,c67:string,c68:string,c69:string,c70:string,c71:string,c72:string,c73:string,c74:string,c75:string,c76:string,c77:string,c78:string,c79:string,c80:string,c81:string,c82:string,c83:string,c84:string,c85:string,c86:string,c87:string,c88:string,c89:string,c90:string,c91:string,c92:string,c93:string,c94:string,c95:string,c96:string,c97:string,c98:string,c99:string,c100:string,c101:string,c102:string,c103:string,c104:string,c105:string,c106:string,c107:string,c108:string,c109:string,c110:string,c111:string,c112:string,c113:string,c114:string,c115:string,c116:string,c117:string,c118:string,c119:string,c120:string,c121:string,c122:string,c123:string,c124:string,c125:string,c126:string,c127:string,c128:string,c129:string,c130:string,c131:string,c132:string,c133:string,c134:string,c135:string,c136:string,c137:string,c138:string,c139:string,c140:string,c141:string,c142:string,c143:string,c144:string,c145:string,c146:string,c147:string,c148:string,c149:string,c150:string,c151:string,c152:string,c153:string,c154:string,c155:string,c156:string,c157:string,c158:string,c159:string,c160:string,c161:string,c162:string,c163:string,c164:string,c165:string,c166:string,c167:string,c168:string,c169:string,c170:string,c171:str
ing,c172:string,c173:string,c174:string,c175:string,c176:string,c177:string,c178:string,c179:string,c180:string,c181:string,c182:string,c183:string,c184:string,c185:string,c186:string,c187:string,c188:string,c189:string,c190:string,c191:string,c192:string,c193:string,c194:string,c195:string,c196:string,c197:string,c198:string,c199:string,c200:string,c201:string,c202:string,c203:string,c204:string,c205:string,c206:string,c207:string,c208:string,c209:string,c210:string,c211:string,c212:string,c213:string,c214:string,c215:string,c216:string,c217:string,c218:string,c219:string,c220:string,c221:string,c222:string,c223:string,c224:string,c225:string,c226:string,c227:string,c228:string,c229:string,c230:string,c231:string,c232:string,c233:string,c234:string,c235:string,c236:string,c237:string,c238:string,c239:string,c240:string,c241:string,c242:string,c243:string,c244:string,c245:string,c246:string,c247:string,c248:string,c249:string,c250:string,c251:string,c252:string,c253:string,c254:string,c255:string,c256:string,c257:string,c258:string,c259:string,c260:string,c261:string,c262:string,c263:string,c264:string,c265:string,c266:string,c267:string,c268:string,c269:string,c270:string,c271:string,c272:string,c273:string,c274:string,c275:string,c276:string,c277:string,c278:string,c279:string,c280:string,c281:string,c282:string,c283:string,c284:string,c285:string,c286:string,c287:string,c288:string,c289:string,c290:string,c291:string,c292:string,c293:string,c294:string,c295:string,c296:string,c297:string,c298:string,c299:string,c300:string>");

        //分布式集群设置
        // conf.set("mapred.jar", System.getProperty("user.dir")+"/WordCount.jar");
        System.setProperty("HADOOP_USER_NAME", "root");
        //设置开发环境变量
        System.setProperty("hadoop.home.dir", "/opt/hadoop-2.7.3/");

        // 设置mapreduce运行模式,这也是默认值
        // conf.set("mapreduce.framework.name", "yarn");
        // conf.set("yarn.resourcemanager.hostname", "hadoop");

        // 获取job对象
        Job job = Job.getInstance(conf);

        // 设置jar包所在路径
        job.setJarByClass(OrcWriterMR.class);
        job.setJobName("OrcWriterMR");
        job.setNumReduceTasks(0);
        // 指定mapper类和reducer类
        job.setMapperClass(OrcWriterMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(OrcOutputFormat.class);
        // 指定该mapreduce程序数据的输入路径
        Path inputPath = new Path("/tmp/input/300.txt");
        // 指定该mapreduce程序数据的输出路径
        Path outputPath = new Path("/user/hive/warehouse/test_orc_300");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean waitForCompletion = job.waitForCompletion(true);
        System.exit(waitForCompletion ? 0 : 1);
    }
}

运行完后查看表的内容
select * from test_orc_300;


版权声明:本文为sunsiyuan521原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。