Hadoop MR (Part 4): The InputFormat / OutputFormat Classes

Copyright notice: reposting is welcome; please credit the source. Big data GitHub project: https://github.com/SeanYanxml/bigdata. Original post: https://blog.csdn.net/u010416101/article/details/89054546

Preface

In the previous chapter, we covered how to write custom Combiner / Partitioner / GroupingComparator classes.
In this chapter, we look at how to customize input and output, i.e. the InputFormat and OutputFormat classes.

The code for this article can be found in my GitHub project at https://github.com/SeanYanxml/bigdata/. PS: if you find the project useful, please give it a Star.


Background

In the last chapter, we briefly walked through Hadoop MR's shuffle phase. This chapter customizes the InputFormat and OutputFormat stages that sit at either end of that pipeline. (Figure omitted; to be added.)


Implementation

Custom InputFormat

When processing many small files, we can merge them by reading each file whole as a single record. The basic code is shown below:


import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Core working logic of a RecordReader:
 *
 * nextKeyValue() reads the data and builds the <key, value> pair to return;
 * getCurrentKey() and getCurrentValue() hand back the pair that was built.
 * */
public class MergeLittleFileInputFormat extends FileInputFormat<LongWritable, Text> {

	// Whether the incoming file may be split; return false so each file is read whole.
	@Override
	protected boolean isSplitable(JobContext context, Path filename) {
		return false;
	}

	@Override
	public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
		MergeLittleFileReader reader = new MergeLittleFileReader();
		reader.initialize(split, context);
		return reader;
	}
	
	static class MergeLittleFileReader extends RecordReader<LongWritable, Text>{

		private FileSplit fileSplit;
		private TaskAttemptContext context;

		private Configuration conf;
		private int count = 0;
		private LongWritable key = new  LongWritable();
		private Text value = new Text();
		private boolean processed = false;
		
		@Override
		public void close() throws IOException {
			// Nothing to release; the input stream is closed inside nextKeyValue().
		}

		@Override
		public LongWritable getCurrentKey() throws IOException, InterruptedException {
			return key;
		}

		@Override
		public Text getCurrentValue() throws IOException, InterruptedException {
			return value;
		}

		@Override
		public float getProgress() throws IOException, InterruptedException {
			// A whole file is a single record, so progress is either 0% or 100%.
			return processed ? 1.0f : 0.0f;
		}

		@Override
		public void initialize(InputSplit split, TaskAttemptContext context)
				throws IOException, InterruptedException {
			this.fileSplit = (FileSplit)split;
			this.context = context;
			this.conf = context.getConfiguration();
		}

		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			if (!processed) {
				// Read the whole split (i.e. the entire small file) in one shot.
				byte[] contents = new byte[(int) fileSplit.getLength()];
				Path file = fileSplit.getPath();
				FileSystem fs = file.getFileSystem(conf);
				FSDataInputStream in = null;
				try {
					in = fs.open(file);
					IOUtils.readFully(in, contents, 0, contents.length);
					// The key's value is arbitrary; a running counter will do.
					key.set(count++);
					value.set(contents, 0, contents.length);
				} finally {
					IOUtils.closeStream(in);
				}
				processed = true;   // Mark the file as consumed...
				return true;        // ...and emit exactly one <key, value> pair.
			}
			return false;
		}
		
	}
	
	// A minimal driver demo; mapper/reducer classes and input/output paths are omitted here.
	public static void main(String[] args) throws IOException {
		Configuration conf =  new Configuration();		
		conf.set("mapreduce.framework.name", "yarn");
//		conf.set("mapreduce.framework.name", "local");
		conf.set("yarn.resourcemanager.hostname", "localhost");
		conf.set("fs.defaultFS", "hdfs://localhost:9000/");
		
		Job job =  Job.getInstance(conf);
		job.setInputFormatClass(MergeLittleFileInputFormat.class);
	}
	
}

Calling job.setInputFormatClass(MergeLittleFileInputFormat.class); is all it takes to have the job read its files through the custom InputFormat.
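
For completeness, here is a minimal sketch of a full driver plus mapper wired to this InputFormat. Note that the WholeFileMapper class name and the /little/input and /little/output paths are hypothetical placeholders, not part of the original project:


import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MergeLittleFileDriver {

	// Each call to map() receives one whole small file as its value.
	public static class WholeFileMapper
			extends Mapper<LongWritable, Text, Text, LongWritable> {
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// As a trivial example, emit each file's size in bytes.
			context.write(new Text("bytes"), new LongWritable(value.getLength()));
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		job.setJarByClass(MergeLittleFileDriver.class);

		job.setInputFormatClass(MergeLittleFileInputFormat.class);
		job.setMapperClass(WholeFileMapper.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		FileInputFormat.setInputPaths(job, new Path("/little/input"));
		FileOutputFormat.setOutputPath(job, new Path("/little/output"));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}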

Custom OutputFormat

When processing logs, we sometimes need to split the results into different output files depending on the record (the example below routes by the key's numeric value). The basic code is shown below:


import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * When a MapTask or ReduceTask writes its final output, it first calls the
 * OutputFormat's getRecordWriter() method to obtain a RecordWriter, and then
 * calls the RecordWriter's write(k, v) method to write the data out.
 * */
public class LogEnhanceOutputFormat extends FileOutputFormat<IntWritable, Text>{

	@Override
	public RecordWriter<IntWritable, Text> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
		// Route records to different files by key: values whose key is greater
		// than 1000 go to enhance.log, all others go to tocraw.log (see write()).
		FileSystem fs = FileSystem.get(context.getConfiguration());
		Path enhancePath = new Path("hdfs://localhost:9000/log/enhance/output/enhance.log");
		Path tocrawPath = new Path("hdfs://localhost:9000/log/enhance/output/tocraw.log");

		FSDataOutputStream enhanceStream = fs.create(enhancePath);
		FSDataOutputStream tocrawStream = fs.create(tocrawPath);

		return new EnHanceRecordWriter(enhanceStream, tocrawStream);
	}
	
	/**
	 * A custom RecordWriter that holds the two output streams.
	 * */
	static class EnHanceRecordWriter extends RecordWriter<IntWritable, Text>{
		FSDataOutputStream enhanceStream = null;
		FSDataOutputStream tocrawStream = null;
		public EnHanceRecordWriter(){}
		
		public EnHanceRecordWriter(FSDataOutputStream enhanceStream, FSDataOutputStream tocrawStream){
			this.enhanceStream = enhanceStream;
			this.tocrawStream = tocrawStream;
		}
		@Override
		public void close(TaskAttemptContext context) throws IOException, InterruptedException {
			if (null != enhanceStream) {
				enhanceStream.close();
			}
			if (null != tocrawStream) {
				tocrawStream.close();
			}
		}

		@Override
		public void write(IntWritable key, Text value) throws IOException, InterruptedException {
			int keyNum = key.get();
			if (keyNum > 1000) {
				// Write to the first stream (enhance.log).
				enhanceStream.write(value.toString().getBytes());
			} else {
				// Write to the second stream (tocraw.log).
				tocrawStream.write(value.toString().getBytes());
			}
		}
		
	}
	
	// A minimal driver demo; mapper/reducer classes and input/output paths are omitted here.
	public static void main(String[] args) throws IOException {
		Configuration conf =  new Configuration();		
		conf.set("mapreduce.framework.name", "yarn");
//		conf.set("mapreduce.framework.name", "local");
		conf.set("yarn.resourcemanager.hostname", "localhost");
		conf.set("fs.defaultFS", "hdfs://localhost:9000/");
		
		Job job =  Job.getInstance(conf);
		job.setOutputFormatClass(LogEnhanceOutputFormat.class);
	}

}

Calling job.setOutputFormatClass(LogEnhanceOutputFormat.class); is all it takes to customize the output files.
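
To see the routing end to end, here is a minimal, hypothetical reducer whose output types match the OutputFormat's <IntWritable, Text> signature; the LogCountReducer name and the counting logic are illustrative only:


import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class LogCountReducer extends Reducer<Text, IntWritable, IntWritable, Text> {

	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		// Sum the occurrences of each log line.
		int count = 0;
		for (IntWritable v : values) {
			count += v.get();
		}
		// Records whose count exceeds 1000 land in enhance.log, the rest in
		// tocraw.log, per LogEnhanceOutputFormat.write(). A trailing newline is
		// appended because that RecordWriter writes raw bytes only.
		context.write(new IntWritable(count), new Text(key.toString() + "\n"));
	}
}

Registering both job.setReducerClass(LogCountReducer.class) and job.setOutputFormatClass(LogEnhanceOutputFormat.class) in the driver completes the pipeline.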
