1. Executing the command: setting MR parameters
yarn jar hdp-jar-with-dependencies.jar \
cn.tl.WordCount \
-Dmapred.output.compress=true \
-Dmapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec \
-Dmapred.reduce.tasks=2 \
/user/input /user/output1
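Note: the mapred.* names above are the legacy Hadoop 1.x property names; on Hadoop 2.x and later they still work as deprecated aliases. An equivalent submission with the current property names (same placeholder jar and paths) is:
yarn jar hdp-jar-with-dependencies.jar \
cn.tl.WordCount \
-Dmapreduce.output.fileoutputformat.compress=true \
-Dmapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec \
-Dmapreduce.job.reduces=2 \
/user/input /user/output1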
2. Custom Partitioner
/**
 * Custom Partitioner.
 * Rule: words whose first letter falls in a-p go to one partition, q-z to the other.
 */
public static class MyHashPartitioner<K, V> extends Partitioner<K, V> {
@Override
public int getPartition(K key, V value, int numReduceTasks) {
return (key.toString().charAt(0) < 'q' ? 0 : 1) % numReduceTasks;
}
}
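A minimal sketch of registering this partitioner in a driver (only the standard Job API is used; the surrounding driver code is assumed):
// In the driver, after creating the Job instance:
job.setPartitionerClass(MyHashPartitioner.class);
// Two partitions only make sense with two reduce tasks
job.setNumReduceTasks(2);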
3. WordCount secondary sort
package cn.tl.secondsort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
// Driver class that launches the MR job
public class SecondSort {
/**
 * Custom composite key (the "newKey")
 */
public static class NewKeyWritable implements
WritableComparable<NewKeyWritable> {
// Composite key: key1 is the partition key, key2 is the secondary-sort key
private String key1;
private int key2;
public NewKeyWritable() {
}
public NewKeyWritable(String key1, int key2) {
this.set(key1, key2);
}
// Set both keys in one call
public void set(String key1, int key2) {
this.key1 = key1;
this.key2 = key2;
}
// Serialization used when the map side writes the key out; the field order must match readFields
@Override
public void write(DataOutput arg0) throws IOException {
arg0.writeUTF(key1);
arg0.writeInt(key2);
}
// Deserialization used when the reducer reads the key back; the field order must match write
@Override
public void readFields(DataInput arg0) throws IOException {
this.key1 = arg0.readUTF();
this.key2 = arg0.readInt();
}
// Compare key1 first; only when key1 is equal, compare key2.
// Defining compareTo here means no separate sort comparator is needed.
@Override
public int compareTo(NewKeyWritable o) {
int compare = this.key1.compareTo(o.key1);
if (compare != 0) {
return compare;
} else {
// Descending order on key2: compare o before this
return Integer.compare(o.getkey2(), this.getkey2());
}
}
public int getkey2() {
return key2;
}
public void setkey2(int key2) {
this.key2 = key2;
}
public String getkey1() {
return key1;
}
public void setkey1(String key1) {
this.key1 = key1;
}
}
// Mapper class implementing the map function
public static class LineProcessMapper extends
Mapper<Object, Text, NewKeyWritable, IntWritable> {
// Reuse output objects to avoid repeated allocation
private NewKeyWritable outputKey = new NewKeyWritable();
private IntWritable outputValue = new IntWritable();
// Core map method: processes one <key, value> pair at a time
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String tempLine = value.toString();
if (tempLine != null && tempLine.trim().length() > 0) {
// Column 0 is the word, column 1 is the count; split on whitespace
String[] columnArray = tempLine.split("\\s+");
outputKey.set(columnArray[0], Integer.parseInt(columnArray[1]));
outputValue.set(Integer.parseInt(columnArray[1]));
// Emit the map output through the context object
context.write(outputKey, outputValue);
}
}
}
/**
 * Custom partitioner: ensures records with the same key1 (e.g. s1, s2) are routed to the same reducer
 */
public static class SecondPartitioner extends
Partitioner<NewKeyWritable, IntWritable> {
// Follows the default HashPartitioner implementation
@Override
public int getPartition(NewKeyWritable key, IntWritable value,
int numPartitions) {
/*
 * Default implementation: (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
 * here only the key1 field is used as the partitioning basis
 */
return (key.getkey1().hashCode() & Integer.MAX_VALUE)
% numPartitions;
}
}
/**
 * Grouping comparator: after the shuffle's global sort, decides which records are grouped into one reduce() call
 */
public static class SecondSortGroupComparator extends WritableComparator {
// Register NewKeyWritable.class so the comparator can instantiate the key objects
protected SecondSortGroupComparator() {
super(NewKeyWritable.class, true);
}
@Override
public int compare(WritableComparable first, WritableComparable second) {
if (first == null || second == null) {
return 0;
}
NewKeyWritable newKey1 = (NewKeyWritable) first;
NewKeyWritable newKey2 = (NewKeyWritable) second;
// Group by the first key (key1) of the original record
return newKey1.getkey1().compareTo(newKey2.getkey1());
}
}
// Reducer class implementing the reduce function
public static class SortReducer extends
Reducer<NewKeyWritable, IntWritable, Text, IntWritable> {
private Text outputKey = new Text();
// Core reduce method: processes one <key, List(v1, v2, ...)> group at a time
public void reduce(NewKeyWritable keyPair,
Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
// Values arrive already sorted (descending by key2)
outputKey.set(keyPair.getkey1());
for (IntWritable val : values) {
context.write(outputKey, val);
}
}
}
// Driver method that launches the MR job
public static void main(String[] args) throws Exception {
// Load the cluster configuration
Configuration conf = new Configuration();
// Command-line argument parser
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
if (remainingArgs.length != 2) {
System.err.println("Usage: yarn jar jar_path main_class_path -D<options> <in> <out>");
System.exit(2);
}
// Create the job instance
Job job = Job.getInstance(conf, "mr secondary sort");
// Specify the main class of this job
job.setJarByClass(SecondSort.class);
// Specify the mapper class
job.setMapperClass(LineProcessMapper.class);
// Specify the partitioner and grouping comparator classes
job.setPartitionerClass(SecondPartitioner.class);
job.setGroupingComparatorClass(SecondSortGroupComparator.class);
// Specify the reducer class
job.setReducerClass(SortReducer.class);
// Specify the job's output key/value types; since the map and reduce output types differ here, the map output types must be set separately
job.setMapOutputKeyClass(NewKeyWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Specify the input path
FileInputFormat.addInputPath(job, new Path(remainingArgs[0]));
// Specify the output path, which must not already exist
FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
// Submit the job and wait for completion; the submitting client exits only after the job finishes
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
The key point here is understanding how MR executes. Beyond being distributed and parallel, the core of the MR runtime is sorting by key; most MR variations therefore revolve around constructing a suitable key. Keep three rules in mind: the Partitioner rule, the key sort rule, and the grouping rule.
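As a concrete illustration (the input values below are made up), suppose the input contains these word/count pairs:
s1 80
s2 70
s1 90
s1 60
With the composite key, partitioner, and grouping comparator above, every s1 record reaches the same reducer as one group, with counts already sorted in descending order, so that group's output is:
s1 90
s1 80
s1 60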
4. Passing parameters
- Passing via Configuration: when the job starts, store the whitelist data in the Configuration, which ships it to every Mapper and Reducer.
package cn.tl.mr;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
// Driver class that launches the MR job
public class ConfigSetTransferDriver {
public static Logger logger = Logger.getLogger(ConfigSetTransferDriver.class);
// Mapper class implementing the map function
public static class LineProcessMapper extends Mapper<Object, Text, Text, IntWritable> {
// Reuse output objects to avoid repeated allocation
private Text outputKey = new Text();
private IntWritable outputValue = new IntWritable();
// Whitelist set used for filtering
private Set<String> whiteNameSet = new HashSet<String>();
// setup() runs exactly once per map task, before any map() call, to initialize required state
@Override
protected void setup(Mapper<Object, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String whitelistString = conf.get("whitelist");
String[] whiteNameArray = whitelistString.split("\\s+");
whiteNameSet.addAll(Arrays.asList(whiteNameArray));
}
// Core map method: processes one <key, value> pair at a time
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String tempLine = value.toString();
if (tempLine != null && tempLine.trim().length() > 0) {
// Column 0 is the word, column 1 is the count; split on whitespace
String[] columnArray = tempLine.split("\\s+");
if (whiteNameSet.contains(columnArray[0])) {
outputKey.set(columnArray[0]);
outputValue.set(Integer.parseInt(columnArray[1]));
context.write(outputKey, outputValue);
}
}
}
}
// Reducer class implementing the reduce function
public static class SortReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
// Core reduce method: processes one <key, List(v1, v2, ...)> group at a time
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
// Iterate over each value in the group
for (IntWritable val : values) {
// Emit each result
context.write(key, val);
}
}
}
// Read a local file with the given path and encoding into a String
public static String readFile(String filePath, String fileEncoding) {
if (fileEncoding == null) {
fileEncoding = System.getProperty("file.encoding");
}
File file = new File(filePath);
BufferedReader br = null;
String line = null;
StringBuilder sb = new StringBuilder();
try {
br = new BufferedReader(new InputStreamReader(new FileInputStream(file), fileEncoding));
while ((line = br.readLine()) != null) {
sb.append(line + "\n");
}
return sb.toString();
} catch (Exception e) {
logger.error(e.getLocalizedMessage(), e);
} finally {
if (br != null) {
try {
br.close();
} catch (IOException e) {
logger.error("Error while closing the reader: " + e.getLocalizedMessage());
}
}
}
return null;
}
// Read the config file and pass its content via the Configuration
public static void readConfigAndTransfer(Configuration conf, String filePath) {
// Read the local config file
String source = readFile(filePath, "utf-8");
// Pass the file content to the compute nodes via conf.set
conf.set("whitelist", source);
// Log the value that was read; this logging can be removed if not needed
logger.info("whitelist=" + source);
}
// Driver method that launches the MR job
public static void main(String[] args) throws Exception {
// Load the cluster configuration
Configuration conf = new Configuration();
// Command-line argument parser
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
if (remainingArgs.length < 3) {
System.err.println("Usage: yarn jar jar_path main_class_path -D<options> <in> <out> <whitelist_local_path>");
System.exit(2);
}
// Read the config parameter and pass it along
readConfigAndTransfer(conf, remainingArgs[2]);
// Create the job instance
Job job = Job.getInstance(conf, "configuration parameter passing");
// Specify the main class of this job
job.setJarByClass(ConfigSetTransferDriver.class);
// Specify the mapper class
job.setMapperClass(LineProcessMapper.class);
// Specify the reducer class
job.setReducerClass(SortReducer.class);
// Specify the job's output key/value types; if the map and reduce output types differed, the map output types would need to be set separately
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Specify the input path
FileInputFormat.addInputPath(job, new Path(remainingArgs[0]));
// Specify the output path, which must not already exist
FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
// Submit the job and wait for completion; the submitting client exits only after the job finishes
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
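A sample submission for this driver (the jar name and all paths are illustrative placeholders; the third argument is the whitelist file on the local filesystem of the submitting machine):
yarn jar hdp-jar-with-dependencies.jar \
cn.tl.mr.ConfigSetTransferDriver \
/user/input /user/output2 /home/hadoop/whitelist.txt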
- DistributedCache: when parameters are passed via the DistributedCache, the whitelist file on HDFS is automatically shipped to each map task's local working directory; every Mapper or Reducer can then read that local copy to obtain the parameters.
package cn.tl.mr;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
public class ConfigSetDistributeCacheDriver {
public static Logger logger = Logger.getLogger(ConfigSetDistributeCacheDriver.class);
// Mapper class implementing the map function
public static class LineProcessMapper extends Mapper<Object, Text, Text, IntWritable> {
// Reuse output objects to avoid repeated allocation
private Text outputKey = new Text();
private IntWritable outputValue = new IntWritable();
// Whitelist set used for filtering
private Set<String> whiteNameSet = new HashSet<String>();
// Read a local file with the given path and encoding into a String (same helper as in the previous driver)
public static String readFile(String filePath, String fileEncoding) {
if (fileEncoding == null) {
fileEncoding = System.getProperty("file.encoding");
}
File file = new File(filePath);
BufferedReader br = null;
String line = null;
StringBuilder sb = new StringBuilder();
try {
br = new BufferedReader(new InputStreamReader(new FileInputStream(file), fileEncoding));
while ((line = br.readLine()) != null) {
sb.append(line + "\n");
}
return sb.toString();
} catch (Exception e) {
logger.error(e.getLocalizedMessage(), e);
} finally {
if (br != null) {
try {
br.close();
} catch (IOException e) {
logger.error("Error while closing the reader: " + e.getLocalizedMessage());
}
}
}
return null;
}
@Override
protected void setup(Mapper<Object, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
// Get the list of locally cached files from the context
Path[] localCacheFiles = context.getLocalCacheFiles();
// Only one file was cached, so read the first entry directly
String whiteListSource = readFile(localCacheFiles[0].toString(), "utf-8");
// Log the local paths of the cached files
logger.info("localCacheFiles=" + Arrays.toString(localCacheFiles));
// Split the content on whitespace into an array
String[] whiteNameArray = whiteListSource.split("\\s+");
// Store the names in a set for de-duplication and fast lookup
whiteNameSet.addAll(Arrays.asList(whiteNameArray));
}
// Core map method: processes one <key, value> pair at a time
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String tempLine = value.toString();
if (tempLine != null && tempLine.trim().length() > 0) {
// Column 0 is the word, column 1 is the count; split on whitespace
String[] columnArray = tempLine.split("\\s+");
if (whiteNameSet.contains(columnArray[0])) {
outputKey.set(columnArray[0]);
outputValue.set(Integer.parseInt(columnArray[1]));
context.write(outputKey, outputValue);
}
}
}
}
// Reducer class implementing the reduce function
public static class SortReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
// Core reduce method: processes one <key, List(v1, v2, ...)> group at a time
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
// Iterate over each value in the group
for (IntWritable val : values) {
// Emit each result
context.write(key, val);
}
}
}
// Driver method that launches the MR job
public static void main(String[] args) throws Exception {
// Load the cluster configuration
Configuration conf = new Configuration();
// Command-line argument parser
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
if (remainingArgs.length < 3) {
System.err.println("Usage: yarn jar jar_path main_class_path -D<options> <in> <out> <whitelist_hdfs_path>");
System.exit(2);
}
// Create the job instance
Job job = Job.getInstance(conf, "DistributedCache parameter passing");
/*
 * Add the HDFS path to the cache file list; the framework will automatically
 * distribute the file to each map task's local working directory
 */
// Pass the config file via the DistributedCache; the line below is the Hadoop 2.x API
job.addCacheFile(new Path(remainingArgs[2]).toUri());
// Specify the main class of this job
job.setJarByClass(ConfigSetDistributeCacheDriver.class);
// Specify the mapper class
job.setMapperClass(LineProcessMapper.class);
// Specify the reducer class
job.setReducerClass(SortReducer.class);
// Specify the job's output key/value types; if the map and reduce output types differed, the map output types would need to be set separately
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Specify the input path
FileInputFormat.addInputPath(job, new Path(remainingArgs[0]));
// Specify the output path, which must not already exist
FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
// Submit the job and wait for completion; the submitting client exits only after the job finishes
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
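A sample submission (again with illustrative jar name and paths; here the third argument must be a path on HDFS, since it is registered as a cache file):
yarn jar hdp-jar-with-dependencies.jar \
cn.tl.mr.ConfigSetDistributeCacheDriver \
/user/input /user/output3 /user/conf/whitelist.txt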