网上已经有很多成熟的教程,但对于不了解MapReduce相关概念的新手来说,理解起来都有些困难。
博主读了一天代码,终于把代码理解透了,特此分享给大家。
一、带详细注释的代码
//第一部分是导入各种包,没什么好讲的
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.WritableComparable;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
// PhoneFlowDriver contains FlowWritable, PhoneFlowMapper, PhoneFlowReducer, FlowSortRunner and main
public class PhoneFlowDriver {
//自定义FlowWritable
static class FlowWritable implements WritableComparable<FlowWritable> {
private String phonenum; //电话号码
private long upflow; //上行流量
private long downflow; //下行流量
private long sumflow; //总流量
//空的构造方法
public FlowWritable() {}
//构造方法,利用电话号码,上行流量和下行流量来构造
public FlowWritable(String phonenum, long upflow, long downflow) {
super();
this.phonenum = phonenum;
this.upflow = upflow;
this.downflow = downflow;
this.sumflow = this.upflow + this.downflow;
}
//以下几个方法功能可以参照方法名称了解,我就不讲了
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(this.phonenum);
dataOutput.writeLong(this.upflow);
dataOutput.writeLong(this.downflow);
dataOutput.writeLong(this.sumflow);
}
public void readFields(DataInput dataInput) throws IOException {
this.phonenum = dataInput.readUTF();
this.upflow = dataInput.readLong();
this.downflow = dataInput.readLong();
this.sumflow = dataInput.readLong();
}
public String toString() {
return this.phonenum+"\t"+this.upflow+"\t"+this.downflow+"\t"+this.sumflow;
}
public String getPhone() {
return phonenum;
}
public void setPhone(String phone) {
this.phonenum = phone;
}
public long getUp() {
return upflow;
}
public void setUp(long up) {
this.upflow = up;
}
public long getDown() {
return downflow;
}
public void setDown(long down) {
this.downflow = down;
}
public long getSum() {
return sumflow;
}
//这个方法是排序方法,规定MapReduce键值对的键如何排序,如果看不懂这句话可以先跳过,全部看完再感悟
public int compareTo(FlowWritable o) {
return this.getSum()>o.getSum()?-1:1;
}
}
//Mapper前两个泛型是输入数据类型,第一个是Key的类型,第二个是Value的类型,<key, value>就是一个键值对。后两个是输出数据类型
//默认Key是要处理的文本中一行的起始偏移量,Value是这一行的内容
public static class PhoneFlowMapper extends Mapper<LongWritable, Text, FlowWritable, NullWritable> {
//mapreduce每读一行数据就调用一次该方法
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] tables = value.toString().split("\t"); // 切分成数组,里面存放的是手机号码和上下行流量
String phonenum = tables[1];
long upflow = Long.parseLong(tables[7]);
long downflow = Long.parseLong(tables[8]);
//NullWritable是Writable的一个特殊类,实现方法为空实现,不从数据流中读数据,也不写入数据,只充当占位符,
// 如在MapReduce中,如果你不需要使用键或值,你就可以将键或值声明为NullWritable
//注意FlowWritable是构造方法,方法参数是phonenum,upflow,downflow;
//这使得这个键值对的键已经包含了电话,上行流量,下行流量和总流量,所以值就用不上了,为空
//注意这里会按照键值对的键来排序。我在FlowWritable的compareTo方法中已经规定了逆序排序
context.write(new FlowWritable(phonenum,upflow,downflow), NullWritable.get());
}
}
//Mapreduce的保障之一就是送到Reducer端的数据总是根据Reducer的输入键进行排序的
public static class PhoneFlowReducer extends Reducer<FlowWritable, NullWritable, FlowWritable, NullWritable> {
@Override
protected void reduce(FlowWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
public static class FlowSortRunner extends Configured implements Tool {
//run方法就是将各种操作夹到job中,这个job会在hadoop集群中运行
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(PhoneFlowDriver.class);
job.setMapperClass(PhoneFlowMapper.class);
job.setReducerClass(PhoneFlowReducer.class);
//设置map程序的输出key、value
job.setMapOutputKeyClass(FlowWritable.class);
job.setMapOutputValueClass(NullWritable.class);
//设置 输出 key、value
job.setOutputKeyClass(FlowWritable.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));//输入数据路径
//检查一下参数所指定的输出路径是否存在,如果已存在,先删除
Path output = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(output)){
fs.delete(output, true);
}
FileOutputFormat.setOutputPath(job, new Path(args[1]));//输出数据路径
return job.waitForCompletion(true)?0:1;
}
}
//主程序入口
public static void main(String[] args) throws Exception {
int status = ToolRunner.run(new Configuration(), new FlowSortRunner(), args);
System.exit(status);
}
}
二、测试情况(含测试数据分享)
这个是原始数据
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
这个是分析后的数据,我只截了运行结果图片,可以看到数据已经按照流量降序排好了。