适用场景:适合两个大表连接操作
用法:Join操作在reduce task中完成 【默认的join方式】,map端按照连接字段进行hash,reduce 端完成连接操作
代码实现:
package join.map;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JoinOn {

    /**
     * Driver for a reduce-side (repartition) join of two tab-separated
     * data sets on an item id:
     *   sales  lines: name \t id   (id in column 1)
     *   things lines: id \t name   (id in column 0)
     * The reducer performs a LEFT OUTER join: every sales row is emitted
     * even when no matching things row exists (padded with "NULL\tNULL").
     *
     * args[0] = input path (directory holding the sales/things files),
     * args[1] = output path. Exits 0 on success, 1 on failure.
     */
    public static void main(String[] args) throws Exception {
        // Temporary Windows-only Hadoop home for local testing.
        System.setProperty("hadoop.home.dir", "D:\\workspace\\hadoop-2.2.0");
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(JoinOn.class);
        job.setMapperClass(JOMapper.class);
        job.setReducerClass(JOReducer.class);
        // Map output: join key (item id) -> tagged record line.
        job.setMapOutputKeyClass(VLongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Tags each input line with the name of the file it came from and
     * keys it by the join column, so matching rows from both tables meet
     * in the same reduce call.
     */
    public static class JOMapper extends Mapper<LongWritable, Text, VLongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // File name of the split tells us which table this line belongs to.
            String name = ((FileSplit) context.getInputSplit()).getPath().getName();
            String line = value.toString();
            String[] splited = line.split("\t");
            // Skip blank or malformed lines instead of throwing
            // ArrayIndexOutOfBounds/NumberFormatException mid-job.
            if (splited.length < 2) {
                return;
            }
            if (name.endsWith("sales")) {
                // sales line "name \t id": join key is column 1.
                // Emits <id, "sales:name\tid">.
                context.write(new VLongWritable(Long.parseLong(splited[1])),
                        new Text(name + ":" + line));
            } else if (name.endsWith("things")) {
                // things line "id \t name": join key is column 0.
                // Emits <id, "things:id\tname">.
                context.write(new VLongWritable(Long.parseLong(splited[0])),
                        new Text(name + ":" + line));
            }
        }
    }

    /**
     * Receives all tagged rows sharing one join key, separates them back
     * into the two source tables, and emits their cross product
     * (left-outer: unmatched sales rows are padded with NULL columns).
     */
    public static class JOReducer extends Reducer<VLongWritable, Text, Text, Text> {
        @Override
        protected void reduce(VLongWritable key, Iterable<Text> v2s, Context context)
                throws IOException, InterruptedException {
            // Buffer rows per source table for this join key.
            List<String> sales = new ArrayList<String>();
            List<String> things = new ArrayList<String>();
            for (Text text : v2s) {
                // Split only on the first ':' so record content containing
                // ':' is not truncated.
                String[] splited = text.toString().split(":", 2);
                if (splited[0].endsWith("sales")) {
                    sales.add(splited[1]);
                } else if (splited[0].endsWith("things")) {
                    things.add(splited[1]);
                }
            }
            // Cross product of the two buffers = inner join rows;
            // left outer join: a sales row with no match gets NULL padding
            // (fixes the original "NILL" typo — expected output is "NULL\tNULL").
            if (!sales.isEmpty()) {
                for (String sale : sales) {
                    if (things.isEmpty()) {
                        context.write(new Text(sale), new Text("NULL" + "\t" + "NULL"));
                    } else {
                        for (String thing : things) {
                            context.write(new Text(sale), new Text(thing));
                        }
                    }
                }
            }
        }
    }
}
MR过程分解
input
//sales.txt
Joe 2
Hank 4
Ali 0
Eve 3
Hank 2
//things.txt
2 Tie
4 Coat
3 Hat
1 Scarf
map
key -> value
2 -> sales:Joe 2
4 -> sales:Hank 4
0 -> sales:Ali 0
3 -> sales:Eve 3
2 -> sales:Hank 2
key -> value
2 -> things:2 Tie
4 -> things:4 Coat
3 -> things:3 Hat
1 -> things:1 Scarf
shuffle
2 [sales:Joe 2;sales:Hank 2;things:2 Tie]
4 [sales:Hank 4;things:4 Coat]
0 [sales:Ali 0]
3 [sales:Eve 3;things:3 Hat]
reduce
2 salesList: Joe 2;Hank 2; ----> Joe 2 2 Tie
thingsList: 2 Tie; Hank 2 2 Tie
4 salesList: Hank 4; ----> Hank 4 4 Coat
thingsList: 4 Coat;
0 salesList: Ali 0; ----> Ali 0 NULL NULL
3 salesList: Eve 3; ----> Eve 3 3 Hat
thingsList: 3 Hat;
output
//sales.txt join things.txt
Joe 2 2 Tie
Hank 2 2 Tie
Hank 4 4 Coat
Ali 0 NULL NULL
Eve 3 3 Hat