MapReduce编程之Join多种应用场景与使用

MapReduce编程之Join多种应用场景与使用

Join操作概述

在关系型数据库中 Join 是非常常见的操作，各种优化手段已经到了极致。在海量数据的环境下，不可避免的也会碰到这种类型的需求，例如在数据分析时需要连接从不同的数据源中获取到数据。不同于传统的单机模式，在分布式存储下采用 MapReduce 编程模型，也有相应的处理措施和优化方法。

我们先简要地描述待解决的问题。假设有两个数据集：气象站数据库和天气记录数据库，并考虑如何合二为一。一个典型的查询是：输出气象站的历史信息，同时各行记录也包含气象站的元数据信息。

Reduce join

在Reduce端进行连接是MapReduce框架实现join操作最常见的方式，其具体的实现原理如下：

Map端的主要工作：为来自不同表(文件)的key/value对打标签以区别不同来源的记录。然后用连接字段作为key，其余部分和新加的标志作为value，最后进行输出。

reduce端的主要工作：在reduce端以连接字段作为key的分组已经完成，我们只需要在每一个分组当中将那些来源于不同文件的记录(在map阶段已经打标志)分开，最后进行合并就可以了。

Reduce Join 实现方式一

● 适用场景：两个表连接

● 实现方式：二次排序

● 代码实现：

JoinStationMapper 处理来自气象站数据，代码如下所示。

import java.io.IOException;
import org.apache.hadoop.io.LongWritable; 
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class JoinStationMapper extends Mapper< LongWritable,Text,TextPair,Text>{
protected void map(LongWritable key,Text value,Context context) throws IOException,InterruptedExce String[] arr = StringUtils.split(value.toString(),"\\s+");//解析气象站数据if(arr.length==2){//满足这种数据格式

//key=气象站id value=气象站名称

context.write(new TextPair(arr[0],"0"),new Text(arr[1]));
}
}
}

JoinRecordMapper 处理来自天气记录数据，代码如下所示。

 import java.io.IOException;

import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

public class JoinRecordMapper extends Mapper< LongWritable,Text,TextPair,Text>{

protected void map(LongWritable key,Text value,Context context) throws IOException,InterruptedExce String[] arr = StringUtils.split(value.toString(),"\\s+");//解析天气记录数据 if(arr.length==3){

//key=气象站id value=天气记录数据

context.write(new TextPair(arr[0],"1"),new Text(arr[1]+"\t"+arr[2]));
}
}
}

自定义TextPair作为JoinStationMapper和JoinRecordMapper的输出key。

import org.apache.hadoop.io.WritableComparable;
import java.io.*;
import org.apache.hadoop.io.*;
public class TextPair implements WritableComparable {
privateText first;//Text 类型的实例变量 first 
privateText second;//Text 类型的实例变量 second
public TextPair() {
set(new Text(),new Text());
}
public TextPair(String first, String second) {

set(new Text(first),new Text(second));
}
public TextPair(Text first, Text second) {

set(first, second);

}
public void set(Text first, Text second) {
this.first = first;
this.second = second;
}
public Text getFirst() {

 

return first;

 

}


public Text getSecond() {

return second;

}

//将对象转换为字节流并写入到输出流out中

public void write(DataOutput out)throws IOException {

first.write(out);

second.write(out);
}
//从输入流in中读取字节流反序列化为对象

public void readFields(DataInput in)throws IOException {

first.readFields(in);

second.readFields(in);
}

@Override


public int hashCode() {

return first.hashCode() *163+second.hashCode();

}

@Override

public boolean equals(Object o) {

if(o instanceof TextPair) {

TextPair tp = (TextPair) o;


return first.equals(tp.first) && second.equals(tp.second);
}

return false;


}

@Override

public String toString() {


return first +"\t"+ second;
}
//排序

public int compareTo(TextPair o) {
// TODO Auto‐generated method stub

if(!first.equals(o.first)){

return first.compareTo(o.first);

}else if(!second.equals(o.second)){

return second.compareTo(o.second);

}else{

return 0;

}

}

}

自定义分区KeyPartitioner

 import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Partitioner; 

//joinkey + "0"

public class KeyPartitioner	extends Partitioner< TextPair,Text>{

public int getPartition(TextPair key,Text value,int numPartitions){ 

return (key.getFirst().hashCode()&Integer.MAX_VALUE)% numPartitions;

}

}

自定义分组GroupingComparator

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.WritableComparable;

 import org.apache.hadoop.io.WritableComparator;

/**
 * Comparator that compares two {@link TextPair} keys by their first component only, ignoring
 * the second component.
 */
public class GroupingComparator extends WritableComparator {

  /** Registers {@link TextPair} as the key class handled by this comparator. */
  protected GroupingComparator() {
    super(TextPair.class, true);
  }

  /**
   * Compares the first components of two {@link TextPair} keys.
   *
   * @param w1 left key; must be a {@link TextPair}
   * @param w2 right key; must be a {@link TextPair}
   * @return the result of comparing {@code w1.getFirst()} with {@code w2.getFirst()}
   */
  @Override
  public int compare(WritableComparable w1, WritableComparable w2) {
    Text leftFirst = ((TextPair) w1).getFirst();
    Text rightFirst = ((TextPair) w2).getFirst();
    return leftFirst.compareTo(rightFirst);
  }
}

由于 TextPair 经过了二次排序，所以 reducer 会先接收到气象站数据。因此从中抽取气象站名称，并将其作为后续每条输出记录的一部分写到输出文件。JoinReducer 的代码如下所示。

public class JoinReducer extends Reducer< TextPair,Text,Text,Text>{

protected void reduce(TextPair key, Iterable< Text> values,Context context) throws IOException,Int Iterator< Text> iter = values.iterator();

Text stationName = new Text(iter.next());//气象站名称 while(iter.hasNext()){

Text record = iter.next();//天气记录的每条数据

Text outValue = new Text(stationName.toString()+"\t"+record.toString()); context.write(key.getFirst(),outValue);

}}}

下面我们定义作业的驱动类 ReduceJoinBySecondarySort，在该类中，关键在于根据组合键的第一个字段（即气象站 ID）进行分区和分组，即使用一个自定义的 Partitioner（上文的 KeyPartitioner）和一个自定义的分组 comparator（上文的 GroupingComparator）。ReduceJoinBySecondarySort 类的代码如下所示。

import org.apache.hadoop.conf.Configuration; 
import org.apache.hadoop.conf.Configured; 
import org.apache.hadoop.fs.FileSystem; 
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable; 
import org.apache.hadoop.io.WritableComparator;
 import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner; 
/*
* 通过二次排序实现reduce join
* 适用场景：其中一个表的连接字段key唯一
*/
public class ReduceJoinBySecondarySort extends Configured implements Tool{
public int run(String[] args) throws Exception{ Configuration conf = new Configuration();// 读取配置文件
Path mypath = new Path(args[2]);
FileSystem hdfs = mypath.getFileSystem(conf);// 创建输出路径

if (hdfs.isDirectory(mypath)) {
hdfs.delete(mypath, true);
}
Job job = Job.getInstance(conf, "join");// 新建一个任务

 job.setJarByClass(ReduceJoinBySecondarySort.class);// 主类
Path recordInputPath = new Path(args[0]);//天气记录数据源
Path stationInputPath = new Path(args[1]);//气象站数据源
Path outputPath = new Path(args[2]);//输出路径
MultipleInputs.addInputPath(job,recordInputPath,TextInputFormat.class,JoinRecordMapper.class);
MultipleInputs.addInputPath(job,stationInputPath,TextInputFormat.class,JoinStationMapper.class
FileOutputFormat.setOutputPath(job,outputPath); job.setReducerClass(JoinReducer.class);// Reducer
job.setPartitionerClass(KeyPartitioner.class);//自定义分区
job.setGroupingComparatorClass(GroupingComparator.class);//自定义分组
job.setMapOutputKeyClass(TextPair.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) throws Exception{
String[] args0 = {"hdfs://hadoop002:9000/records.txt" ,"hdfs://hadoop002:9000/station.txt" ,"hdfs://hadoop002:9000/ssReduceJoin‐out"
};
int exitCode = ToolRunner.run(new ReduceJoinBySecondarySort(),args); System.exit(exitCode);
}
}

在该样本数据上运行程序，获得以下输出结果。

011990‐99999	SIHCCAJAVRI	195005150700	0
011990‐99999	SIHCCAJAVRI	195005151200	22
011990‐99999	SIHCCAJAVRI	195005151800	‐11
012650‐99999	TYNSET‐HANSMOEN 194903241200		111
012650‐99999	TYNSET‐HANSMOEN 194903241800		78

Reduce Join 实现方式二

● 适用场景：两个表连接

● 实现方式：笛卡尔积

● 代码实现：

  import java.io.IOException;

import java.util.ArrayList; import java.util.List;

import org.apache.hadoop.conf.Configuration; 

import org.apache.hadoop.fs.FileSystem;

 import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper; 

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 import org.apache.hadoop.util.GenericOptionsParser;

/*

* 两个大表

* 通过笛卡尔积实现 reduce join

* 适用场景：两个表的连接字段key都不唯一(包含一对多，多对多的关系)

*/

public class ReduceJoinByCartesianProduct {

 /**

为来自不同表(文件)的key/value对打标签以区别不同来源的记录。

然后用连接字段作为key，其余部分和新加的标志作为value，最后进行输出。

*/

public static class ReduceJoinByCartesianProductMapper extends Mapper<Object,Text,Text,Text>{ private Text joinKey=new Text();

private Text combineValue=new Text();

@Override

protected void map(Object key, Text value, Context context) throws IOException, InterruptedExc String pathName=((FileSplit)context.getInputSplit()).getPath().toString(); //如果数据来自于records，加一个records的标记 

if(pathName.endsWith("records.txt")){

String[] valueItems=StringUtils.split(value.toString(),"\\s+"); //过滤掉脏数据 if(valueItems.length!=3){

return;

}

joinKey.set(valueItems[0]);

combineValue.set("records.txt" + valueItems[1] + "\t" + valueItems[2]); }else if(pathName.endsWith("station.txt")){

//如果数据来自于station，加一个station的标记 String[] valueItems=StringUtils.split(value.toString(),"\\s+"); //过滤掉脏数据 

if(valueItems.length!=2){

return;

}

joinKey.set(valueItems[0]); combineValue.set("station.txt" + valueItems[1]);

}

context.write(joinKey,combineValue);

}

}

/* * reduce 端做笛卡尔积 */

public static class ReduceJoinByCartesianProductReducer extends Reducer<Text,Text,Text,Text>{ private List<String> leftTable=new ArrayList<String>();

private List<String> rightTable=new ArrayList<String>(); private Text result=new Text();

@Override

protected void reduce(Text key, Iterable<Text> values, Context context) throws IOExce //一定要清空数据 leftTable.clear();

rightTable.clear();

//相同key的记录会分组到一起，我们需要把相同key下来自于不同表的数据分开，然后做笛卡尔积

for(Text value : values){ String val=value.toString();

if(val.startsWith("station.txt")){

leftTable.add(val.replaceFirst("station.txt","")); }else if(val.startsWith("records.txt")){

rightTable.add(val.replaceFirst("records.txt",""));

}

}

//笛卡尔积

for(String leftPart:leftTable){ for(String rightPart:rightTable){

result.set(leftPart+"\t"+rightPart); context.write(key, result);

}

}

}

}

public static void main(String[] args) throws Exception{ Configuration conf = new Configuration();

String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) {

System.err.println("Usage: reducejoin <in> [<in>...] <out>"); System.exit(2);

}

//输出路径

Path mypath = new Path(otherArgs[otherArgs.length ‐ 1]); FileSystem hdfs = mypath.getFileSystem(conf);// 创建输出路径

if (hdfs.isDirectory(mypath)) {

hdfs.delete(mypath, true);

}

Job job = Job.getInstance(conf, "ReduceJoinByCartesianProduct");

job.setJarByClass(ReduceJoinByCartesianProduct.class);

job.setMapperClass(ReduceJoinByCartesianProductMapper.class);

job.setReducerClass(ReduceJoinByCartesianProductReducer.class); 

job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); //添加输入路径

 for (int i = 0; i < otherArgs.length - 1; ++i) {

FileInputFormat.addInputPath(job, new Path(otherArgs[i]));

}

//添加输出路径

FileOutputFormat.setOutputPath(job,

new Path(otherArgs[otherArgs.length - 1]));

 System.exit(job.waitForCompletion(true) ? 0 : 1);

}

}

在该样本数据上运行程序，获得以下输出结果。

011990‐99999	SIHCCAJAVRI	195005150700	0
011990‐99999	SIHCCAJAVRI	195005151200	22
011990‐99999	SIHCCAJAVRI	195005151800	‐11
012650‐99999	TYNSET‐HANSMOEN 194903241200		111
012650‐99999	TYNSET‐HANSMOEN 194903241800		78

如何运行

上传jar包到集群上，然后hadoop jar hadoopxxx.jar 路径

Reduce Join 实现方式三

● 适用场景：一个大表和一个小表连接

● 实现方式：分布式缓存

分布式缓存知识点补充：

当 MapReduce 处理大型数据集间的 join 操作时，如果一个数据集很大而另外一个数据集很小，以至于可以分发到集群中的每个节点之中，这种情况下我们就可以用到 Hadoop 的分布式缓存机制，它能够在任务运行过程中及时地将文件和存档复制到任务节点以供使用。为了节约网络带宽，在每一个作业中，各个文件通常只需要复制到一个节点一次。

1、用法

Hadoop 命令行选项中，有三个命令可以实现文件复制分发到任务的各个节点。

1)用户可以使用 -files 选项指定待分发的文件，文件内包含以逗号隔开的 URL 列表。文件可以存放在本地文件系统、HDFS、或其它 Hadoop 可读文件系统之中。如果尚未指定文件系统，则这些文件被默认是本地的。即使默认文件系统并非本地文件系统，这也是成立的。

2)用户可以使用 -archives 选项向自己的任务中复制存档文件，比如JAR 文件、ZIP 文件、 tar 文件和 gzipped tar文件，这些文件会被解档到任务节点。

3)用户可以使用 -libjars 选项把 JAR 文件添加到 mapper 和 reducer 任务的类路径中。如果作业 JAR 文件并非包含很多库 JAR 文件，这点会很有用。

2、工作机制

当用户启动一个作业，Hadoop 会把由 -files、-archives 和 -libjars 等选项所指定的文件复制到分布式文件系统之中。接着，在任务运行之前，tasktracker 将文件从分布式文件系统复制到本地磁盘（缓存），使任务能够访问文件。此时，这些文件就被视为“本地化”了。从任务的角度来看，这些文件就已经在那儿了，它并不关心这些文件是否来自 HDFS。此外，由 -libjars 指定的文件会在任务启动前添加到任务的类路径（classpath）中。

3、分布式缓存 API

由于可以通过 Hadoop 命令行间接使用分布式缓存，大多数应用不需要使用分布式缓存API。然而，一些应用程序需要用到分布式缓存的更高级的特性，这就需要直接使用 API 了。 API 包括两部分：将数据放到缓存中的方法，以及从缓存中读取数据的方法。

1)首先掌握数据放到缓存中的方法，以下列举 Job 中可将数据放入到缓存中的相关方法：

public void addCacheFile(URI uri); public void addCacheArchive(URI uri);//以上两组方法将文件或存档添加到分布式缓存

 public void setCacheFiles(URI[] files);

public void setCacheArchives(URI[] archives);//以上两组方法将一次性向分布式缓存中添加一组文件或存档

 public void addFileToClassPath(Path file);

public void addArchiveToClassPath(Path archive);//以上两组方法将文件或存档添加到 MapReduce 任务的类路径

 public void createSymlink();

在缓存中可以存放两类对象：文件（files）和存档（archives）。文件被直接放置在任务节点上，而存档则会被解档之后再将具体文件放置在任务节点上。

2)其次掌握在 map 或者 reduce 任务中，使用 API 从缓存中读取数据。

public Path[] getLocalCacheFiles() throws IOException;

 public Path[] getLocalCacheArchives() throws IOException;

 public Path[] getFileClassPaths();

public Path[] getArchiveClassPaths();

我们可以使用 getLocalCacheFiles() 和 getLocalCacheArchives() 方法获取缓存中的文件或者存档的引用。当处理存档时，将会返回一个包含解档文件的目的目录。相应的，用户可以通过 getFileClassPaths() 和 getArchiveClassPaths() 方法获取被添加到任务的类路径下的文件和存档。

● 代码实现：

import java.io.BufferedReader;

 import java.io.FileNotFoundException;

 import java.io.FileReader;

import java.io.IOException;

 import java.io.InputStreamReader; 

import java.net.URI;

import java.util.Hashtable;

import org.apache.hadoop.conf.Configuration;

 import org.apache.hadoop.conf.Configured; 

import org.apache.hadoop.fs.FSDataInputStream;

 import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job; 

import org.apache.hadoop.mapreduce.Mapper;

 import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.Reducer.Context;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 

import org.apache.hadoop.util.GenericOptionsParser;

import org.apache.hadoop.util.Tool; 

import org.apache.hadoop.util.ToolRunner; 

import org.apache.hadoop.util.bloom.Key;

import com.dajiangtai.hadoop.join.SemiJoin.SemiJoinMapper; 

import com.dajiangtai.hadoop.join.SemiJoin.SemiJoinReducer;

 /*

* 通过分布式缓存实现Reduce Join

* 适用场景：其中一个表比较小，能放入内存

*/

public class ReduceJoinByDistributedCache	extends Configured implements Tool {

//直接输出大表数据

records.txt public static class ReduceJoinByDistributedCacheMapper extends

Mapper< LongWritable, Text, Text, Text> {

public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

String[] arr = StringUtils.split(value.toString(),"\\s+"); if (arr.length == 3) {

context.write(new Text(arr[0]), value);
}
}
}

//在reduce 端通过缓存文件实现join操作

 public static class ReduceJoinByDistributedCacheReducer extends

Reducer< Text, Text, Text, Text> { //定义Hashtable存放缓存数据

private Hashtable< String, String> table = new Hashtable< String, String>();

 /** * 获取分布式缓存文件*/
protected void setup(Context context) throws IOException, InterruptedException {
BufferedReader br;
String infoAddr = null; // 返回缓存文件路径
Path[] cacheFilesPaths = context.getLocalCacheFiles(); for (Path path : cacheFilesPaths) {
String pathStr = path.toString();
br = new BufferedReader(new FileReader(pathStr)); while (null != (infoAddr = br.readLine())) {
// 按行读取并解析气象站数据
String[] records = StringUtils.split(infoAddr.toString(), "\\s+");
if (null != records)//key为stationID，value为stationName
table.put(records[0], records[1]);
}
}
}
public void reduce(Text key, Iterable< Text> values, Context context) throws IOException, InterruptedException { //天气记录根据stationId 获取stationName
String stationName = table.get(key.toString()); for (Text value : values) {
context.write(new Text(stationName), value);
}
}
}
public int run(String[] args) throws Exception { Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) {
System.err.println("Usage: cache <in> [<in>...] <out>");
System.exit(2);
}
//输出路径
Path mypath = new Path(otherArgs[otherArgs.length ‐ 1]); FileSystem hdfs = mypath.getFileSystem(conf);// 创建输出路径

if (hdfs.isDirectory(mypath)) {
hdfs.delete(mypath, true);
}
Job job = Job.getInstance(conf, "ReduceJoinByDistributedCache");
//添加缓存文件
job.addCacheFile(new Path(otherArgs[0]).toUri());//station.txt job.setJarByClass(ReduceJoinByDistributedCache.class);

job.setMapperClass(ReduceJoinByDistributedCacheMapper.class);

 job.setReducerClass(ReduceJoinByDistributedCacheReducer.class);

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(Text.class); //添加输入路径

 for (int i = 1; i < otherArgs.length - 1; ++i) {
 
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));

}
 
//添加输出路径
FileOutputFormat.setOutputPath(job, new Path( otherArgs[otherArgs.length - 1]));
return job.waitForCompletion(true) ? 0 : 1;
 
}

public static void main(String[] args) throws Exception {
int ec = ToolRunner.run(new Configuration(),new ReduceJoinByDistributedCache(), args); System.exit(ec);

}

}

该样本数据上运行程序，获得以下输出结果。

011990‐99999	SIHCCAJAVRI	195005150700	0
011990‐99999	SIHCCAJAVRI	195005151200	22
011990‐99999	SIHCCAJAVRI	195005151800	‐11
012650‐99999	TYNSET‐HANSMOEN 194903241200		111
012650‐99999	TYNSET‐HANSMOEN 194903241800		78

运行方式

上传jar包到集群上，然后hadoop jar hadoopxxx.jar 路径

Reduce Join的不足

这里主要分析一下reduce join的一些不足。之所以会存在reduce join这种方式，是因为整体数据被分割了，每个map task只处理一部分数据而不能够获取到所有需要的join字段，因此我们可以充分利用mapreduce框架的特性，让他按照join key进行分区，将所有join key相同的记录集中起来进行处理，所以reduce join这种方式就出现了。

这种方式的缺点很明显就是会造成map和reduce端也就是shuffle阶段出现大量的数据传输，效率很低。

Map join

Map Join 实现方式一

● 使用场景：一张表十分小、一张表很大。

● 用法:

在提交作业的时候先将小表文件放到该作业的DistributedCache中，然后从DistributedCache中取出该小表进行join (比如放到Hash Map等等容器中)。然后扫描大表，看大表中的每条记录的join key /value值是否能够在内存中找到相同join key的记录，如果有则直接输出结果。

DistributedCache是分布式缓存的一种实现，它在整个MapReduce框架中起着相当重要的作用，它可以支撑我们写一些相当复杂高效的分布式程序。说回到这里，JobTracker在作业启动之前会获取到DistributedCache的资源uri列表，并将对应的文件分发到各个涉及到该作业的任务的TaskTracker上。另外，关于DistributedCache和作业的关系，比如权限、存储路径区分、public和private等属性，此处不再展开。

● 代码实现：

 import java.io.BufferedReader; 
import java.io.FileNotFoundException;
 import java.io.IOException;
import java.io.InputStreamReader;
 import java.net.URI;
import java.util.Hashtable;
import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured; 
import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner; /*
* 通过分布式缓存实现 map join
* 适用场景：一个小表，一个大表
*/
public class MapJoinByDistributedCache	extends Configured implements Tool {
/* * 直接在map 端进行join合并 */
public static class MapJoinMapper extends Mapper< LongWritable, Text, Text, Text> {
private Hashtable< String, String> table = new Hashtable< String, String>();//定义Hashtable
/** * 获取分布式缓存文件*/
@SuppressWarnings("deprecation")
protected void setup(Context context) throws IOException, InterruptedException {
Path[] localPaths = (Path[]) context.getLocalCacheFiles();//返回本地文件路径

 if (localPaths.length == 0) {
throw new FileNotFoundException( "Distributed cache file not found.");
}
FileSystem fs = FileSystem.getLocal(context.getConfiguration());//获取本地 FileSystem 实例

FSDataInputStream in = null;
in = fs.open(new Path(localPaths[0].toString()));// 打开输入流
BufferedReader br = new BufferedReader(new InputStreamReader(in));// 创建BufferedReader读取

String infoAddr = null;
while (null != (infoAddr = br.readLine())) {// 按行读取并解析气象站数据

 String[] records = infoAddr.split("\t");
table.put(records[0], records[1]);//key为stationID，value为stationName
}
}
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] valueItems = StringUtils.split(value.toString(),"\\s+");
String stationName = table.get(valueItems[0]);//天气记录根据stationId 获取stationName if(null !=stationName)
context.write(new Text(stationName), value);
}
}

public int run(String[] args) throws Exception { // TODO Auto‐generated method stub
Configuration conf = new Configuration();
Path out = new Path(args[2]);
FileSystem hdfs = out.getFileSystem(conf);// 创建输出路径 if (hdfs.isDirectory(out)) {
hdfs.delete(out, true);
}
Job job = Job.getInstance();//获取一个job实例 
job.setJarByClass(MapJoinByDistributedCache.class);
 FileInputFormat.addInputPath(job,
new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(
args[2])); //添加分布式缓存文件
 station.txt job.addCacheFile(new URI(args[1]));
job.setMapperClass(MapJoinMapper.class); 
job.setOutputKeyClass(Text.class);// 输出key类型 
job.setOutputValueClass(Text.class);// 输出value类型 
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) throws Exception { String[] args0 = {"hdfs://hadoop002:9000/join/records.txt"
,"hdfs://hadoop002:9000/join/station.txt" ,"hdfs://hadoop002:9000/join/mapcache‐out"
};

int ec = ToolRunner.run(new Configuration(), new MapJoinByDistributedCache(), args);

System.exit(ec);

}

}

在该样本数据上运行程序，获得以下输出结果。

011990‐99999	SIHCCAJAVRI	195005150700	0
011990‐99999	SIHCCAJAVRI	195005151200	22
011990‐99999	SIHCCAJAVRI	195005151800	‐11
012650‐99999	TYNSET‐HANSMOEN 194903241200		111
012650‐99999	TYNSET‐HANSMOEN 194903241800		78

Map Join 实现方式二

● 使用场景：一张表在数据库、一张表很大。

另外还有一种比较特别的Map Join方式，就是结合HBase来做Map Join操作。这种方式完全可以突破内存的限制，使你可以放心地使用Map Join，而且效率也非常不错。

Semi join

Semi Join 实现方式

● 使用场景：一个大表（内存放不下），一个超大表

● 实现方式：分布式缓存

● 用法:

SemiJoin就是所谓的半连接，其实仔细一看就是reduce join的一个变种，就是在map端过滤掉一些数据，在网络中只传输参与连接的数据，不参与连接的数据不必在网络中进行传输，从而减少了shuffle的网络传输量，使整体效率得到提高，其他思想和reduce join是一模一样的。说得更加接地气一点就是将小表中参与join的key单独抽出来通过DistributedCache分发到相关节点，然后将其取出放到内存中(可以放到HashSet中)，在map阶段扫描连接表，将join key不在内存HashSet中的记录过滤掉，让那些参与join的记录通过shuffle传输到reduce端进行join操作，其他的和reduce join都是一样的。

● 代码实现：

import java.io.BufferedReader; 

import java.io.FileNotFoundException; 

import java.io.FileReader;

import java.io.IOException;

 import java.io.InputStreamReader; 

import java.util.ArrayList;

 import java.util.HashSet;

import java.util.List; import java.util.Set;

import org.apache.hadoop.conf.Configuration; 

import org.apache.hadoop.fs.FSDataInputStream;

 import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.io.Text; 

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper; 

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

 import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 import org.apache.hadoop.util.GenericOptionsParser;

import org.apache.hadoop.util.bloom.Key;

/*

* 一个大表，一个小表

* map 阶段：Semi Join解决小表整个记录内存放不下的场景，过滤大表

* reduce 阶段：reduce side join

*/

public class SemiJoin { /**

 

* 为来自不同表(文件)的key/value对打标签以区别不同来源的记录。

 

* 然后用连接字段作为key，其余部分和新加的标志作为value，最后进行输出。

*/

public static class SemiJoinMapper extends Mapper<Object, Text, Text, Text> {

// 定义Set集合保存小表中的key 

private Set<String> joinKeys = new HashSet<String>(); private Text joinKey = new Text();

 

private Text combineValue = new Text();

 

/** * 获取分布式缓存文件

*/

 

protected void setup(Context context) throws IOException, InterruptedException {

BufferedReader br; String infoAddr = null; // 返回缓存文件路径

 

Path[] cacheFilesPaths = context.getLocalCacheFiles(); for (Path path : cacheFilesPaths) {

String pathStr = path.toString();

 

br = new BufferedReader(new FileReader(pathStr)); while (null != (infoAddr = br.readLine())) {

// 按行读取并解析气象站数据

 

String[] records = StringUtils.split(infoAddr.toString(), "\\s+");

 

if (null != records)// key为stationID joinKeys.add(records[0]);

 

}

 

}

 

}

 

@Override

 

protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {

String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();

 

// 如果数据来自于records，加一个records的标记

 if (pathName.endsWith("records‐semi.txt")) {

String[] valueItems = StringUtils.split(value.toString(), "\\s+");

// 过滤掉脏数据 

if (valueItems.length != 3) {

return;

 

}

 

if (joinKeys.contains(valueItems[0])) { joinKey.set(valueItems[0]); combineValue.set("records‐semi.txt" + valueItems[1] + "\t"

 

+ valueItems[2]); context.write(joinKey, combineValue);

}

 

} else if (pathName.endsWith("station.txt")) {

 

/ 如果数据来自于station，加一个station的标记

 

String[] valueItems = StringUtils.split(value.toString(), "\\s+");

 

// 过滤掉脏数据 if (valueItems.length != 2) {

 

return;

 

}

 

joinKey.set(valueItems[0]); combineValue.set("station.txt" + valueItems[1]); context.write(joinKey, combineValue);

}

}

}
/* * reduce 端做笛卡尔积 */
public static class SemiJoinReducer extends Reducer<Text, Text, Text, Text> {
private List<String> leftTable = new ArrayList<String>();

 private List<String> rightTable = new ArrayList<String>();

 private Text result = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
// 一定要清空数据

leftTable.clear(); rightTable.clear();
// 相同key的记录会分组到一起，我们需要把相同key下来自于不同表的数据分开，然后做笛卡尔

 for (Text value : values) {
String val = value.toString(); System.out.println("value=" + val); if (val.startsWith("station.txt")) {
leftTable.add(val.replaceFirst("station.txt", "")); } else if (val.startsWith("records‐semi.txt")) {
rightTable.add(val.replaceFirst("records‐semi.txt", ""));
}
}
// 笛卡尔积
for (String leftPart : leftTable) {
for (String rightPart : rightTable) { result.set(leftPart + "\t" + rightPart); context.write(key, result);
}

 

}

 

}

 

}

 

public static void main(String[] args) throws Exception { Configuration conf = new Configuration();

String[] otherArgs = new GenericOptionsParser(conf, args)

.getRemainingArgs(); if (otherArgs.length < 2) {

System.err.println("Usage: semijoin <in> [<in>...] <out>");

System.exit(2);

}

//输出路径

Path mypath = new Path(otherArgs[otherArgs.length ‐ 1]);

 FileSystem hdfs = mypath.getFileSystem(conf);// 创建输出路径 if (hdfs.isDirectory(mypath)) {

hdfs.delete(mypath, true);

}

Job job = Job.getInstance(conf, "SemiJoin");

//添加缓存文件

 

job.addCacheFile(new Path(otherArgs[0]).toUri());

 job.setJarByClass(SemiJoin.class); 

job.setMapperClass(SemiJoinMapper.class); 

job.setReducerClass(SemiJoinReducer.class); 

job.setOutputKeyClass(Text.class); 

job.setOutputValueClass(Text.class); //添加输入路径

 for (int i = 1; i < otherArgs.length - 1; ++i) {

FileInputFormat.addInputPath(job, new Path(otherArgs[i]));

}

//添加输出路径

FileOutputFormat.setOutputPath(job, new Path( otherArgs[otherArgs.length - 1]));

System.exit(job.waitForCompletion(true) ? 0 : 1);

}

}

在该样本数据上运行程序，获得以下输出结果。

011990‐99999	SIHCCAJAVRI	195005150700	0
011990‐99999	SIHCCAJAVRI	195005151200	22
011990‐99999	SIHCCAJAVRI	195005151800	‐11
012650‐99999	TYNSET‐HANSMOEN 194903241200		111
012650‐99999	TYNSET‐HANSMOEN 194903241800		78

Reduce join + BloomFilter

● 使用场景：一个大表（表中的key内存仍然放不下），一个超大表

在某些情况下，SemiJoin抽取出来的小表的key集合在内存中仍然存放不下，这时候可以使用BloomFilter以节省空间。

BloomFilter最常见的作用是：判断某个元素是否在一个集合里面。它最重要的两个方法是：add() 和 membershipTest()。

因而可将小表中的key保存到BloomFilter中，在map阶段过滤大表，可能有一些不在小表中的记录没有过滤掉（但是在小表中的记录一定不会过滤掉），这没关系，只不过增加了少量的网络IO而已。

● BloomFilter参数计算方式：

n：小表中的记录数。

m：位数组大小，一般m是n的倍数，倍数越大误判率就越小，但是也有内存限制，不能太大，这个值需要反复测试得出。

k：hash个数，最优hash个数值为：k = ln2 * (m/n)

● 代码实现：

import java.io.BufferedReader; import java.io.FileReader;
import java.io.IOException; import java.util.ArrayList; import java.util.List;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.bloom.BloomFilter; import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

 /*
* 一个大表，一个小表
* map 阶段：BloomFilter 解决小表的key集合在内存中仍然存放不下的场景，过滤大表
* reduce 阶段：reduce side join
*/
public class BloomFilteringDriver { /**
* 为来自不同表(文件)的key/value对打标签以区别不同来源的记录
* 然后用连接字段作为key，其余部分和新加的标志作为value，最后进行输出。
*/
public static class BloomFilteringMapper extends Mapper<Object, Text, Text, Text> {
/ 第一个参数是vector的大小，这个值尽量给的大，可以避免hash对象的时候出现索引重复
/ 第二个参数是散列函数的个数
/ 第三个是hash的类型，虽然是int型，但是只有默认两个值
/ 哈希函数个数k、位数组大小m及字符串数量n之间存在相互关系
//n 为小表记录数,给定允许的错误率E，可以确定合适的位数组大小，即m >= log2(e) * (n * log2(1/ // 给定m和n，可以确定最优hash个数，即k = ln2 * (m/n)，此时错误率最小 private BloomFilter filter = new BloomFilter(10000, 6, Hash.MURMUR_HASH);
private Text joinKey = new Text(); private Text combineValue = new Text();
/** * 获取分布式缓存文件*/
@SuppressWarnings("deprecation")
protected void setup(Context context) throws IOException, InterruptedException {
BufferedReader br; String infoAddr = null; // 返回缓存文件路径
Path[] cacheFilesPaths = context.getLocalCacheFiles(); for (Path path : cacheFilesPaths) {
String pathStr = path.toString();

br = new BufferedReader(new FileReader(pathStr)); while (null != (infoAddr = br.readLine())) {

// 按行读取并解析气象站数据

String[] records = StringUtils.split(infoAddr.toString(), "\\s+");

 

if (null != records)// key为stationID filter.add(new Key(records[0].getBytes()));

 

}

 

}

 

}

 

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String pathName = ((FileSplit) context.getInputSplit()).getPath().toString(); // 如果数据来自于records，加一个records的标记

if (pathName.endsWith("records‐semi.txt")) {
String[] valueItems = StringUtils.split(value.toString(), "\\s+");
// 过滤掉脏数据 if (valueItems.length != 3) {
return;
}
//通过filter 过滤大表中的数据 if (filter.membershipTest(new Key(valueItems[0].getBytes()))) {
joinKey.set(valueItems[0]); combineValue.set("records‐semi.txt" + valueItems[1] + "\t"
+ valueItems[2]); context.write(joinKey, combineValue);
}
} else if (pathName.endsWith("station.txt")) {
//如果数据来自于station，加一个station的标记
String[] valueItems = StringUtils.split(value.toString(), "\\s+");
// 过滤掉脏数据

if (valueItems.length != 2) {
return;
}
joinKey.set(valueItems[0]); combineValue.set("station.txt" + valueItems[1]); context.write(joinKey, combineValue);
}
}

 

}

 

/* * reduce 端做笛卡尔积 */

 

public static class BloomFilteringReducer extends Reducer<Text, Text, Text, Text> {

private List<String> leftTable = new ArrayList<String>(); private List<String> rightTable = new ArrayList<String>(); private Text result = new Text();

 

@Override

 

protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

 

/ 一定要清空数据 leftTable.clear(); rightTable.clear();

 

/ 相同key的记录会分组到一起，我们需要把相同key下来自于不同表的数据分开，然后做笛卡尔 for (Text value : values) {

String val = value.toString(); System.out.println("value=" + val);

if (val.startsWith("station.txt")) { leftTable.add(val.replaceFirst("station.txt", ""));

} else if (val.startsWith("records‐semi.txt")) { rightTable.add(val.replaceFirst("records‐semi.txt", ""));

}

 

}

 

// 笛卡尔积

 

for (String leftPart : leftTable) {

 

for (String rightPart : rightTable) { result.set(leftPart + "\t" + rightPart); context.write(key, result);

 

}

 

}

 

}

 

}

public static void main(String[] args) throws Exception { Configuration conf = new Configuration();

String[] otherArgs = new GenericOptionsParser(conf, args)

 

.getRemainingArgs(); if (otherArgs.length < 2) {

System.err.println("Usage: BloomFilter <in> [<in>...] <out>");

 

System.exit(2);

 

}

 

//输出路径
Path mypath = new Path(otherArgs[otherArgs.length ‐ 1]);

FileSystem hdfs = mypath.getFileSystem(conf);// 创建输出路径

 if (hdfs.isDirectory(mypath)) {

 

hdfs.delete(mypath, true);

 

}

 

Job job = Job.getInstance(conf, "bloomfilter");

 

//添加缓存文件

job.addCacheFile(new Path(otherArgs[0]).toUri()); job.setJarByClass(BloomFilteringDriver.class);

 job.setMapperClass(BloomFilteringMapper.class); job.setReducerClass(BloomFilteringReducer.class);

job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); //添加输入文件

for (int i = 1; i < otherArgs.length - 1; ++i) {

 

FileInputFormat.addInputPath(job, new Path(otherArgs[i]));

 

}



//设置输出路径

FileOutputFormat.setOutputPath(job, new Path( otherArgs[otherArgs.length - 1]));

System.exit(job.waitForCompletion(true) ? 0 : 1);

}

}

该样本数据上运行程序，获得以下输出结果。

011990‐99999	SIHCCAJAVRI	195005150700	0
011990‐99999	SIHCCAJAVRI	195005151200	22
011990‐99999	SIHCCAJAVRI	195005151800	‐11
012650‐99999	TYNSET‐HANSMOEN 194903241200		111
012650‐99999	TYNSET‐HANSMOEN 194903241800		78

总结

三种join方式适用于不同的场景，其处理效率上相差很大，其主要导致因素是网络传输。Map join效率最高，其次是SemiJoin，最低的是reduce join。另外，写分布式大数据处理程序时，最好先对要处理的数据的整体分布情况作一个了解，这可以提高我们代码的效率，使数据的倾斜度降到最低，使我们的代码更有针对性。