Developing a transaction-log search engine based on Hadoop

Copyright notice: This is an original article by the author and may not be reproduced without permission. https://blog.csdn.net/huangqing510/article/details/44996719

The main technologies used in this project are Hadoop MapReduce, Redis, Ajax, JSON, Struts2 and so on, with Bootstrap as the front-end framework.

First, the main MapReduce program:

package org.hq.mr;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool {

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is a Tomcat access-log record; the request part
            // sits between the first pair of double quotes.
            String line = value.toString();
            String[] a = line.split("\"");
            if (a.length > 1 && a[1].indexOf("sug.jsp?query") > 0) {
                // "GET /xxx/sug.jsp?query=term HTTP/1.1" -> split on "query=" or a
                // space, so the search term is the third token.
                String[] b = a[1].split("query=| ");
                String term = b[2];
                try {
                    // the term arrives URL-encoded, decode it before counting
                    term = URLDecoder.decode(term, "UTF-8");
                } catch (UnsupportedEncodingException e) {
                    e.printStackTrace();
                }
                context.write(new Text(term), one);
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // sum up the occurrences of each search term
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf, "Load Redis");
        job.setJarByClass(WordCount.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        // the custom output format writes the counts into Redis instead of HDFS
        job.setOutputFormatClass(RedisOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        RedisOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new WordCount(), args);
        System.exit(ret);
    }
}
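The parsing in the mapper is easiest to see with a concrete line. The following stand-alone sketch is not part of the project; it simply replays the two split() calls on a made-up Tomcat access-log line (host, path and term are hypothetical) to show which token ends up as the search term:

import java.net.URLDecoder;

public class MapSplitDemo {
    public static void main(String[] args) throws Exception {
        // hypothetical access-log line of the kind the mapper receives
        String line = "192.168.178.1 - - [10/Apr/2015:10:00:00 +0800] "
                + "\"GET /suggestion/sug.jsp?query=hadoop HTTP/1.1\" 200 56";
        String[] a = line.split("\"");          // a[1] is the quoted request part
        String[] b = a[1].split("query=| ");    // split on "query=" or a space
        System.out.println(URLDecoder.decode(b[2], "UTF-8"));  // prints: hadoop
    }
}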

Next, Redis stores the user-request data as key-value pairs; a custom output format writes the MapReduce results straight into Redis:

package org.hq.mr;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import redis.clients.jedis.Jedis;

public class RedisOutputFormat<K, V> extends FileOutputFormat<K, V> {

    public static class RedisRecordWriter<K, V> extends RecordWriter<K, V> {
        private Jedis jedis;

        RedisRecordWriter(Jedis jedis) {
            this.jedis = jedis;
        }

        @Override
        public void write(K key, V value) throws IOException, InterruptedException {
            boolean nullkey = key == null || key instanceof NullWritable;
            boolean nullvalue = value == null;
            if (nullkey || nullvalue) return;

            // key is the search term and value is its count: for every prefix of
            // the term, add the full term to the sorted set named after that
            // prefix and raise its score by the count, so that each prefix maps
            // to a ranked list of suggestions
            String s = key.toString();
            for (int i = 0; i < s.length(); i++) {
                String k = s.substring(0, i + 1);
                int score = Integer.parseInt(value.toString());
                jedis.zincrby(k, score, s);
            }
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            jedis.disconnect();
        }
    }

    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        // every output record of the job is written straight into Redis on this host
        return new RedisRecordWriter<K, V>(new Jedis("192.168.178.130"));
    }
}

Note that in a Java environment Redis is accessed through Jedis, so the jedis jar has to be added to the project.
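To make the resulting data model concrete, here is a small stand-alone sketch (not part of the project; it assumes a Redis instance reachable on localhost and the jedis jar on the classpath). It indexes two hypothetical terms the same way RedisRecordWriter.write() does, then runs the kind of prefix lookup the front end performs:

import redis.clients.jedis.Jedis;

public class SuggestionDemo {
    public static void main(String[] args) {
        Jedis jedis = new Jedis("localhost");  // hypothetical local Redis

        // index side: for every prefix of a term, add the full term to the
        // sorted set named after that prefix, with the term's count as score
        index(jedis, "hadoop", 12);
        index(jedis, "hbase", 5);

        // lookup side: the top completions stored under the prefix "ha"
        // (here just "hadoop")
        System.out.println(jedis.zrevrange("ha", 0, 5));

        jedis.disconnect();
    }

    private static void index(Jedis jedis, String term, int count) {
        for (int i = 0; i < term.length(); i++) {
            jedis.zincrby(term.substring(0, i + 1), count, term);
        }
    }
}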

Next comes the front-end development: Ajax and JSON are used to handle the user's input, Bootstrap is used as the front-end framework, and Struts2 handles the business logic.

Sug.java:

package org.robby.suggestion;

import java.util.HashSet;
import java.util.Set;

import redis.clients.jedis.Jedis;

import com.opensymphony.xwork2.ActionSupport;

public class Sug extends ActionSupport {
    String query;
    Set<String> result;

    public String getQuery() {
        return query;
    }

    public void setQuery(String query) {
        this.query = query;
    }

    public Set<String> getResult() {
        // called by the Struts2 JSON result: look up the suggestions for the
        // prefixes the user has typed and return the top entries by score
        if (query == null || query.trim().length() == 0) {
            return new HashSet<String>();
        }
        Jedis jedis = new Jedis("192.168.178.130");
        String[] a = query.trim().split(" ");

        // intersect the sorted sets of all typed prefixes into a temporary key,
        // then read back the six highest-scoring suggestions
        jedis.zinterstore("_tmpwords", a);
        result = jedis.zrevrange("_tmpwords", 0, 5);
        jedis.del("_tmpwords");
        jedis.disconnect();

        return result;
    }

    public void setResult(Set<String> result) {
        this.result = result;
    }

    public String execute() throws Exception {
        return SUCCESS;
    }
}

struts.xml:

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE struts PUBLIC "-//Apache Software Foundation//DTD Struts Configuration 2.1//EN" "http://struts.apache.org/dtds/struts-2.1.dtd">
<struts>
  <constant name="struts.devMode" value="true" /> 
  <constant name="struts.action.extension" value="do"/> 
  <package name="ajax" extends="json-default">
  <action name="sug" class="org.robby.suggestion.Sug">
  <result type="json"></result>
  </action>
  </package>
</struts> 
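With the package extending json-default and a result of type json, Struts2 serializes the action's properties into the HTTP response. For a request such as sug.do?query=ha (hypothetical prefix), the body contains the suggestion set roughly in the form {"query":"ha","result":["hadoop", ...]} (illustrative values; the output also includes the other ActionSupport properties unless they are excluded), so the Bootstrap page can fetch suggestions with a plain Ajax GET and fill the drop-down from the result array.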

The jar packages that the code above depends on, the Bootstrap assets, and the front-end page are shown in the accompanying screenshots.

On Linux, Redis needs to be installed first; the version used here is 3.0.

The server used on Linux is Tomcat 6, together with the fatjar packaging plugin for bundling the Java classes into a single jar.

To automate this processing on Linux, a shell script was also written:

tmpfile=`date '+%Y%m%d%H%M%S'`.txt
echo $tmpfile

# collect all new Tomcat access logs into one temporary file
for i in `find /home/hq/tmp/log -name 'localhost*'`
do
  echo $i
  if [ -s $i ]
  then
    cat $i >> $tmpfile
    rm -f $i
  fi
done

# if anything was collected, upload it to HDFS and run the MapReduce job
if [ -s $tmpfile ]
then
  hadoop fs -put $tmpfile /input
  rm $tmpfile
  hadoop fs -rmr /output
  hadoop jar count.jar org.hq.mr.WordCount /input /output
  hadoop fs -rm /input/*
fi

Once everything is in place, the front end can show a drop-down of suggestions based on how frequently each term has been searched.

Note that you need to know the common Hadoop commands, as well as the common Redis sorted-set commands: how to add key-value pairs (members) and how scores are assigned and updated.
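For reference, a minimal sketch of those Redis operations through Jedis (assuming a local Redis instance; the key and member names are made up):

import redis.clients.jedis.Jedis;

public class RedisBasicsDemo {
    public static void main(String[] args) {
        Jedis jedis = new Jedis("localhost");      // hypothetical local instance
        // add a member to a sorted set with an initial score ("add a key-value pair")
        jedis.zadd("demo", 1, "hadoop");
        // raise the member's score by 5 ("scoring")
        jedis.zincrby("demo", 5, "hadoop");
        // read the score back and list members from highest to lowest score
        System.out.println(jedis.zscore("demo", "hadoop"));   // 6.0
        System.out.println(jedis.zrevrange("demo", 0, -1));   // [hadoop]
        jedis.disconnect();
    }
}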

