MapReduce的倒排索引
索引:
什么是索引:索引(Index)是帮助数据库高效获取数据的数据结构。索引是基于数据库表创建的,它包含表中某些列的值以及对应记录的地址,并把这些值存储在一个数据结构中。最常见的是使用哈希表、B+树作为索引。
索引的具体分析:https://blog.csdn.net/meiLin_Ya/article/details/80854232
用代码说事,先来看看我的数据吧:
package com.huhu.day05;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.huhu.day04.ProgenyCount;
/**
 * Inverted-index MapReduce job: for every word in the input files, emits the
 * word as the key and a posting ("file name : line byte-offset, character
 * position in the line") as the value; the reducer concatenates all postings
 * for a word into a single output line.
 *
 * Input and output HDFS paths are currently hardcoded in {@link #run(String[])};
 * command-line arguments are parsed but not yet used for the paths.
 */
public class InvertedIndex extends ToolRunner implements Tool {

	/** Job configuration injected by ToolRunner via setConf(). */
	private Configuration conf;

	/**
	 * Mapper: splits each line on single spaces and, for every token, writes
	 * key = "搜索词:" + token and value = a posting describing where the token
	 * was seen (source file name, line byte offset, character index in line).
	 */
	public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
		private FileSplit split;
		private Text va = new Text();

		@Override
		protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			// Remember which input file this split belongs to so every posting
			// emitted by map() can carry its source file name.
			split = (FileSplit) context.getInputSplit();
		}

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String[] line = value.toString().split(" ");
			String filename = split.getPath().getName();
			for (String s : line) {
				// key.get() is the byte offset of this line in the file;
				// indexOf(s) is the character position of the FIRST occurrence
				// of the token in the line (repeated words report the same position).
				va.set("fileName :" + filename + ":" + key.get() + "\t索引位置:" + value.toString().indexOf(s) + "\t");
				context.write(new Text("搜索词:" + s + "\r"), new Text(va));
			}
		}
	}

	/**
	 * Reducer: concatenates all postings for one word into a single line.
	 */
	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// StringBuilder instead of StringBuffer: no synchronization needed
			// inside a single reduce() call.
			StringBuilder sb = new StringBuilder();
			for (Text v : values) {
				sb.append(v.toString());
			}
			// The incoming key is already a Text; no defensive copy is needed
			// because context.write() serializes it immediately.
			context.write(key, new Text(sb.toString()));
		}
	}

	public static void main(String[] args) throws Exception {
		InvertedIndex t = new InvertedIndex();
		Configuration conf = t.getConf();
		// Strip generic Hadoop options (-D, -files, ...) from the argument list.
		String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (other.length != 2) {
			// Warning only: run() uses hardcoded HDFS paths, so the job can
			// still proceed without positional arguments.
			System.err.println("number is fail");
		}
		// Fix: pass the parsed remaining args, not the raw args, so generic
		// options are not handed to run() a second time.
		int run = ToolRunner.run(conf, t, other);
		System.exit(run);
	}

	@Override
	public Configuration getConf() {
		// Fix: lazily create AND cache the Configuration. The original
		// returned a fresh Configuration on every call, discarding any
		// settings applied between calls.
		if (conf == null) {
			conf = new Configuration();
		}
		return conf;
	}

	@Override
	public void setConf(Configuration arg0) {
		// Fix: the original body was empty, silently dropping the
		// configuration that ToolRunner.run() injects.
		this.conf = arg0;
	}

	@Override
	public int run(String[] other) throws Exception {
		Configuration con = getConf();
		Job job = Job.getInstance(con);
		// Fix: the job jar must be located from THIS driver class, not the
		// unrelated day04 ProgenyCount class.
		job.setJarByClass(InvertedIndex.class);
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		// Default partitioner (HashPartitioner) is used; no explicit setting needed.
		job.setReducerClass(MyReduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// NOTE(review): paths are hardcoded; consider using other[0]/other[1]
		// so the argument check in main() becomes meaningful.
		FileInputFormat.addInputPath(job, new Path("hdfs://ry-hadoop1:8020/in/day05/InvertedIndex"));
		Path path = new Path("hdfs://ry-hadoop1:8020/out/day05.txt");
		FileSystem fs = FileSystem.get(getConf());
		// Remove a stale output directory, otherwise the job fails to start.
		if (fs.exists(path)) {
			fs.delete(path, true);
		}
		FileOutputFormat.setOutputPath(job, path);
		return job.waitForCompletion(true) ? 0 : 1;
	}
}
索引很重要:
详情:https://blog.csdn.net/meiLin_Ya/article/details/80854232