MapReduce的倒排索引
索引:
什么是索引:索引(Index)是帮助数据库高效获取数据的数据结构。索引是基于数据库表创建的,它包含表中某些列的值以及对应记录的地址,并把这些值存储在一个数据结构中。最常见的是使用哈希表、B+树作为索引。
索引的具体分析:https://blog.csdn.net/meiLin_Ya/article/details/80854232
用代码说事,先来看看我的数据吧:
package com.huhu.day05;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.huhu.day04.ProgenyCount;
/**
 * Inverted-index MapReduce job: for every word in the input files, emits the
 * word as the key and a posting ("file name : line byte-offset, character
 * position in the line") as the value; the reducer concatenates all postings
 * for a word into a single output line.
 *
 * Input and output HDFS paths are currently hardcoded in {@link #run(String[])};
 * command-line arguments are parsed but not yet used for the paths.
 */
public class InvertedIndex extends ToolRunner implements Tool {

	/** Job configuration injected by ToolRunner via setConf(). */
	private Configuration conf;

	/**
	 * Mapper: splits each line on single spaces and, for every token, writes
	 * key = "搜索词:" + token and value = a posting describing where the token
	 * was seen (source file name, line byte offset, character index in line).
	 */
	public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
		private FileSplit split;
		private Text va = new Text();

		@Override
		protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			// Remember which input file this split belongs to so every posting
			// emitted by map() can carry its source file name.
			split = (FileSplit) context.getInputSplit();
		}

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String[] line = value.toString().split(" ");
			String filename = split.getPath().getName();
			for (String s : line) {
				// key.get() is the byte offset of this line in the file;
				// indexOf(s) is the character position of the FIRST occurrence
				// of the token in the line (repeated words report the same position).
				va.set("fileName :" + filename + ":" + key.get() + "\t索引位置:" + value.toString().indexOf(s) + "\t");
				context.write(new Text("搜索词:" + s + "\r"), new Text(va));
			}
		}
	}

	/**
	 * Reducer: concatenates all postings for one word into a single line.
	 */
	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// StringBuilder instead of StringBuffer: no synchronization needed
			// inside a single reduce() call.
			StringBuilder sb = new StringBuilder();
			for (Text v : values) {
				sb.append(v.toString());
			}
			// The incoming key is already a Text; no defensive copy is needed
			// because context.write() serializes it immediately.
			context.write(key, new Text(sb.toString()));
		}
	}

	public static void main(String[] args) throws Exception {
		InvertedIndex t = new InvertedIndex();
		Configuration conf = t.getConf();
		// Strip generic Hadoop options (-D, -files, ...) from the argument list.
		String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (other.length != 2) {
			// Warning only: run() uses hardcoded HDFS paths, so the job can
			// still proceed without positional arguments.
			System.err.println("number is fail");
		}
		// Fix: pass the parsed remaining args, not the raw args, so generic
		// options are not handed to run() a second time.
		int run = ToolRunner.run(conf, t, other);
		System.exit(run);
	}

	@Override
	public Configuration getConf() {
		// Fix: lazily create AND cache the Configuration. The original
		// returned a fresh Configuration on every call, discarding any
		// settings applied between calls.
		if (conf == null) {
			conf = new Configuration();
		}
		return conf;
	}

	@Override
	public void setConf(Configuration arg0) {
		// Fix: the original body was empty, silently dropping the
		// configuration that ToolRunner.run() injects.
		this.conf = arg0;
	}

	@Override
	public int run(String[] other) throws Exception {
		Configuration con = getConf();
		Job job = Job.getInstance(con);
		// Fix: the job jar must be located from THIS driver class, not the
		// unrelated day04 ProgenyCount class.
		job.setJarByClass(InvertedIndex.class);
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		// Default partitioner (HashPartitioner) is used; no explicit setting needed.
		job.setReducerClass(MyReduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// NOTE(review): paths are hardcoded; consider using other[0]/other[1]
		// so the argument check in main() becomes meaningful.
		FileInputFormat.addInputPath(job, new Path("hdfs://ry-hadoop1:8020/in/day05/InvertedIndex"));
		Path path = new Path("hdfs://ry-hadoop1:8020/out/day05.txt");
		FileSystem fs = FileSystem.get(getConf());
		// Remove a stale output directory, otherwise the job fails to start.
		if (fs.exists(path)) {
			fs.delete(path, true);
		}
		FileOutputFormat.setOutputPath(job, path);
		return job.waitForCompletion(true) ? 0 : 1;
	}
}
索引很重要:
详情:https://blog.csdn.net/meiLin_Ya/article/details/80854232