Problem to solve: count the occurrences of the last word of each line (input format below)
a,b,c,d
e,d,s,d
a,s,g,w
…..
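(Interpretation check: the code below treats the task as "for every word that appears as some line's last word, report its total number of occurrences anywhere in the input". For just the three sample lines above, that would mean an output of:
d	3
w	1
since d ends lines 1 and 2 and occurs three times in total, and w ends line 3.)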
Solution process and reflections (careful, there are pitfalls ahead):
/**
* @Title: Demos.java
* @Author:youxiangyang
* @Date: 6:50:07 PM
*/
package mr;
import java.io.IOException;
import java.util.HashMap;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/**
* @read here
* Today I ran into the following problem:
* count the occurrences of the last word of each line (format below)
* a,b,c,d
* e,d,s,d
* s,d,f,g
* a,s,g,w
* ...........
* Below is my own solution. It is rather convoluted... and not recommended. I am
* writing it out anyway, as a pit for others to avoid; please don't jump into it again.
* My idea is this: the task is really a variant of wordcount. So in the map phase I
* append a marker to the last word of each line, which lets the reduce phase single
* those words out: an if test puts marked keys into one HashMap and unmarked keys into
* another. Then, in cleanup, I do the final pass over both maps: whenever a key from
* the marked map, with its marker stripped, matches a key in the other map, I add that
* word's count to the total, and finally write the results out. The idea is not
* complicated, but it is definitely not a good method; I hope someone can offer a
* better one.
*
* In the end a classmate suggested another method. Cough... it later turned out to be
* wrong as well, but the idea behind it was still interesting.
*/
public class Demos {
/**
* @param args
*/
public static void main(String[] args) {
//The main method is omitted; its contents are the usual fixed boilerplate,
//so this saves a little time. Only the map and reduce are provided.
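//[Editor's sketch] For reference, the omitted driver would look roughly like
//this (assuming the Hadoop 2.x mapreduce API; the job name and paths are
//placeholders):
//  Configuration conf = new Configuration();
//  Job job = Job.getInstance(conf, "last word count");
//  job.setJarByClass(Demos.class);
//  job.setMapperClass(wordMap.class);
//  job.setReducerClass(wordReduce.class);
//  job.setNumReduceTasks(1); // the HashMap merge in cleanup needs a single reducer
//  job.setOutputKeyClass(Text.class);
//  job.setOutputValueClass(IntWritable.class);
//  FileInputFormat.addInputPath(job, new Path(args[0]));
//  FileOutputFormat.setOutputPath(job, new Path(args[1]));
//  System.exit(job.waitForCompletion(true) ? 0 : 1);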
}
public static class wordMap extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text,IntWritable>.Context context)
throws IOException, InterruptedException {
String[] lines = value.toString().split(",");
for (int i = 0; i < lines.length; i++) {
if (i==(lines.length-1)) {
//tag the line's last word with "&&" so the reducer can tell it apart
context.write(new Text(lines[i]+"&&"), new IntWritable(1));
}else {
context.write(new Text(lines[i]), new IntWritable(1));
}
}
}
}
public static class wordReduce extends Reducer<Text, IntWritable, Text, IntWritable>{
//map1: counts of words seen in non-final positions
//map2: counts of "&&"-tagged words (those that ended a line)
HashMap<String, Integer> map1=new HashMap<String,Integer>();
HashMap<String, Integer> map2=new HashMap<String,Integer>();
@Override
protected void reduce(Text k1, Iterable<IntWritable> v1,
Reducer<Text, IntWritable, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
int nums = 0;
for (IntWritable num : v1) {
nums+=num.get();
}
//keys carrying the "&&" tag were last words; stash the two kinds separately
if (k1.toString().contains("&&")) {
map2.put(k1.toString(), nums);
}else {
map1.put(k1.toString(), nums);
}
}
@Override
protected void cleanup(
Reducer<Text, IntWritable, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
//Final merge of map2 and map1: for every tagged word, strip the two-character
//"&&" tag, add the count of its untagged occurrences (if any), and emit the total.
//Note: Java's substring() does not accept a negative index the way Python slicing
//does; the original substring(-1) would throw StringIndexOutOfBoundsException.
//This merge also assumes a single reducer, otherwise "d" and "d&&" might be
//partitioned to different reducer instances and never meet.
for(String key:map2.keySet()){
String word = key.substring(0, key.length()-2);
int nums = map2.get(key);
if (map1.containsKey(word)) {
nums += map1.get(word);
}
context.write(new Text(word), new IntWritable(nums));
}
}
}
}
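For reference, here is the kind of simpler alternative the author asks for: a minimal, untested sketch (the class names LastWordCount/LastWordMap/LastWordReduce are my own). Instead of tagging the map output key, encode a pair "occurrences:lastFlag" in the map output value. Each word is then a single reduce key, so no HashMaps, no cleanup pass, and no single-reducer assumption are needed; the reducer just sums both fields and emits a word only if it ever appeared in last position.

package mr;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class LastWordCount {
    public static class LastWordMap extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] words = value.toString().split(",");
            for (int i = 0; i < words.length; i++) {
                // value is "occurrences:lastFlag": "1:1" for the line's last word, "1:0" otherwise
                String flag = (i == words.length - 1) ? "1" : "0";
                context.write(new Text(words[i].trim()), new Text("1:" + flag));
            }
        }
    }
    public static class LastWordReduce extends Reducer<Text, Text, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int total = 0;     // occurrences anywhere in the input
            int lastCount = 0; // occurrences in last position
            for (Text v : values) {
                String[] parts = v.toString().split(":");
                total += Integer.parseInt(parts[0]);
                lastCount += Integer.parseInt(parts[1]);
            }
            if (lastCount > 0) { // only report words that ended at least one line
                context.write(key, new IntWritable(total));
            }
        }
    }
}

Since the map output value type (Text) differs from the final output value type (IntWritable), the driver for this version would also need job.setMapOutputValueClass(Text.class).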
Below is the suggestion my classmate gave. NOTE: the approach is wrong!! Please don't fall into this pit!
/**
* @Title: Demo02.java
* @Author:youxiangyang
* @Date: 7:37:45 PM
*/
package mr;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/**
* @author AURA
*
*/
public class Demo02 {
public static void main(String[] args) {
}
public static class wmap extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
String[] lines = value.toString().split(",");
String endword = lines[lines.length-1].trim();
//count how many more times the last word shows up earlier in the same line
int nums=0;
for (int i = 0; i < lines.length-1; i++) {
if (lines[i].trim().equals(endword)) {
nums++;
}
}
context.write(new Text(endword), new IntWritable(1+nums));
//At first glance this looks quite good, but it misses a lot!
//Say this line is: a,v,s,d
//and the next line is: q,w,s,v
//Counted this way, one of the "v"s is lost....
}
}
public static class wreduce extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> vIterable,
Reducer<Text, IntWritable, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
int nums=0;
for (IntWritable num: vIterable) {
nums += num.get(); //the original read "nums += nums", which leaves nums at 0 forever
}
context.write(key, new IntWritable(nums));
}
}
}
Reflection: stay flexible... don't just follow the book... and learn more from others.