代码实现【mini版】——给定a、b两个文件,各存放50亿个url,每个url各占64字节,内存限制是4G,让你找出a、b文件共同的url?

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接: https://blog.csdn.net/qq_41946557/article/details/102711573

根据https://blog.csdn.net/qq_41946557/article/details/102708186思想去写!!!

把文本文件中的每一行当作一条url记录:两个文件各有100条记录,逐行读取并计算 hash(url) % 5,再按结果把该行写入对应的小文件。

首先准备两个文件:xxoo.txt   ooxx.txt

取模并且存储到5个小文件中。代码清晰易懂。适合初学。

package com.henu;

import java.io.*;
/**
 * @author George
 * @description Splits a large file into small files.
 * Every line of the input text file stands in for one url. Each of the two
 * inputs is read line by line and every line is appended to the partition
 * file selected by hash(url) % 5, so identical lines from both inputs always
 * end up in partition files with the same number.
 **/
public class HashClass {

    /** Number of partition files produced for each input file. */
    private static final int BUCKET_COUNT = 5;

    /**
     * Partitions d://ooxx.txt into d://ooxx1.txt .. d://ooxx5.txt and then
     * d://xxoo.txt into d://xxoo1.txt .. d://xxoo5.txt.
     *
     * @param args unused
     * @throws IOException if an input cannot be read or a partition file cannot be written
     */
    public static void main(String[] args) throws IOException {
        // Both inputs go through the same routine so that equal lines get equal bucket numbers.
        partition("d://ooxx");
        partition("d://xxoo");
    }

    /**
     * Reads {@code basePath + ".txt"} line by line and appends each line, terminated by
     * "\r\n", to {@code basePath + k + ".txt"} where {@code k = toHash(line) + 1}.
     */
    private static void partition(String basePath) throws IOException {
        BufferedWriter[] writers = new BufferedWriter[BUCKET_COUNT];
        try (BufferedReader reader = new BufferedReader(new FileReader(basePath + ".txt"))) {
            for (int i = 0; i < BUCKET_COUNT; i++) {
                // NOTE: the files are opened in append mode, so running the program again
                // adds the same lines a second time; delete old partition files before a rerun.
                writers[i] = new BufferedWriter(new FileWriter(basePath + (i + 1) + ".txt", true));
            }
            String line;
            while ((line = reader.readLine()) != null) {
                // toHash is always in [0, BUCKET_COUNT), so it can index the array directly.
                BufferedWriter target = writers[toHash(line)];
                target.write(line);
                target.write("\r\n");
            }
        } finally {
            // Runs on success and on failure, so no writer is left open (and unflushed).
            closeAll(writers);
        }
    }

    /**
     * Closes every non-null writer, attempting all of them even if one close fails;
     * the first failure is rethrown with later ones attached as suppressed.
     */
    private static void closeAll(BufferedWriter[] writers) throws IOException {
        IOException first = null;
        for (BufferedWriter writer : writers) {
            if (writer == null) {
                continue; // never opened because an earlier open failed
            }
            try {
                writer.close();
            } catch (IOException e) {
                if (first == null) {
                    first = e;
                } else {
                    first.addSuppressed(e);
                }
            }
        }
        if (first != null) {
            throw first;
        }
    }

    /**
     * Converts a string into a bucket index.
     *
     * @param key the string to hash; must not be null
     * @return a value in the range 0 (inclusive) to 5 (exclusive); 0 for the empty string
     */
    public static int toHash(String key) {
        int arraySize = BUCKET_COUNT; // table sizes are usually chosen to be prime
        int hashCode = 0;
        for (int i = 0; i < key.length(); i++) { // process the string from its left end
            // Map the character to a number: 'a' has code 97, so 97 - 96 = 1 stands for a,
            // b = 2, and so on. Characters below code 96 (digits, quotes, uppercase
            // letters) yield a negative value here.
            int letterValue = key.charAt(i) - 96;
            // Reduce at every step so the running value can never overflow. floorMod is
            // used instead of % because % keeps the sign of a negative dividend, which
            // would make the result range from -4 to 4 instead of 0 to 4.
            hashCode = Math.floorMod((hashCode << 5) + letterValue, arraySize);
        }
        return hashCode;
    }
}

* 进行比较,将 ooxx1 - xxoo1
*            ooxx2 - xxoo2
*            ooxx3 - xxoo3
*            ooxx4 - xxoo4
*            ooxx5 - xxoo5

方法:将 ooxx 的某个小文件的每一行放入 set 集合中,然后逐行读取编号相同的 xxoo 小文件,如果 set 中包含该行,则输出!!!!

package com.henu;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;

/**
 * @author George
 * @description Performs the comparison.
 * For each partition number 1..5, loads the lines of d://ooxxN.txt into a set and
 * prints every line of d://xxooN.txt that the set contains.
 **/
public class HashClassEnd {

    /**
     * Prints the lines common to each pair of equally numbered partition files.
     *
     * @param args unused
     * @throws IOException if a partition file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        for (int j = 1; j < 6; j++) {
            String oPath = "d://ooxx" + j + ".txt";
            String xPath = "d://xxoo" + j + ".txt";

            // A fresh set per partition keeps only one small file in memory at a time,
            // which is the reason the inputs were partitioned in the first place.
            HashSet<String> set = new HashSet<>();

            // try-with-resources closes each reader even when reading fails.
            try (BufferedReader br = new BufferedReader(new FileReader(oPath))) {
                String line;
                while ((line = br.readLine()) != null) {
                    set.add(line);
                }
            }

            try (BufferedReader br = new BufferedReader(new FileReader(xPath))) {
                String line;
                while ((line = br.readLine()) != null) {
                    if (set.contains(line)) {
                        System.out.println(line);
                    }
                }
            }
        }
    }
}

输出结果:

"220764-7013"    "2014/9/22 13:25:00.000"    "0"    "0"    ""
"220764-7267"    "2014/9/22 10:45:00.000"    "0"    "0"    ""
"220764-7266"    "2014/9/22 11:45:00.000"    "2"    "0"    ""

结束!!!

欢迎指正,代码写的比较浅显,是为了理解,可以优化。

猜你喜欢

转载自blog.csdn.net/qq_41946557/article/details/102711573