版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/HLK_1135/article/details/78988141
参考文献:
- 拼写纠错功能实现
- 贝叶斯公式与拼写检查器
- big.txt
- 《数学之美》、《统计学习方法》
github源码:
https://github.com/hlk-1135/Dictionary
算法核心:贝叶斯算法:
运行效果:
当我们输入的单词有误时,贝叶斯算法开始派上用场了,对我们所输入的单词进行检查纠正,并进行一定的猜测。
public class SpellChecker {
private static final char[] alphabets = "abcdefghijklmnopqrstuvwxyz".toCharArray();
public void start() throws IOException {
//1.构建语言模型
String path = "E:\\big.txt";
Map<String, Double> languModel = buildLanguageModel(path);
Set<String> dictionary = languModel.keySet();
while((input = reader.readLine()) != null) {
input = input.trim().toLowerCase();
if("bye".equals(input))
break;
if(dictionary.contains(input))
continue;
long startTime = System.currentTimeMillis();
//3.在编辑距离内设置一个单词集,并删除字典中不存在的单词
Set<String> wordsInEditDistance = buildEditDistance1Set(languModel, input);
wordsInEditDistance.retainAll(dictionary);
if(wordsInEditDistance.isEmpty()) {
wordsInEditDistance = buildEditDistance2Set(languModel, input);
wordsInEditDistance.retainAll(dictionary);
if (wordsInEditDistance.isEmpty()) {
System.out.println("Failed to check this word!");
continue;
}
}
// 4.计算所以可能的概率
List<String> guessWords = guessRightWord(languModel, wordsInEditDistance);
System.out.printf("Do you want to input %s and Cost time: %.10f second(s)\n",
guessWords.toString(), (System.currentTimeMillis() - startTime) / 1000D);
}
}
/**
* 读取语料库big.txt,构建模型
* @param path
* @return
* @throws IOException
*/
private Map<String, Double> buildLanguageModel(String path) throws IOException {
Map<String, Double> languModel = new HashMap<String, Double>();
BufferedReader reader = new BufferedReader(new FileReader(path));
//去掉文档中除字母外的所有符号
Pattern pattern = Pattern.compile("[a-zA-Z]+");
String line;
int totalCount = 0;
while ((line = reader.readLine()) != null) {
String[] words = line.split(" ");
for(String word : words) {
if(pattern.matcher(word).matches()) {
word = word.toLowerCase();
Double wordCount = languModel.get(word);
if(wordCount == null) {
languModel.put(word, 1D);
} else {
languModel.put(word, wordCount+1D);
}
totalCount++;
}
}
}
reader.close();
for(Entry<String, Double> entry : languModel.entrySet())
entry.setValue(entry.getValue() / totalCount);
return languModel;
}
/**
* 编辑距离为1的单词集合
* @param languModel
* @param input
* @return
*/
private Set<String> buildEditDistance1Set(Map<String, Double> languModel,String input) {
Set<String> wordsInEditDistance = new HashSet<String>();
char[] characters = input.toCharArray();
// 删除:删除一个字母的情况,delete letter[i]
for(int i=0;i<input.length();i++) {
wordsInEditDistance.add(input.substring(0,i) + input.substring(i+1));
}
// 换位: 交换letter[i] and letter[i+1]
for(int i=0;i<input.length()-1;i++) {
wordsInEditDistance.add(input.substring(0,i) + characters[i+1]
+ characters[i] + input.substring(i+2));
}
// 替换: 将 letter[i]替换为a-z
for(int i=0;i<input.length();i++) {
for(char c : alphabets) {
wordsInEditDistance.add(input.substring(0,i) + c + input.substring(i+1));
}
}
// 插入: 插入一个新的字母 a-z
for(int i=0;i<input.length()+1;i++){
for(char c : alphabets) {
wordsInEditDistance.add(input.substring(0,i) + c + input.substring(i));
}
}
return wordsInEditDistance;
}
/**
* 编辑距离为2的集合.通过editDistance1函数得到编辑距离为1的集合,
* 该集合单词再通过editDistance1函数,就可以得到编辑距离为2的集合
* @param languModel
* @param input
* @return
*/
private Set<String> buildEditDistance2Set(Map<String, Double> languModel,String input) {
Set<String> wordsInEditDistance1 = buildEditDistance1Set(languModel, input);
Set<String> wordsInEditDistance2 = new HashSet<String>();
for(String editDistance1 : wordsInEditDistance1) {
wordsInEditDistance2.addAll(buildEditDistance1Set(languModel, input));
}
wordsInEditDistance2.addAll(wordsInEditDistance1);
return wordsInEditDistance2;
}
/**
* 从语料库中获取正确单词
* @param languModel
* @param wordsInEditDistance
* @return
*/
private List<String> guessRightWord(final Map<String, Double> languModel,Set<String> wordsInEditDistance){
List<String> words = new LinkedList<String>(wordsInEditDistance);
//按照单词在字库中出现的频率大小排序,频率越大出现的可能性越大
Collections.sort(words, new Comparator<String>() {
@Override
public int compare(String word1, String word2) {
return languModel.get(word2).compareTo(languModel.get(word1));
}
});
return words.size() > 5 ? words.subList(0, 5) : words;
}
}