代码实现:
package com.snnu.demo;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
 * Chapter 2 exercise from the book: dictionary-based Chinese word segmentation.
 *
 * <p>Provides full segmentation plus forward, backward and bidirectional longest-match
 * segmentation against a word dictionary, and a {@code main} demo that prints the results
 * for a few sample sentences.
 *
 * @author yxx
 * @create 2020-12-16 10:04
 */
public class Word2Exercise {

    /** Dictionary file loaded by {@link #main} when no path is given on the command line. */
    private static final String DEFAULT_DICTIONARY_PATH =
            "D:\\tool\\data\\dictionary\\CoreNatureDictionary.mini.txt";

    /**
     * Loads the dictionary and prints the output of every segmentation algorithm in this class
     * for a fixed set of sample sentences.
     *
     * @param args optional; {@code args[0]} is the path of the dictionary file to load. When
     *             absent, {@link #DEFAULT_DICTIONARY_PATH} is used.
     * @throws IOException if the dictionary file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String dictionaryPath =
                (args != null && args.length > 0) ? args[0] : DEFAULT_DICTIONARY_PATH;
        TreeMap<String, CoreDictionary.Attribute> dictionary =
                IOUtil.loadDictionary(dictionaryPath);

        // Full segmentation
        System.out.println("完全切分:");
        System.out.println(segmentFully("就读北京大学", dictionary));

        // Forward longest match
        System.out.println("正向最长匹配:");
        System.out.println(segmentForwardLongest("就读北京大学", dictionary));
        System.out.println(segmentForwardLongest("研究生命起源", dictionary));
        System.out.println(segmentForwardLongest("项目的研究", dictionary));

        // Backward longest match
        System.out.println("逆向最长匹配:");
        System.out.println(segmentBackwardLongest("研究生命起源", dictionary));
        System.out.println(segmentBackwardLongest("项目的研究", dictionary));

        // Bidirectional longest match: one table row per sentence
        System.out.println("双向最长匹配:");
        String[] sentences = new String[]{
                "项目的研究",
                "商品和服务",
                "研究生命起源",
                "当下雨天地面积水",
                "结婚的和尚未结婚的",
                "欢迎新老师生前来就餐",
        };
        for (int i = 0; i < sentences.length; i++) {
            // Segment once per direction and reuse both results for the bidirectional column.
            List<String> forward = segmentForwardLongest(sentences[i], dictionary);
            List<String> backward = segmentBackwardLongest(sentences[i], dictionary);
            System.out.printf("| %d | %s | %s | %s | %s |\n", i + 1, sentences[i],
                    forward,
                    backward,
                    selectBetterSegmentation(forward, backward)
            );
        }
    }

    /**
     * Full segmentation: collects every substring of the text that is a dictionary key.
     *
     * @param text       the text to segment
     * @param dictionary the dictionary; only its keys are consulted
     * @return all dictionary words found in the text, ordered by start offset and then by length
     */
    public static List<String> segmentFully(String text, Map<String, CoreDictionary.Attribute> dictionary) {
        List<String> wordList = new ArrayList<String>();
        for (int start = 0; start < text.length(); ++start) {
            for (int end = start + 1; end <= text.length(); ++end) {
                String word = text.substring(start, end);
                if (dictionary.containsKey(word)) {
                    wordList.add(word);
                }
            }
        }
        return wordList;
    }

    /**
     * Forward longest-match segmentation: scanning left to right, repeatedly takes the longest
     * dictionary word that starts at the current position. When no multi-character dictionary
     * word starts there, the single character at that position is emitted as a word.
     *
     * @param text       the text to segment
     * @param dictionary the dictionary; only its keys are consulted
     * @return the words in text order; concatenating them yields {@code text}
     */
    public static List<String> segmentForwardLongest(String text, Map<String, CoreDictionary.Attribute> dictionary) {
        List<String> wordList = new ArrayList<String>();
        int start = 0;
        while (start < text.length()) {
            // Fallback is one whole code point, so a surrogate pair is never split in half.
            int fallbackEnd = start + Character.charCount(text.codePointAt(start));
            String longestWord = text.substring(start, fallbackEnd);
            // Try candidates from longest to shortest; the first hit is the longest match.
            for (int end = text.length(); end > fallbackEnd; --end) {
                String word = text.substring(start, end);
                if (dictionary.containsKey(word)) {
                    longestWord = word;
                    break;
                }
            }
            wordList.add(longestWord);
            start += longestWord.length();
        }
        return wordList;
    }

    /**
     * Backward longest-match segmentation: scanning right to left, repeatedly takes the longest
     * dictionary word that ends at the current position. When no multi-character dictionary
     * word ends there, the single character at that position is emitted as a word.
     *
     * @param text       the text to segment
     * @param dictionary the dictionary; only its keys are consulted
     * @return the words in text order; concatenating them yields {@code text}
     */
    public static List<String> segmentBackwardLongest(String text, Map<String, CoreDictionary.Attribute> dictionary) {
        // Words are collected right-to-left and reversed once at the end.
        List<String> wordList = new ArrayList<String>();
        int end = text.length(); // exclusive end of the part that is still unsegmented
        while (end > 0) {
            // Fallback is one whole code point, so a surrogate pair is never split in half.
            int fallbackStart = end - Character.charCount(text.codePointBefore(end));
            String longestWord = text.substring(fallbackStart, end);
            // The smallest start offset gives the longest candidate; the first hit wins.
            for (int start = 0; start < fallbackStart; ++start) {
                String word = text.substring(start, end);
                if (dictionary.containsKey(word)) {
                    longestWord = word;
                    break;
                }
            }
            wordList.add(longestWord);
            end -= longestWord.length();
        }
        Collections.reverse(wordList);
        return wordList;
    }

    /**
     * Counts the words in a segmentation result that consist of exactly one character.
     * A character outside the Basic Multilingual Plane (stored as a surrogate pair) counts
     * as one character.
     *
     * @param wordList the segmentation result
     * @return the number of single-character words
     */
    public static int countSingleChar(List<String> wordList) {
        int size = 0;
        for (String word : wordList) {
            if (word.codePointCount(0, word.length()) == 1) {
                ++size;
            }
        }
        return size;
    }

    /**
     * Bidirectional longest-match segmentation: runs both the forward and the backward
     * longest-match algorithms and returns the result with fewer words. When both have the
     * same number of words, the one with fewer single-character words is returned; when that
     * is also equal, the backward result is returned.
     *
     * @param text       the text to segment
     * @param dictionary the dictionary; only its keys are consulted
     * @return the preferred of the two segmentation results
     */
    public static List<String> segmentBidirectional(String text, Map<String, CoreDictionary.Attribute> dictionary) {
        List<String> forwardLongest = segmentForwardLongest(text, dictionary);
        List<String> backwardLongest = segmentBackwardLongest(text, dictionary);
        return selectBetterSegmentation(forwardLongest, backwardLongest);
    }

    /**
     * Picks between a forward and a backward segmentation: fewer words wins, then fewer
     * single-character words, and the backward result wins any remaining tie.
     */
    private static List<String> selectBetterSegmentation(List<String> forwardLongest, List<String> backwardLongest) {
        if (forwardLongest.size() != backwardLongest.size()) {
            return forwardLongest.size() < backwardLongest.size() ? forwardLongest : backwardLongest;
        }
        if (countSingleChar(forwardLongest) < countSingleChar(backwardLongest)) {
            return forwardLongest;
        }
        return backwardLongest;
    }
}