@Service // Spring中InitializingBean接口为bean提供了初始化方法的方式,它只包括afterPropertiesSet方法, // 凡是继承该接口的类,在初始化bean的时候会执行该方法。 public class SensitiveService implements InitializingBean { private static final Logger logger = LoggerFactory.getLogger(SensitiveService.class); @Override public void afterPropertiesSet() throws Exception { try { InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream("SensitiveWords.txt"); InputStreamReader reader = new InputStreamReader(is); BufferedReader bufferedReader = new BufferedReader(reader); String lineTxt; while ((lineTxt = bufferedReader.readLine()) != null) { addWord(lineTxt.trim()); } reader.close(); } catch (Exception e) { logger.error("读取敏感词文件失败" + e.getMessage()); } } private void addWord(String lineTxt) { TrieNode tempNode = rootNode; for (int i = 0; i < lineTxt.length(); i++) { Character c = lineTxt.charAt(i); if (isSymbol(c)) { continue; } TrieNode node = tempNode.getSubNode(c); if (node == null) { node = new TrieNode(); tempNode.addSubNodes(c, node); } tempNode = node; if (i == lineTxt.length() -1) { tempNode.setKeyWordEnd(true); } } } private boolean isSymbol(char c) { int ic = (int)c; return !CharUtils.isAsciiAlphanumeric(c) && (ic < 0x2E80 || ic > 0x9FFF); } public String filter(String text) { if (StringUtils.isBlank(text)) { return text; } String replacement = "***"; StringBuilder result = new StringBuilder(); TrieNode tempNode = rootNode; int begin = 0; int position = 0; while (position < text.length()) { char c = text.charAt(position); if (isSymbol(c)) { if (tempNode == rootNode) { result.append(c); begin++; } position++; continue; } tempNode = tempNode.getSubNode(c); if (tempNode == null) { result.append(text.charAt(begin)); position = begin + 1; begin = position; tempNode = rootNode; } else if (tempNode.isKeyWordEnd()) { result.append(replacement); position = position + 1; begin = position; tempNode = rootNode; } else { position++; } } return result.toString(); } private class TrieNode { private 
boolean end = false; private Map<Character, TrieNode> subNodes = new HashMap<>(); public void addSubNodes(Character key, TrieNode node) { subNodes.put(key, node); } TrieNode getSubNode(Character key) { return subNodes.get(key); } boolean isKeyWordEnd() { return end; } void setKeyWordEnd(boolean end) { this.end = end; } } private TrieNode rootNode = new TrieNode(); /* public static void main(String[] args) { SensitiveService s = new SensitiveService(); s.addWord("色情"); s.addWord("赌博"); System.out.println(s.filter(" 你 好色 情")); }*/ }
首先介绍内部类TrieNode,它表示前缀树(字典树)的一个节点:subNodes保存该节点的所有子节点,end标记该节点是否为某个敏感词的结尾。
private class TrieNode { //判断是否为叶结点 private boolean end = false; //子节点集合 private Map<Character, TrieNode> subNodes = new HashMap<>(); //加入子节点 public void addSubNodes(Character key, TrieNode node) { subNodes.put(key, node); } //获得子节点 TrieNode getSubNode(Character key) { return subNodes.get(key); } boolean isKeyWordEnd() { return end; } void setKeyWordEnd(boolean end) { this.end = end; } }
初始化前缀树时需要用到addWord方法。假如把abc作为敏感词加入前缀树,即调用addWord("abc"):i = 0时,执行到tempNode.getSubNode(c)这一句,rootNode没有以a为key的子节点,node为空,于是新建a-Node作为rootNode的子节点,然后tempNode指向a-Node;i = 1时,a-Node没有以b为key的子节点,node为空,新建b-Node作为a-Node的子节点,然后tempNode指向b-Node;i = 2时,b-Node没有以c为key的子节点,node为空,新建c-Node作为b-Node的子节点,然后tempNode指向c-Node;而c为最后一个字符,所以将c-Node的end属性设为true。
/**
 * Inserts one sensitive word into the trie, ignoring any symbol characters it contains.
 *
 * @param lineTxt the word to add; {@code null}, empty or symbol-only input adds nothing
 */
private void addWord(String lineTxt) {
    if (lineTxt == null) {
        return;
    }
    TrieNode tempNode = rootNode;
    for (int i = 0; i < lineTxt.length(); i++) {
        char c = lineTxt.charAt(i);
        if (isSymbol(c)) {
            continue;
        }
        TrieNode node = tempNode.getSubNode(c);
        if (node == null) {
            node = new TrieNode();
            tempNode.addSubNodes(c, node);
        }
        tempNode = node;
    }
    // Mark the end after the loop rather than on the last index: a word whose final
    // character is a symbol would otherwise never be flagged as a complete keyword.
    if (tempNode != rootNode) {
        tempNode.setKeyWordEnd(true);
    }
}
核心方法,过滤filter(),tempNode是指向前缀树的当前节点,begin为每次的最前面的一个元素,position为每次的当前位置。
/**
 * Replaces every sensitive word found in {@code text} with {@code "***"}.
 *
 * @param text the text to filter
 * @return the filtered text; blank input (as judged by {@code StringUtils.isBlank}) is
 *     returned unchanged
 */
public String filter(String text) {
    if (StringUtils.isBlank(text)) {
        return text;
    }
    String replacement = "***";
    StringBuilder result = new StringBuilder();
    TrieNode tempNode = rootNode;
    int begin = 0;      // start of the candidate match currently being examined
    int position = 0;   // character currently being compared against the trie

    while (position < text.length()) {
        char c = text.charAt(position);
        if (isSymbol(c)) {
            // Outside a match the symbol is ordinary output; inside a match it is skipped
            // so that symbols placed between the letters of a word do not hide it.
            if (tempNode == rootNode) {
                result.append(c);
                begin++;
            }
            position++;
            continue;
        }
        tempNode = tempNode.getSubNode(c);
        if (tempNode == null) {
            // No keyword starts at 'begin': emit that character and retry one step later.
            result.append(text.charAt(begin));
            position = begin + 1;
            begin = position;
            tempNode = rootNode;
        } else if (tempNode.isKeyWordEnd()) {
            // text[begin..position] is a sensitive word: replace it as a whole.
            result.append(replacement);
            position = position + 1;
            begin = position;
            tempNode = rootNode;
        } else {
            position++;
        }
    }
    // The text may end in the middle of an unfinished match; those characters were never
    // emitted above, so flush them (this is empty when begin has reached the end).
    result.append(text.substring(begin));
    return result.toString();
}
private boolean isSymbol(char c) { int ic = (int)c; //东亚文字0x2E80-0x9FFF,既不是英文也不是东亚文字,返回true。 return !CharUtils.isAsciiAlphanumeric(c) && (ic < 0x2E80 || ic > 0x9FFF); }其中isSymbol的作用是防止敏感词字符之间插入特殊符号来避开过滤,比如色*情,此时我们需要把这样的符号直接跳过,加强过滤的效果。
过滤的模拟过程图:abc为敏感词,abcdefg...为需要过滤的字符串。