中文分词微服务包括的分词方法有:RobinSeg(RS)、IKAnalyzer(IK)、JEAnalysis(JE)、MmSeg4j(MS)、PaoDing(PD)、SmallSeg4j(SS)。其中RS分词实现见我的文章:知更鸟中文分词RS设计实现,其他分词方法都采用已发布的jar包进行封装。
设计模式
主要涉及外观模式、适配器模式、工厂模式和单例模式。分词微服务类图如图所示:
设计原则:(1)针对接口编程,不要针对实现;(2)只和最紧密的类交互;(3)封装变化;(4)松耦合设计。
外观模式:提供一个统一的接口,用来访问子系统中的一群接口,外观定义了一个高层接口,让子系统更容易使用。我们采用统一的分词外观类封装各种分词接口,提供一个一致的高层接口。
适配器模式:将一个类的接口,转换成客户期望的另一个接口。适配器让原本接口不兼容的类可以合作无间。各种分词器的私有实现接口各不相同,需要通过适配器提供一个统一的调用接口。
工厂模式:定义一个创建对象的接口,但由子类决定要实例化的类是哪一个。提供统一的分词工厂,创建分词实例对象。
单例模式:确保一个类只有一个实例,并提供了一个全局访问点。由于各种分词对象的创建、加载词典等需要申请大量的内存,耗费大量的时间,所以所有分词器实例都通过适配器进行控制,只创建一个实例。
代码实现
中文分词接口抽象类
package com.robin.segment;
import com.robin.log.RobinLogger;
import java.util.logging.Logger;
/**
 * Abstract base class that defines the common Chinese word-segmentation
 * interface.
 *
 * <p>Concrete segmenters (for example the adapter classes) extend this class
 * and implement {@link #segment(String, String)}.</p>
 *
 * @author Robin (e-mail: [email protected])
 * @version 1.0, 2018-04-18
 */
public abstract class AbstractSegmenter {

    /** Logger shared with subclasses. */
    protected static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Segments the given text into words.
     *
     * @param text      the text to segment
     * @param separator the separator to place between the segmented words
     * @return the segmented text
     */
    public abstract String segment(String text, String separator);
}
统一分词器外观类
package com.robin.segment;
import com.robin.log.RobinLogger;
import com.robin.segment.SegmentFactory.SegmentMethod;
import com.robin.segment.robinseg.RobinSeg;
import com.robin.segment.robinseg.SegmentArgs;
import java.util.logging.Logger;
/**
 * Unified segmenter facade (Facade pattern).
 *
 * <p>Hides {@link SegmentFactory} and the concrete segmenter classes behind
 * two static entry points.</p>
 *
 * @author Robin (e-mail: [email protected])
 * @version 1.0, 2018-04-19
 */
public class SegmentFacade {

    // Logger (not used by the methods of this class at present).
    private static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Returns the configuration-arguments object of the segmenter selected by
     * {@code methodName}.
     *
     * <p>Only {@link RobinSeg} exposes such an object. The decision is made on
     * the instance actually returned by the factory rather than on
     * {@code methodName} itself, so a {@code null} method name (which the
     * factory resolves to RobinSeg) no longer causes a
     * {@link NullPointerException}.</p>
     *
     * @param methodName the segmentation method; may be {@code null}
     * @return the {@link SegmentArgs} of the RobinSeg segmenter, or
     *         {@code null} when the selected segmenter is not a RobinSeg
     */
    public static SegmentArgs getSegmentArgsObj(SegmentMethod methodName) {
        AbstractSegmenter segmenter = SegmentFactory.getSegInstance(methodName);
        if (segmenter instanceof RobinSeg) {
            return ((RobinSeg) segmenter).getSegmentConfInstance();
        }
        return null;
    }

    /**
     * Segments {@code text} with the segmenter selected by {@code methodName}.
     *
     * <p>The segmenter is obtained from
     * {@link SegmentFactory#getSegInstance(SegmentMethod)}.</p>
     *
     * @param methodName the segmentation method: {@code SegmentMethod.IK},
     *                   {@code .JE}, {@code .MS}, {@code .PD}, {@code .SS} or
     *                   {@code .RS}
     * @param text       the text to segment
     * @param separator  the separator to use between words
     * @return the text segmented with the given separator
     */
    public static String split(SegmentMethod methodName, String text, String separator) {
        AbstractSegmenter segmenter = SegmentFactory.getSegInstance(methodName);
        return segmenter.segment(text, separator);
    }
}
分词Action实现类
package com.robin.segment.action;
import com.robin.loader.MircoServiceAction;
import com.robin.log.RobinLogger;
import com.robin.segment.SegmentFacade;
import com.robin.segment.SegmentFactory.SegmentMethod;
import com.robin.segment.robinseg.SegmentArgs;
import com.robin.segment.robinseg.SegmentArgs.SegAlgorithm;
import java.util.HashSet;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
/**
 * Micro-service action that performs word segmentation on a JSON request.
 *
 * <p>The request is a {@link JSONObject} with the keys {@code kind},
 * {@code version}, {@code metadata.segment} (holding {@code method},
 * {@code separator} and optional {@code args}) and {@code texts}. On success
 * the {@code text} fields inside {@code texts} are replaced in place by their
 * segmented form and the elapsed time is stored under
 * {@code metadata.segment.spendTime}.</p>
 *
 * @author Robin (e-mail: [email protected])
 * @version 1.0, 2018-06-05
 */
public class SegmentAction implements MircoServiceAction {

    private static final Logger LOGGER = RobinLogger.getLogger();

    /** Status codes reported in the {@code status} field of an error response. */
    public enum StatusCode {
        OK,
        JSON_ERR,
        KIND_ERR,
        VERSION_ERR,
        SEGMETHOD_ERR,
        SEPARATOR_ERR,
        SEGMENT_FAILED,
        TEXTS_NULL,
    }

    /**
     * Status code plus human-readable message. Declared static because it
     * never needs a reference to the enclosing action instance.
     */
    private static class ActionStatus {
        StatusCode statusCode;
        String msg;
    }

    /** Creates a status object with the given code and message. */
    private static ActionStatus newStatus(StatusCode statusCode, String msg) {
        ActionStatus status = new ActionStatus();
        status.statusCode = statusCode;
        status.msg = msg;
        return status;
    }

    /** Builds the set of values accepted for one request field. */
    private static HashSet<String> valueSetOf(String... values) {
        HashSet<String> set = new HashSet<String>();
        for (String value : values) {
            set.add(value);
        }
        return set;
    }

    /**
     * Converts a failed status into the error response JSON
     * ({@code status} and {@code msg} keys).
     *
     * @param actionStatus the status to report
     * @return the error response
     */
    private JSONObject getErrorJson(ActionStatus actionStatus) {
        JSONObject errJson = new JSONObject();
        try {
            errJson.put("status", actionStatus.statusCode.toString());
            errJson.put("msg", actionStatus.msg);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return errJson;
    }

    /**
     * Checks that {@code key} is present in {@code jsonObj} and that its
     * string value is one of {@code valueSet}.
     *
     * @param jsonObj       the JSON object to inspect
     * @param key           the key that must be present
     * @param valueSet      the accepted values
     * @param errStatusCode the status code to report when the check fails
     * @return a status with code {@code OK}, or with {@code errStatusCode}
     *         and an explanatory message
     */
    private ActionStatus checkJSONObjectTerm(JSONObject jsonObj,
            String key,
            HashSet<String> valueSet,
            StatusCode errStatusCode) {
        if (jsonObj.isNull(key)) {
            return newStatus(errStatusCode, "The input parameter is missing " + key + ".");
        }
        try {
            String value = jsonObj.getString(key);
            if (!valueSet.contains(value)) {
                return newStatus(errStatusCode,
                        "The value [" + value + "] of " + key + " is error.");
            }
        } catch (JSONException ex) {
            // Fail closed: a value that cannot be read must not pass validation.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return newStatus(errStatusCode,
                    "The value of " + key + " can not be read: " + ex.getMessage());
        }
        return newStatus(StatusCode.OK, null);
    }

    /**
     * Applies the optional RobinSeg arguments found in {@code argsJson} to
     * {@code segmentArgs}.
     *
     * <p>NOTE(review): {@code segmentArgs} is obtained from the segmenter
     * returned by the factory; if that segmenter is shared between requests,
     * these settings affect later requests too. Confirm this is intended.</p>
     *
     * @param argsJson    the {@code args} object of the request
     * @param segmentArgs the RobinSeg configuration to update; may be {@code null}
     * @return a status with code {@code OK}, or {@code JSON_ERR} when the
     *         {@code algorithm} value is not a known algorithm name
     * @throws JSONException if an argument value has the wrong JSON type
     */
    private ActionStatus applyRobinSegArgs(JSONObject argsJson, SegmentArgs segmentArgs)
            throws JSONException {
        if (null == segmentArgs) {
            return newStatus(StatusCode.OK, null);
        }
        if (!argsJson.isNull("algorithm")) {
            String algorithm = argsJson.getString("algorithm");
            try {
                // Locale.ROOT keeps the upper-casing independent of the default locale.
                segmentArgs.setSegAlgorithm(
                        SegAlgorithm.valueOf(algorithm.toUpperCase(java.util.Locale.ROOT)));
            } catch (IllegalArgumentException ex) {
                LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
                return newStatus(StatusCode.JSON_ERR,
                        "The value [" + algorithm + "] of algorithm is error.");
            }
        }
        if (!argsJson.isNull("cleanSymbol")) {
            segmentArgs.setCleanSymbolFlag(argsJson.getBoolean("cleanSymbol"));
        }
        if (!argsJson.isNull("markNewWord")) {
            segmentArgs.setMarkNewWordFlag(argsJson.getBoolean("markNewWord"));
        }
        if (!argsJson.isNull("downcasing")) {
            segmentArgs.setDowncasingFlag(argsJson.getBoolean("downcasing"));
        }
        if (!argsJson.isNull("mergePattern")) {
            segmentArgs.setMergePatternFlag(argsJson.getBoolean("mergePattern"));
        }
        if (!argsJson.isNull("retrievalPattern")) {
            segmentArgs.setRetrievalPatternFlag(argsJson.getBoolean("retrievalPattern"));
        }
        return newStatus(StatusCode.OK, null);
    }

    /**
     * Validates the request and, for the RS method, applies the optional
     * RobinSeg arguments.
     *
     * <p>A request whose structure cannot be read (for example a missing
     * {@code metadata} or {@code segment} object) is reported as
     * {@code JSON_ERR} instead of being accepted.</p>
     *
     * @param jsonObj the request
     * @return a status with code {@code OK}, or the first failure found
     */
    private ActionStatus checkInputJSONObject(JSONObject jsonObj) {
        ActionStatus status = checkJSONObjectTerm(
                jsonObj, "kind", valueSetOf("segment"), StatusCode.KIND_ERR);
        if (StatusCode.OK != status.statusCode) {
            return status;
        }
        status = checkJSONObjectTerm(
                jsonObj, "version", valueSetOf("v1"), StatusCode.VERSION_ERR);
        if (StatusCode.OK != status.statusCode) {
            return status;
        }
        try {
            JSONObject segmentMetadata
                    = jsonObj.getJSONObject("metadata").getJSONObject("segment");
            status = checkJSONObjectTerm(segmentMetadata, "method",
                    valueSetOf("RS", "IK", "JE", "MS", "PD", "SS"), StatusCode.SEGMETHOD_ERR);
            if (StatusCode.OK != status.statusCode) {
                return status;
            }
            status = checkJSONObjectTerm(segmentMetadata, "separator",
                    valueSetOf(" ", "|", "/"), StatusCode.SEPARATOR_ERR);
            if (StatusCode.OK != status.statusCode) {
                return status;
            }
            // Configure the RobinSeg arguments when they are supplied.
            SegmentMethod segmentMethod
                    = SegmentMethod.valueOf(segmentMetadata.getString("method"));
            if (SegmentMethod.RS == segmentMethod && !segmentMetadata.isNull("args")) {
                status = applyRobinSegArgs(segmentMetadata.getJSONObject("args"),
                        SegmentFacade.getSegmentArgsObj(segmentMethod));
                if (StatusCode.OK != status.statusCode) {
                    return status;
                }
            }
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return newStatus(StatusCode.JSON_ERR,
                    "The input JSON is malformed: " + ex.getMessage());
        }
        return newStatus(StatusCode.OK, null);
    }

    /**
     * Segments every {@code text} entry of the request.
     *
     * @param obj the request; must be a {@link JSONObject}
     * @return a {@link JSONObject}: on success {@code {"status":"OK",
     *         "result":<request with segmented texts>}}, otherwise an error
     *         object with {@code status} and {@code msg}
     */
    @Override
    public Object action(Object obj) {
        if (!(obj instanceof JSONObject)) {
            ActionStatus status = newStatus(StatusCode.JSON_ERR,
                    "The action arguments is not JSONObject.");
            LOGGER.log(Level.SEVERE, status.msg);
            return this.getErrorJson(status);
        }
        JSONObject jsonObj = (JSONObject) obj;
        ActionStatus checkStatus = this.checkInputJSONObject(jsonObj);
        if (StatusCode.OK != checkStatus.statusCode) {
            LOGGER.log(Level.SEVERE, checkStatus.msg);
            return this.getErrorJson(checkStatus);
        }
        try {
            JSONObject segmentMetadata
                    = jsonObj.getJSONObject("metadata").getJSONObject("segment");
            SegmentMethod segmentMethod
                    = SegmentMethod.valueOf(segmentMetadata.getString("method"));
            String separator = segmentMetadata.getString("separator");
            // optJSONObject returns null for a missing key, whereas getJSONObject
            // throws; this makes the TEXTS_NULL branch reachable.
            JSONObject texts = jsonObj.optJSONObject("texts");
            if (null == texts) {
                ActionStatus status = newStatus(StatusCode.TEXTS_NULL,
                        "The input texts is null.");
                LOGGER.log(Level.SEVERE, status.msg);
                return this.getErrorJson(status);
            }
            long beginTime = System.currentTimeMillis();
            Iterator<?> labelsIt = texts.keys();
            while (labelsIt.hasNext()) {
                String label = (String) labelsIt.next();
                JSONArray labelTexts = texts.getJSONArray(label);
                int len = labelTexts.length();
                for (int i = 0; i < len; i++) {
                    JSONObject textJson = labelTexts.getJSONObject(i);
                    // Entries without a text are skipped rather than failing the request.
                    if (textJson.isNull("text")) {
                        continue;
                    }
                    String result = SegmentFacade.split(
                            segmentMethod, textJson.getString("text"), separator);
                    textJson.put("text", result);
                }
            }
            int spendTime = (int) (System.currentTimeMillis() - beginTime);
            segmentMetadata.put("spendTime", spendTime);
        } catch (JSONException ex) {
            // Do not report OK for a request that could not be processed.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return this.getErrorJson(newStatus(StatusCode.SEGMENT_FAILED,
                    "Failed to segment the input texts: " + ex.getMessage()));
        }
        JSONObject rsp = new JSONObject();
        try {
            rsp.put("status", "OK");
            rsp.put("result", jsonObj);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return rsp;
    }
}
分词实例工厂方法类
package com.robin.segment;
import com.robin.segment.adapter.SmallSeg4jAdapter;
import com.robin.segment.adapter.MmSeg4jAdapter;
import com.robin.segment.adapter.IKAnalyzerAdapter;
import com.robin.segment.adapter.JEAnalysisAdapter;
import com.robin.segment.adapter.PaoDingAdapter;
import com.robin.log.RobinLogger;
import com.robin.segment.robinseg.RobinSeg;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Factory that hands out the concrete segmenter for a given segmentation
 * method (Factory Method pattern).
 *
 * @author Robin (e-mail: [email protected])
 * @version 1.0, 2018-04-19
 */
public class SegmentFactory {

    // Logger
    private static final Logger LOGGER = RobinLogger.getLogger();

    /** Identifiers of the supported segmentation algorithms. */
    public enum SegmentMethod {
        /** JE = "JEAnalysis" */
        JE,
        /** IK = "IKAnalyzer" */
        IK,
        /** MS = "MmSeg4j" */
        MS,
        /** PD = "PaoDing" */
        PD,
        /** SS = "SmallSeg4j" */
        SS,
        /** RS = "RobinSeg" */
        RS
    }

    /**
     * Returns the segmenter instance for the requested method.
     *
     * @param methodName the segmentation method: {@code SegmentMethod.IK},
     *                   {@code .JE}, {@code .MS}, {@code .PD}, {@code .SS} or
     *                   {@code .RS}; {@code null} selects RobinSeg
     * @return the segmenter for that method
     */
    public static AbstractSegmenter getSegInstance(SegmentMethod methodName) {
        // A missing method name falls back to RobinSeg.
        final SegmentMethod requested = (methodName != null) ? methodName : SegmentMethod.RS;
        final AbstractSegmenter segmenter;
        switch (requested) {
            case RS:
                segmenter = RobinSeg.getInstance();
                break;
            case SS:
                segmenter = SmallSeg4jAdapter.getInstance();
                break;
            case PD:
                segmenter = PaoDingAdapter.getInstance();
                break;
            case MS:
                segmenter = MmSeg4jAdapter.getInstance();
                break;
            case IK:
                segmenter = IKAnalyzerAdapter.getInstance();
                break;
            case JE:
                segmenter = JEAnalysisAdapter.getInstance();
                break;
            default:
                // Any value not handled above is logged and served by RobinSeg.
                LOGGER.log(Level.WARNING, "分词方法名称错误,默认采用RobinSeg分词.");
                segmenter = RobinSeg.getInstance();
                break;
        }
        return segmenter;
    }
}
IK适配器类
package com.robin.segment.adapter;
import com.robin.segment.AbstractSegmenter;
import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Level;
import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;
/**
 * Adapter for the IKAnalyzer 3.2.0 segmenter (Adapter and Singleton patterns).
 *
 * @author Robin (e-mail: [email protected])
 * @version 1.0, 2018-04-17
 */
public class IKAnalyzerAdapter extends AbstractSegmenter {

    /** Lazily created shared instance; only assigned inside {@link #getInstance()}. */
    protected static AbstractSegmenter instance = null;

    /** Not instantiable from outside; use {@link #getInstance()}. */
    private IKAnalyzerAdapter() {
    }

    /**
     * Segments {@code text} with IKAnalyzer and joins the words with
     * {@code separator}.
     *
     * <p>Every word, including the last one, is followed by the separator.
     * If reading from the segmenter fails with an {@link IOException}, the
     * error is logged and the words produced so far are returned.</p>
     *
     * @param text      the text to segment
     * @param separator the separator appended after each word
     * @return the segmented text, or an empty string when {@code text} is
     *         {@code null} or empty
     */
    @Override
    public String segment(String text, String separator) {
        // Defensive check: nothing to segment.
        if (null == text || text.isEmpty()) {
            return "";
        }
        // Build the segmenter in maximum-word-length matching mode.
        IKSegmentation ikSeg = new IKSegmentation(new StringReader(text), true);
        StringBuilder sb = new StringBuilder();
        try {
            Lexeme lexeme;
            while ((lexeme = ikSeg.next()) != null) {
                sb.append(lexeme.getLexemeText().concat(separator));
            }
        } catch (IOException e) {
            // Pass the exception itself so the stack trace is not lost.
            LOGGER.log(Level.SEVERE, e.getMessage(), e);
        }
        return sb.toString();
    }

    /**
     * Returns the single shared IKAnalyzer adapter, creating it on first use.
     *
     * <p>The method is synchronized so that concurrent first calls cannot
     * each create their own instance.</p>
     *
     * @return the shared segmenter instance
     */
    public static synchronized AbstractSegmenter getInstance() {
        if (null == instance) {
            instance = new IKAnalyzerAdapter();
        }
        return instance;
    }
}
请求JSON
中文分词微服务请求JSON格式如下,红框标示了请求参数和原始文本。
响应JSON
中文分词微服务响应JSON格式如下,红框标示分词消耗时间和分词结果。