目录
Tokenizer
OpenNLP标记器将输入字符序列分段为标记。标记通常是单词、标点符号、数字等。标记化是一个两阶段的过程:首先,确定句子边界,然后识别每个句子中的标记。
支持以下类型的分词器:
Whitespace Tokenizer:空格标记器,以空格进行拆分
Simple Tokenizer:基于字符类的标记器,相同字符类的连续字符序列构成一个token
Learnable Tokenizer:基于检测标记边界的概率模型的最大熵标记器
模型训练
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
public class TokenizerTrain {

    /**
     * Trains a maximum-entropy tokenizer model from annotated training data
     * ({@code resources/tokenizer.txt}) and serializes the resulting model to
     * {@code opennlpmodel/en-token-my.bin} under the working directory.
     *
     * @param args unused command-line arguments
     * @throws IOException if the training data cannot be read or the model cannot be written
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String fileResourcesDir = rootDir + "resources" + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        // Path of the training data
        String filePath = fileResourcesDir + "tokenizer.txt";
        // Path where the trained model is saved
        String modelPath = modelResourcesDir + "en-token-my.bin";

        // Read the training data line by line.
        InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));
        // try-with-resources closes the sample streams even if training fails
        // (the original code leaked lineStream/sampleStream and both output streams).
        try (ObjectStream<String> lineStream =
                     new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);
             ObjectStream<TokenSample> sampleStream = new TokenSampleStream(lineStream)) {
            // "en" language, empty abbreviation dictionary, no alphanumeric
            // optimization, default alphanumeric pattern.
            TokenizerFactory factory = new TokenizerFactory("en", new Dictionary(), false, null);
            // Train the model
            TokenizerModel model =
                    TokenizerME.train(sampleStream, factory, TrainingParameters.defaultParams());
            // Save the model; closing the stream also flushes the buffer.
            try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(new File(modelPath)))) {
                model.serialize(modelOut);
            }
        }
    }
}
句子分词
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
public class TokenizerPredit {

    /**
     * Loads a pre-trained tokenizer model ({@code opennlpmodel/en-token.bin}
     * under the working directory), tokenizes a sample sentence, and prints
     * each token followed by the per-token probabilities.
     *
     * @param args unused command-line arguments
     * @throws IOException if the model file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        String modelPath = modelResourcesDir + "en-token.bin";

        // Load the model; try-with-resources closes the stream
        // (the original code never closed modelIn).
        TokenizerModel model;
        try (InputStream modelIn = new FileInputStream(modelPath)) {
            model = new TokenizerModel(modelIn);
        }
        // Instantiate the tokenizer from the loaded model
        TokenizerME tokenizer = new TokenizerME(model);
        // Tokenize the sample sentence
        String[] tokens = tokenizer.tokenize(" An input sample sentence.");
        // Probabilities correspond to the most recent tokenize() call
        double[] tokenProbs = tokenizer.getTokenProbabilities();
        for (String token : tokens) {
            System.out.println(token);
        }
        for (double prob : tokenProbs) {
            System.out.print(prob + ",");
        }
    }
}