Copyright notice: this is an original article by the author; please do not reproduce it without permission. https://blog.csdn.net/EchoYY/article/details/78468225
This post targets Lucene 5.3.0. If you are on Lucene 3.x, please go here instead: http://write.blog.csdn.net/postedit/78291868 (that post only extracts keywords and does not cover synonym search).
This post covers two features:
1. Exact extraction of custom keywords
2. Synonym search and extraction
Without further ado, straight to the code.
The synonym analyzer class is defined as follows:
package com.daelly.sample.lucene.analyzer.synonyms;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.wltea.analyzer.lucene.IKTokenizer;

/**
 * An Analyzer that tokenizes with IK and injects synonyms from a synonyms file.
 * The path may be a classpath resource ("classpath:xxx.txt") or a file-system path.
 */
public class SynonymsAnalyzer extends Analyzer {

    private final String synonymsPath;

    public SynonymsAnalyzer(String synonymsPath) {
        if (synonymsPath == null || synonymsPath.isEmpty()) {
            throw new IllegalArgumentException("synonymsPath must be provided!");
        }
        this.synonymsPath = synonymsPath;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        SynonymFilterFactory factory = null;
        try {
            factory = getSynonymFilterFactory();
        } catch (IOException e) {
            e.printStackTrace();
        }
        Tokenizer tokenizer = new IKTokenizer(true); // true = smart segmentation mode
        if (factory != null) {
            TokenStream tokenStream = factory.create(tokenizer);
            return new TokenStreamComponents(tokenizer, tokenStream);
        }
        return new TokenStreamComponents(tokenizer);
    }

    private SynonymFilterFactory getSynonymFilterFactory() throws IOException {
        if (synonymsPath.contains("classpath:")) {
            // Load the synonyms file from the classpath.
            String path = synonymsPath.replace("classpath:", "");
            Map<String, String> args = new HashMap<String, String>();
            args.put("synonyms", path);
            SynonymFilterFactory factory = new SynonymFilterFactory(args);
            factory.inform(new ClasspathResourceLoader());
            return factory;
        }
        // Otherwise treat it as a file-system path: split it into directory and file name.
        int index = synonymsPath.lastIndexOf(File.separator);
        String dir = synonymsPath.substring(0, index);
        String name = synonymsPath.substring(index + 1);
        Map<String, String> args = new HashMap<String, String>();
        args.put("synonyms", name);
        SynonymFilterFactory factory = new SynonymFilterFactory(args);
        Path baseDirectory = Paths.get(dir);
        FilesystemResourceLoader loader = new FilesystemResourceLoader(baseDirectory);
        factory.inform(loader);
        return factory;
    }
}
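Before wiring the analyzer into an index, a quick way to sanity-check it is to print the tokens it produces. The snippet below is only a sketch; "classpath:synonyms.txt" is a placeholder resource name, not a file from this post.

// Minimal sketch: print each token produced by SynonymsAnalyzer.
// Injected synonyms appear as extra tokens alongside the original term.
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.daelly.sample.lucene.analyzer.synonyms.SynonymsAnalyzer;

public class SynonymsAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        SynonymsAnalyzer analyzer = new SynonymsAnalyzer("classpath:synonyms.txt"); // placeholder path
        TokenStream ts = analyzer.tokenStream("content", new StringReader("hello world"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // original token, followed by any synonyms
        }
        ts.end();
        ts.close();
        analyzer.close();
    }
}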
Next, the indexing class. It reads the keyword file line by line and indexes each line as one document, using SynonymsAnalyzer so that synonyms are expanded at index time:

package com.apache.luence;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.daelly.sample.lucene.analyzer.synonyms.SynonymsAnalyzer;

public class AddIndex {

    private static final String INDEXDIR = "D:\\TestSolr\\Index\\Test";                       // where the index is written
    private static final String DATADIR = "D:\\TestSolr\\src\\resource\\node.dic";            // keyword file, one entry per line
    private static final String ACTIONDIR = "D:\\TestSolr\\src\\resource\\data\\action.txt";  // synonyms file

    public AddIndex() {
        try {
            Directory directory = FSDirectory.open(Paths.get(INDEXDIR));
            // Index with the synonym-aware analyzer so synonyms are expanded into the index.
            IndexWriterConfig config = new IndexWriterConfig(new SynonymsAnalyzer(ACTIONDIR));
            IndexWriter iwriter = new IndexWriter(directory, config);
            File files = new File(DATADIR);
            List<String> contents = getContent(files);
            for (String content : contents) {
                Document doc = new Document();
                doc.add(new TextField("content", content, Field.Store.YES));
                iwriter.addDocument(doc);
            }
            iwriter.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Reads the data file line by line; each line becomes one document. */
    private List<String> getContent(File files) {
        List<String> strList = new ArrayList<String>();
        try {
            InputStream stream = new FileInputStream(files);
            BufferedReader br = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            String str = br.readLine();
            while (str != null) {
                strList.add(str);
                str = br.readLine();
            }
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return strList;
    }

    public static void main(String[] args) {
        new AddIndex();
    }
}
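As a side note, because getContent turns each line of DATADIR into one document, node.dic is assumed to be a plain UTF-8 text file with one keyword per line. The entries below are invented purely to show the shape; the actual file in this post differs:

关键词
一整个
测试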
Finally, the utility class that extracts keywords from input text and looks up synonym entries in the index:

package com.apache.luence;

import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Looks up documents in the index by the indexed "content" field.
 */
public class IKUtil {

    private static final String INDEXDIR = "D:\\TestSolr\\Index\\Test";

    /** Returns true if the given keyword exists in the index. */
    private boolean search(String keyword) {
        boolean flag = false;
        Directory directory = null;
        DirectoryReader ireader = null;
        TopDocs hits = null;
        try {
            directory = FSDirectory.open(Paths.get(INDEXDIR));
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);
            TermQuery query = new TermQuery(new Term("content", keyword));
            hits = isearcher.search(query, 10);
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (hits != null && hits.totalHits > 0) {
            flag = true;
        }
        try {
            if (ireader != null) {
                ireader.close();
            }
            if (directory != null) {
                directory.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return flag;
    }

    /**
     * Extracts the keywords contained in the input text: the text is tokenized
     * with IK, and a token is kept only if it exists in the index.
     * @param sInput the input text
     * @return the matched keywords, deduplicated, in order of appearance
     */
    public String[] getKeyWords(String sInput) {
        List<String> result = new ArrayList<String>();
        try {
            IKAnalyzer analyzer = new IKAnalyzer(true);
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(sInput));
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String token = term.toString();
                if (search(token) && !result.contains(token)) {
                    result.add(token);
                }
            }
            tokenStream.end();
            tokenStream.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result.toArray(new String[result.size()]);
    }

    /**
     * Looks up the synonym entries in the index that match the given term.
     * @param src the term to look up
     * @return the stored "content" values of the matching documents
     */
    public List<String> getSynonyms(String src) {
        List<String> results = new ArrayList<String>();
        try {
            Term term = new Term("content", src);
            Query query = new TermQuery(term);
            Directory directory = FSDirectory.open(Paths.get(INDEXDIR));
            IndexReader reader = DirectoryReader.open(directory);
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs docs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                String synonyms = doc.get("content");
                if (synonyms == null || synonyms.isEmpty()) {
                    continue; // skip documents with no stored content
                }
                results.add(synonyms);
            }
            reader.close();
            directory.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return results;
    }

    public static void main(String[] args) {
        String input = "这是一整个关键词哈哈";
        String[] results = new IKUtil().getKeyWords(input);
        // List<String> results = new IKUtil().getSynonyms(input);
        for (String result : results) {
            System.out.println(result);
        }
    }
}
First, let's test the exact keyword-extraction feature.
The index file is shown in the figure.
The result of the run is shown in the figure.
As the figure shows, "哈哈" is filtered out, since it is not one of the indexed keywords.
Now let's test the synonym feature.
The synonyms file is shown in the figure.
Note that the file needs a trailing blank line (honestly, I'm not sure why...), and it must be saved as UTF-8.
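For reference, SynonymFilterFactory reads the standard Solr synonyms format: lines starting with # are comments, comma-separated terms on one line are treated as synonyms of each other, and => maps the terms on the left to the terms on the right. The entries below are only illustrative, not the actual contents of action.txt:

# terms separated by commas are mutual synonyms
hello,你好
# "=>" maps the left-hand terms to the right-hand term
couch,sofa => 沙发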
Let's verify it, still using the index built earlier.
As the figure shows, if you recall the index file you'll notice it does not contain the keyword "hello", yet it is still recognized, because its synonym was expanded into the index at indexing time.
Next, let's verify the synonym-lookup feature.
In main, swap which line is commented out: uncomment the getSynonyms call and comment out the getKeyWords call. The result is shown in the figure.
Bingo.
What remains is for you to adapt the code to your own requirements.