Lucene7入门
前言
项目中遇到一个简易的知识库模块业务,因此考虑到使用搜索引擎技术,本人目前所知的常用的搜索引擎技术也就是lucene、solr、elasticsearch这些了,经度娘了解到:lucene是一套用于全文检索和搜寻的开源程序库,solr和elasticsearch都是基于lucene实现的全文搜索服务器,本着优先解决项目的原则选择了lucene(个人觉得solr、elasticsearch都需要搭服务相对于lucene来说还是lucene比较简单些),接着百度找资料,找案例照着敲了demo,粗略的入个门,随后发现网上都是些操作简单的数据类型,实际业务中肯定需要操作些复杂的数据类型,在搜索lucene处理复杂数据类型时搜到了elasticsearch的案例,于是就对比了下solr和elasticsearch,看到elasticsearch易于使用,好吧solr就自动被忽略了,然后继续照着网上教程下载、安装、敲demo,elasticsearch就这样也被我粗鲁的入了门,随着了解elasticsearch的高级查询发现elasticsearch完全能解决我项目的问题,虽然要启个elasticsearch服务,虽然我项目是传统的spring项目,demo却是springboot写的,我相信elasticsearch一定能融入到我的项目中去,好吧,打脸了,网上的方法我都试了,不知是版本原因(我装的最新的7.5)还是什么问题,那个client始终注入失败,静心思量为了这一个功能模块再整个服务进去是否值得(主要原因还是实施的同事是不是又要怼我),怂了,继续lucene(完全没想过solr,至今solr还没被我入门),经我的静心研究发现lucene还是可以处理复杂数据的,复杂查询也是没问题的。
前辈经验:
https://blog.csdn.net/ltgsoldier1/article/details/96862056
https://blog.csdn.net/qq_36059561/article/details/83334592
https://blog.csdn.net/haobao528/article/details/86107588
正题
1.引入依赖
<dependencies>
    <!-- Lucene core library -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>7.7.0</version>
    </dependency>
    <!-- Lucene query parser -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-queryparser</artifactId>
        <version>7.7.0</version>
    </dependency>
    <!-- Lucene's default analyzers -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers-common</artifactId>
        <version>7.7.0</version>
    </dependency>
    <!-- Lucene hit highlighting -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-highlighter</artifactId>
        <version>7.7.0</version>
    </dependency>
    <!-- Third-party IK analyzer, better suited to Chinese word segmentation -->
    <!-- https://mvnrepository.com/artifact/com.github.magese/ik-analyzer -->
    <dependency>
        <groupId>com.github.magese</groupId>
        <artifactId>ik-analyzer</artifactId>
        <version>7.7.0</version>
    </dependency>
</dependencies>
2.创建Model
/**
 * Plain data holder for one knowledge-base entry that is converted to and from a Lucene document.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class KnowledgeBaseModel {
// NOTE(review): not read by the conversion utility shown in this article; presumably carries the search text - confirm.
private String all;
// Identifier of the entry.
private Integer id;
private Integer type;// 0 = repair record, 1 = repair knowledge
private String title;// title
// Keyword values; one entry per keyword.
private List<String> keyword;
private String equipmentName;// equipment
private String equipmentTypeName;// equipment type
private List<String> partsName;// parts
private String description;// fault description
// NOTE(review): second description text; exact meaning is not shown here - confirm.
private String description1;
private String userName;// person
private String createTime;// creation time
}
3.创建lucene工具类
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.File;
import java.io.IOException;
/**
 * Static holder for the Lucene index directory, the analyzer and the single shared
 * {@link IndexWriter} used by the application.
 *
 * <p>The index location defaults to {@code C:\alloySevenService\indexDir} and can be overridden
 * with the {@code lucene.index.dir} system property.
 */
public class LuceneUtils {
    /** System property that overrides the index directory. */
    private static final String INDEX_DIR_PROPERTY = "lucene.index.dir";
    /** Index directory used when the system property is not set. */
    private static final String DEFAULT_INDEX_DIR = "C:\\alloySevenService\\indexDir";

    private static final Directory directory; // index directory
    private static final Analyzer analyzer; // tokenizer/analyzer
    private static IndexWriter indexWriter; // guarded by the class lock, see getIndexWriter()

    static {
        try {
            String indexDir = System.getProperty(INDEX_DIR_PROPERTY, DEFAULT_INDEX_DIR);
            directory = FSDirectory.open(new File(indexDir).toPath());
            analyzer = new IKAnalyzer();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the single shared {@link IndexWriter}, creating it on first use.
     *
     * <p>The method is synchronized because the lazy null check is not atomic: two threads
     * calling it for the first time at once could both construct a writer, and the second
     * construction fails because the first writer already holds the directory's write lock.
     * IndexWriter being thread-safe does not protect this initialization.
     *
     * @return the shared writer
     * @throws RuntimeException if the writer cannot be opened
     */
    public static synchronized IndexWriter getIndexWriter() {
        if (indexWriter == null) {
            final IndexWriter writer;
            try {
                writer = new IndexWriter(directory, new IndexWriterConfig(analyzer));
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            // Close the writer when the JVM shuts down so the index is left in a clean state.
            Runtime.getRuntime().addShutdownHook(new Thread() {
                @Override
                public void run() {
                    try {
                        writer.close();
                    } catch (Exception e) {
                        throw new RuntimeException(e);
                    }
                }
            });
            indexWriter = writer;
        }
        return indexWriter;
    }

    /**
     * @return the directory the index is stored in
     */
    public static Directory getDirectory() {
        return directory;
    }

    /**
     * @return the analyzer used for both indexing and query parsing
     */
    public static Analyzer getAnalyzer() {
        return analyzer;
    }
}
4.创建Model/文档转换的工具类
import com.alloySeven.lucene.model.KnowledgeBaseModel;
import org.apache.lucene.document.*;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Converts between {@link KnowledgeBaseModel} and Lucene {@link Document}.
 */
public class KnowledgeBaseDocumentUtils {

    /**
     * Builds the Lucene document for a knowledge-base entry.
     *
     * <p>{@code id}, {@code type}, {@code equipmentName}, {@code equipmentTypeName},
     * {@code userName}, {@code keyword} and {@code partsName} are stored as un-tokenized
     * {@code StringField}s; {@code title}, {@code description}, {@code description1} and the
     * combined {@code all} field are analyzed {@code TextField}s. Optional fields that are
     * {@code null} are left out of the document and out of {@code all}, so the literal text
     * "null" never becomes searchable.
     *
     * @param knowledgeBaseModel the entry to index; {@code id} and {@code type} must be non-null
     * @return the document to hand to the index writer
     * @throws NullPointerException if {@code id} or {@code type} is {@code null}
     */
    public static Document knowledgeBaseToDocument(KnowledgeBaseModel knowledgeBaseModel) {
        Document document = new Document();
        // Four-letter year; output for four-digit years is the same as the former "yyy" pattern.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        String idStr = Integer.toString(knowledgeBaseModel.getId());
        String typeStr = knowledgeBaseModel.getType().toString();
        document.add(new StringField("id", idStr, Field.Store.YES)); // unique identifier, not tokenized
        document.add(new StringField("type", typeStr, Field.Store.YES));

        addTextField(document, "title", knowledgeBaseModel.getTitle());
        addStringField(document, "equipmentName", knowledgeBaseModel.getEquipmentName());
        addStringField(document, "equipmentTypeName", knowledgeBaseModel.getEquipmentTypeName());
        addTextField(document, "description", knowledgeBaseModel.getDescription());
        addTextField(document, "description1", knowledgeBaseModel.getDescription1());
        addStringField(document, "userName", knowledgeBaseModel.getUserName());
        // NOTE(review): the indexing time is stored, not the model's createTime, so re-indexing
        // an entry resets this value - confirm that this is intended.
        document.add(new StringField("createTime", sdf.format(new Date()), Field.Store.YES));

        List<String> keywords = knowledgeBaseModel.getKeyword();
        if (keywords != null) {
            for (String keyword : keywords) {
                document.add(new StringField("keyword", keyword, Field.Store.YES));
            }
        }

        // Concatenate the values that full-text search should match into one analyzed field.
        // NOTE(review): title is not part of this field - confirm whether it should be.
        StringBuilder all = new StringBuilder(typeStr);
        appendIfPresent(all, knowledgeBaseModel.getEquipmentName());
        appendIfPresent(all, knowledgeBaseModel.getEquipmentTypeName());
        appendIfPresent(all, knowledgeBaseModel.getDescription());
        appendIfPresent(all, knowledgeBaseModel.getDescription1());
        appendIfPresent(all, knowledgeBaseModel.getUserName());

        List<String> partsNames = knowledgeBaseModel.getPartsName();
        if (partsNames != null) {
            for (String partsName : partsNames) {
                document.add(new StringField("partsName", partsName, Field.Store.YES));
                all.append(' ').append(partsName);
            }
        }

        // Combined search field.
        document.add(new TextField("all", all.toString(), Field.Store.YES));
        return document;
    }

    /**
     * Rebuilds a {@link KnowledgeBaseModel} from the stored fields of a document.
     *
     * @param document a document produced by {@link #knowledgeBaseToDocument}
     * @return the model; {@code keyword} and {@code partsName} are empty lists when the document
     *         has no such fields
     * @throws NumberFormatException if the stored {@code id} or {@code type} is missing or not an integer
     * @throws ParseException declared for interface compatibility; not thrown by this implementation
     */
    public static KnowledgeBaseModel documentToKnowledgeBase(Document document) throws ParseException {
        KnowledgeBaseModel knowledgeBaseModel = new KnowledgeBaseModel();
        knowledgeBaseModel.setId(Integer.parseInt(document.get("id")));
        knowledgeBaseModel.setTitle(document.get("title"));
        knowledgeBaseModel.setType(Integer.parseInt(document.get("type")));
        knowledgeBaseModel.setEquipmentName(document.get("equipmentName"));
        knowledgeBaseModel.setEquipmentTypeName(document.get("equipmentTypeName"));
        knowledgeBaseModel.setUserName(document.get("userName"));
        knowledgeBaseModel.setDescription(document.get("description"));
        knowledgeBaseModel.setDescription1(document.get("description1"));
        knowledgeBaseModel.setCreateTime(document.get("createTime"));
        // Multi-valued fields come back as arrays; copy them into mutable lists.
        knowledgeBaseModel.setKeyword(new ArrayList<>(java.util.Arrays.asList(document.getValues("keyword"))));
        knowledgeBaseModel.setPartsName(new ArrayList<>(java.util.Arrays.asList(document.getValues("partsName"))));
        return knowledgeBaseModel;
    }

    /** Adds a stored, un-tokenized field unless the value is {@code null}. */
    private static void addStringField(Document document, String name, String value) {
        if (value != null) {
            document.add(new StringField(name, value, Field.Store.YES));
        }
    }

    /** Adds a stored, analyzed field unless the value is {@code null}. */
    private static void addTextField(Document document, String name, String value) {
        if (value != null) {
            document.add(new TextField(name, value, Field.Store.YES));
        }
    }

    /** Appends a space and the value to the buffer unless the value is {@code null}. */
    private static void appendIfPresent(StringBuilder target, String value) {
        if (value != null) {
            target.append(' ').append(value);
        }
    }
}
5. 增删改查 service
1.接口
import com.alloySeven.lucene.model.KnowledgeBaseModel;
import com.alloySeven.util.PageSize;
import org.apache.lucene.queryparser.classic.ParseException;
import java.io.IOException;
/**
 * Index operations (add, edit, delete, search) for knowledge-base entries.
 */
public interface IKnowledgeBaseModelService {
/**
 * Adds the entry to the index.
 *
 * @param knowledgeBaseModel the entry to index
 */
void add(KnowledgeBaseModel knowledgeBaseModel);
/**
 * Replaces the indexed entry with the given one.
 *
 * @param knowledgeBaseModel the new state of the entry
 * @throws IOException if the index cannot be updated
 */
void edit(KnowledgeBaseModel knowledgeBaseModel) throws IOException;
/**
 * Removes the entry from the index.
 *
 * @param knowledgeBaseModel the entry to remove; the implementation in this file matches on id and type
 * @throws IOException if the index cannot be updated
 */
void del(KnowledgeBaseModel knowledgeBaseModel) throws IOException;
/**
 * Searches the index and returns one page of results.
 *
 * @param all the full-text query
 * @param keyword optional keyword filter; the implementation in this file ignores it when null or blank
 * @param page page number; the implementation in this file treats 1 as the first page
 * @param size number of entries per page
 * @return the requested page together with the total hit count
 * @throws ParseException if the full-text query cannot be parsed
 * @throws IOException if the index cannot be read
 * @throws java.text.ParseException declared by the document-to-model conversion
 */
PageSize search(String all, String keyword, int page, int size) throws ParseException, IOException, java.text.ParseException;
}
2.实现
import com.alloySeven.lucene.model.KnowledgeBaseModel;
import com.alloySeven.lucene.util.KnowledgeBaseDocumentUtils;
import com.alloySeven.lucene.util.LuceneUtils;
import com.alloySeven.util.PageSize;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Lucene-backed implementation of {@link IKnowledgeBaseModelService}.
 *
 * <p>Writes go through the shared writer from {@link LuceneUtils}; every search opens its own
 * reader and closes it before returning.
 */
@Service
public class KnowledgeBaseModelServiceImpl implements IKnowledgeBaseModelService {

    /**
     * Indexes the entry and commits immediately.
     *
     * @param knowledgeBaseModel the entry to index
     * @throws RuntimeException wrapping the {@link IOException} if the index cannot be written
     */
    @Override
    public void add(KnowledgeBaseModel knowledgeBaseModel) {
        Document document = KnowledgeBaseDocumentUtils.knowledgeBaseToDocument(knowledgeBaseModel);
        try {
            LuceneUtils.getIndexWriter().addDocument(document);
            LuceneUtils.getIndexWriter().commit(); // make the change visible to new readers
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Replaces the indexed entry that has the same id and type.
     *
     * <p>The new document is built before anything is deleted, and the delete and the add are
     * published by a single commit. A model that cannot be converted therefore no longer
     * removes the existing entry, and readers never see the index with the entry missing.
     *
     * @param knowledgeBaseModel the new state of the entry
     * @throws IOException if the index cannot be updated
     */
    @Override
    public void edit(KnowledgeBaseModel knowledgeBaseModel) throws IOException {
        Document document = KnowledgeBaseDocumentUtils.knowledgeBaseToDocument(knowledgeBaseModel);
        LuceneUtils.getIndexWriter().deleteDocuments(buildIdentityQuery(knowledgeBaseModel));
        LuceneUtils.getIndexWriter().addDocument(document);
        LuceneUtils.getIndexWriter().commit();
    }

    /**
     * Deletes every document whose id and type both match the given entry, then commits.
     *
     * @param knowledgeBaseModel the entry to remove; id and type must be non-null
     * @throws IOException if the index cannot be updated
     */
    @Override
    public void del(KnowledgeBaseModel knowledgeBaseModel) throws IOException {
        LuceneUtils.getIndexWriter().deleteDocuments(buildIdentityQuery(knowledgeBaseModel));
        LuceneUtils.getIndexWriter().commit();
    }

    /**
     * Runs a full-text query against the {@code all} field, optionally restricted to documents
     * carrying the exact {@code keyword}, and returns one page of results.
     *
     * <p>A page beyond the last hit yields an empty list with the correct total instead of an
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param all the full-text query, in Lucene classic query syntax
     * @param keyword optional exact keyword filter; ignored when null or blank
     * @param page 1-based page number
     * @param size number of entries per page
     * @return the page, its size, the total hit count and the entries of the page
     * @throws IllegalArgumentException if {@code page} or {@code size} is less than 1
     * @throws ParseException if {@code all} cannot be parsed
     * @throws IOException if the index cannot be read
     * @throws java.text.ParseException declared by the document-to-model conversion
     */
    @Override
    public PageSize search(String all, String keyword, int page, int size) throws ParseException, IOException, java.text.ParseException {
        if (page < 1 || size < 1) {
            throw new IllegalArgumentException("page and size must be >= 1, got page=" + page + ", size=" + size);
        }
        PageSize pageSize = new PageSize<>();
        pageSize.setPage(page);
        pageSize.setSize(size);

        Query query = buildSearchQuery(all, keyword);
        List<KnowledgeBaseModel> list = new ArrayList<>();
        // The reader holds open index files; close it even when the search fails.
        try (DirectoryReader reader = DirectoryReader.open(LuceneUtils.getDirectory())) {
            IndexSearcher indexSearcher = new IndexSearcher(reader);
            ScoreDoc lastOfPreviousPage = null; // must stay null for the first page
            if (page > 1) {
                int num = size * (page - 1);
                TopDocs previous = indexSearcher.search(query, num);
                if (previous.scoreDocs.length < num) {
                    // Fewer hits than the pages before this one can hold: the page is empty.
                    pageSize.setTotal(previous.totalHits);
                    pageSize.setList(list);
                    return pageSize;
                }
                lastOfPreviousPage = previous.scoreDocs[num - 1]; // last hit of the previous page
            }
            TopDocs topDocs = indexSearcher.searchAfter(lastOfPreviousPage, query, size);
            pageSize.setTotal(topDocs.totalHits); // total number of matching documents
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                // Resolve the internal doc number to the stored document.
                Document doc = indexSearcher.doc(scoreDoc.doc);
                list.add(KnowledgeBaseDocumentUtils.documentToKnowledgeBase(doc));
            }
        }
        pageSize.setList(list);
        return pageSize;
    }

    /** Builds the query matching documents whose id AND type equal those of the model. */
    private Query buildIdentityQuery(KnowledgeBaseModel knowledgeBaseModel) {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(new TermQuery(new Term("id", knowledgeBaseModel.getId().toString())), BooleanClause.Occur.MUST);
        builder.add(new TermQuery(new Term("type", knowledgeBaseModel.getType().toString())), BooleanClause.Occur.MUST);
        return builder.build();
    }

    /** Builds the full-text query on "all", AND-ed with the exact keyword when one is given. */
    private Query buildSearchQuery(String all, String keyword) throws ParseException {
        QueryParser queryParser = new MultiFieldQueryParser(new String[]{"all"}, LuceneUtils.getAnalyzer());
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(queryParser.parse(all), BooleanClause.Occur.MUST);
        if (keyword != null && !"".equals(keyword.trim())) {
            builder.add(new TermQuery(new Term("keyword", keyword)), BooleanClause.Occur.MUST);
        }
        return builder.build();
    }
}
总结
根据业务需求,查询是需要一个检索的内容和一个关键字的过滤的,所以search的时候使用BooleanQuery进行了两个条件的组合,all字段是对需要进行检索的字段内容进行整合方便进行检索,不必再去组合过多的条件,两个BooleanClause.Occur.MUST组合相当于and,删除的话也是一样根据条件检索,进行精确删除,刚开始的时候我把type的字段类型定义成了StoredField(当时是考虑不会根据这个字段检索,只是做区分用,删除的时候是需要根据这个字段去定位的)导致了检索不成功,删除失败。StoredField类型是不分词,不索引的。最后当然不能少了分页,indexSearcher.searchAfter(sd,booleanQuery,size),searchAfter()方法需要三个参数:ScoreDoc sd, Query query, int size。sd是上一页的最后一个scoreDoc,query是查询的内容,size是条数,第一页的话scoreDoc要为null,这个需要特别注意。
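原本在获取 IndexWriter对象时是加了synchronized 的,但粗略翻了下源码后发现底层是考虑了这个线程安全问题的,那就保证唯一就行了吧(需要注意:IndexWriter自身的方法是线程安全的,但getIndexWriter()里的判空懒加载并不是,并发首次调用时可能创建出两个IndexWriter,后创建的会因拿不到索引目录的写锁而抛出LockObtainFailedException,因此稳妥起见还是建议保留synchronized),还有一些实体类对Model转换和controller的代码没贴,这些都是跟业务相关比较简单就不在这里叙述了。不对的地方望大佬们留言指导。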
再次感谢这些前辈们的分享!
前辈经验:
https://blog.csdn.net/ltgsoldier1/article/details/96862056
https://blog.csdn.net/qq_36059561/article/details/83334592
https://blog.csdn.net/haobao528/article/details/86107588