C#/.NET 系统优化专题（搜索引擎Lucene的使用）

讲Lucene之前先说说数据库索引的使用，如下图

对列的计算要避免，任何形式都要避免

in 查询、or 查询可能导致索引失效，可以考虑拆分成多条查询

in 换exists，not in 不要用，不走索引

is null和is not null 都不走索引

<> 也不走索引，可以拆分成> 和<

join 时，连接的表越少性能越高

左连接以左表的结果为准，右连接反过来；连接字段要求带索引

lucene的基本说明

/// lucene.net：全文检索的工具包，不是应用，只是个类库，完成了全文检索的功能
/// 就是把数据拆分—存起来—查询时—拆分—匹配—结果
///
/// Analysis–分词器，负责把字符串拆分成原子，包含了标准分词，直接空格拆分
/// 项目中用的是盘古中文分词，
/// Document–数据结构，定义存储数据的格式
/// Index–索引的读写类
/// QueryParser–查询解析器，负责解析查询语句
/// Search—负责各种查询类，命令解析后得到就是查询类
/// Store—索引存储类，负责文件夹等等
/// Util—常见工具类库
///
/// lucene是全文搜索必备的，是大型系统必备的
///
/// Search：
/// TermQuery–单元查询 new Term(“title”,“张三”) title:张三
/// BooleanQuery—new Term(“title”,“张三”) and new Term(“title”,“李四”) +title:张三 +title:李四
/// new Term(“title”,“张三”) or new Term(“title”,“李四”) title:张三 title:李四
/// WildcardQuery—通配符 new Term(“title”,“张?”) title:张?
/// new Term(“title”,“张*”) title:张*
/// PrefixQuery—前缀查询以xx开头 title:张*
/// PhraseQuery—间隔距离：包含“没有”，也包含“提莫”，而且二者距离不能超过5
/// title: “没有提莫”~5
/// 没有蘑菇的提莫没有蘑菇的蘑菇的蘑菇的提莫
/// FuzzyQuery—近似查询，ibhone----iphone title:ibhone~
/// RangeQuery—范围查询 [1 TO 100]（含边界） {1 TO 100}（不含边界）
///
/// Lucene.Net一进一出，建立索引需要获取数据源，分词-保存到硬盘
/// 索引查找，
/// 自然会有些延迟，以前淘宝上架宝贝，第二天才能搜索的
/// 索引更新策略：1 数据更新—丢一个队列—一个processor通过队列完成更新
/// 2 每一周全部索引一遍
///
/// lucene索引存的是原子–docid1，docid2，docid3
/// 不store可以大量节约空间；查找时原子匹配多个id;

第一步：lucene初始化索引

/// <summary>
/// Builds the test index at <c>StaticConstant.TestIndexPath</c> from the commodities returned by <c>GetList()</c>.
/// </summary>
/// <remarks>
/// Every commodity is written as 10 sample documents (k = 0..9) whose content, price and
/// time vary with k. The writer is opened with <c>create = true</c> (third constructor
/// argument), so the index is created fresh rather than appended to.
/// </remarks>
public static void InitIndex()
{
    List<Commodity> commodityList = GetList(); // data source

    // Read the clock once so every document of this run is based on the same day,
    // even if the run crosses midnight.
    DateTime today = DateTime.Now;

    // Dispose the directory as well as the writer so no file handles are left open.
    using (FSDirectory directory = FSDirectory.Open(StaticConstant.TestIndexPath)) // index folder
    using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED)) // index writer
    {
        foreach (Commodity commodity in commodityList)
        {
            for (int k = 0; k < 10; k++)
            {
                // Build the day key from the date parts instead of adding k to a yyyyMMdd
                // integer: plain addition can yield non-dates (e.g. 20191231 + 9 = 20191240),
                // and formatting/parsing a string would depend on the current culture.
                DateTime day = today.AddDays(k);
                int dayKey = day.Year * 10000 + day.Month * 100 + day.Day;

                Document doc = new Document(); // one record
                // Field arguments: name, value, whether the value is stored, whether it is analyzed.
                // "id" is stored because the search code reads it back with doc.Get("id").
                doc.Add(new Field("id", commodity.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("title", commodity.Title, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("url", commodity.Url, Field.Store.NO, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("imageurl", commodity.ImageUrl, Field.Store.NO, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("content", "this is lucene working,powerful tool " + k, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new NumericField("price", Field.Store.YES, true).SetDoubleValue((double)(commodity.Price + k)));
                doc.Add(new NumericField("time", Field.Store.YES, true).SetIntValue(dayKey));
                writer.AddDocument(doc); // write the document to the index
            }
        }
        writer.Optimize(); // optimize, i.e. merge index segments
    }
}

基础的查询

// Basic query: looks up documents whose "title" field contains the single term "图书馆"
// and prints the stored fields of every hit.
// The directory and searcher are disposed when done so the index files are released.
using (FSDirectory dir = FSDirectory.Open(StaticConstant.TestIndexPath))
using (IndexSearcher searcher = new IndexSearcher(dir)) // searcher over the index folder
{
    TermQuery query = new TermQuery(new Term("title", "图书馆")); // single-term match
    TopDocs docs = searcher.Search(query, null, 10000); // no filter, at most 10000 hits returned
    foreach (ScoreDoc sd in docs.ScoreDocs)
    {
        Document doc = searcher.Doc(sd.Doc);
        Console.WriteLine("***************************************");
        Console.WriteLine("id={0}", doc.Get("id"));
        Console.WriteLine("title={0}", doc.Get("title"));
        Console.WriteLine("time={0}", doc.Get("time"));
        Console.WriteLine("price={0}", doc.Get("price"));
        Console.WriteLine("content={0}", doc.Get("content"));
    }
    Console.WriteLine("1一共命中了{0}个", docs.TotalHits);
}

关键字查询

// Keyword query: parses a space-separated keyword string against the "title" field
// with the PanGu analyzer and prints at most the first 1000 hits.
// The directory and searcher are disposed when done so the index files are released.
using (FSDirectory dir = FSDirectory.Open(StaticConstant.TestIndexPath))
using (IndexSearcher searcher = new IndexSearcher(dir)) // searcher over the index folder
{
    QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", new PanGuAnalyzer()); // query parser, default field "title"
    string keyword = "高中政治 人 教 新课 标 选修 生活 中的 法律常识";
    Query query = parser.Parse(keyword);
    TopDocs docs = searcher.Search(query, null, 10000); // no filter, at most 10000 hits returned

    // Bound the loop up front instead of counting inside it, so iteration stops
    // as soon as the print limit is reached.
    int shown = Math.Min(docs.ScoreDocs.Length, 1000);
    for (int i = 0; i < shown; i++)
    {
        Document doc = searcher.Doc(docs.ScoreDocs[i].Doc);
        Console.WriteLine("***************************************");
        Console.WriteLine("id={0}", doc.Get("id"));
        Console.WriteLine("title={0}", doc.Get("title"));
        Console.WriteLine("time={0}", doc.Get("time"));
        Console.WriteLine("price={0}", doc.Get("price"));
    }
    Console.WriteLine($"一共命中{docs.TotalHits}");
}

多条件查询：除了关键字，还有时间过滤和排序

// Multi-condition query: keyword search on "title", restricted to a "time" range
// and sorted by time, then price. Prints at most the first 1000 hits.
// The directory and searcher are disposed when done so the index files are released.
using (FSDirectory dir = FSDirectory.Open(StaticConstant.TestIndexPath))
using (IndexSearcher searcher = new IndexSearcher(dir)) // searcher over the index folder
{
    QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", new PanGuAnalyzer()); // query parser, default field "title"
    string keyword = "高中政治 人 教 新课 标 选修 生活 中的 法律常识";
    Query query = parser.Parse(keyword);

    // Keep only documents whose "time" is within [20190101, 20191231], both ends inclusive.
    NumericRangeFilter<int> timeFilter = NumericRangeFilter.NewIntRange("time", 20190101, 20191231, true, true);

    // The third SortField argument is "reverse": false = ascending, true = descending.
    SortField sortPrice = new SortField("price", SortField.DOUBLE, false); // price ascending
    SortField sortTime = new SortField("time", SortField.INT, true); // time descending
    Sort sort = new Sort(sortTime, sortPrice); // time is the primary key, price the secondary

    TopDocs docs = searcher.Search(query, timeFilter, 10000, sort); // at most 10000 hits returned

    // Bound the loop up front instead of counting inside it, so iteration stops
    // as soon as the print limit is reached.
    int shown = Math.Min(docs.ScoreDocs.Length, 1000);
    for (int i = 0; i < shown; i++)
    {
        Document doc = searcher.Doc(docs.ScoreDocs[i].Doc);
        Console.WriteLine("***************************************");
        Console.WriteLine("id={0}", doc.Get("id"));
        Console.WriteLine("title={0}", doc.Get("title"));
        Console.WriteLine("time={0}", doc.Get("time"));
        Console.WriteLine("price={0}", doc.Get("price"));
    }
    Console.WriteLine("3一共命中了{0}个", docs.TotalHits);
}

/// 1 索引增删改查和分词处理
/// 2 京东数据多线程建立索引
/// 3 索引查询接口封装
///
/// Lucene–lucene相关操作的封装
///
/// LuceneAnalyze–负责完成查询关键字解析，尽可能拆分成原子数组
/// 如果只有一个词，prefix查询苹果*
/// 如果是多个词，换成或者关系，
/// 都是为了更多的命中结果(贪婪搜索)
/// 做个关键词清理
///
/// LuceneBulid— BuildIndex–MergeIndex 多线程写不同子路径，完成后合并
/// 增加/删除索引；更新索引—只能先删除再添加
///
/// LuceneQuery—QueryIndexPage 支持关键字，支持范围过滤支持排序
///
/// Processor—Lucene多线程建立索引
/// IndexBuilder 入口，启动多线程创建+完成后的Merge
/// IndexBuilderPerThread 每个线程是如何完成索引建立的
///
/// DataService–CommodityLucene对外提供的搜索封装
/// CommodityRepository-SqlHelper，完成数据库数据查询

批量索引建立

// Build the index, then query it through the CommodityLucene wrapper and print each result.
IndexBuilder.Build();

// Arguments passed to QueryCommodity below: 1, 30, the out total, the keyword "书",
// null, a price filter string and an order-by string.
string priceRange = "[50,2000]";
string priceOrder = "price desc";
int totalCount = 0; // receives the out value from QueryCommodity
List<Commodity> results = CommodityLucene.QueryCommodity(1, 30, out totalCount, "书", null, priceRange, priceOrder);

results.ForEach(item => Console.WriteLine("title={0},price={1}", item.Title, item.Price));

备注：代码量太大，需要详细封装可以联系我

福建小徐

发布了143 篇原创文章 · 获赞 117 · 访问量 4233

私信关注

C#/.NET 系统优化专题（搜索引擎Lucene的使用）

猜你喜欢