package com.hb;

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**
 * Analyzer that tokenizes with a {@link LetterTokenizer}, lower-cases the
 * tokens and then removes stop words.
 *
 * <p>The stop-word set is either {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}
 * alone, or that set combined with caller-supplied words.
 *
 * <p>The class is declared {@code final} because the Lucene 3.x
 * {@code Analyzer} constructor asserts that implementations (or their
 * token-stream methods) are final; a non-final subclass fails that assertion
 * when the JVM runs with assertions enabled.
 */
public final class MyStopAnalyzer extends Analyzer {

    /** Stop words removed by the {@link StopFilter} built in {@link #tokenStream}. */
    private final Set<?> stops;

    /**
     * Creates an analyzer whose stop words are the given words plus the
     * default English stop words.
     *
     * @param strs additional stop words, matched ignoring case; {@code null}
     *             is treated as "no additional words"
     */
    public MyStopAnalyzer(String[] strs) {
        String[] extraWords = (strs == null) ? new String[0] : strs;
        // makeStopSet converts the array into a set; "true" requests case-insensitive matching.
        Set<Object> merged = StopFilter.makeStopSet(Version.LUCENE_35, extraWords, true);
        // Add the built-in English stop words to the caller-supplied ones.
        merged.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        this.stops = merged;
    }

    /**
     * Creates an analyzer that uses only the default English stop words.
     */
    public MyStopAnalyzer() {
        // Reuse the built-in English stop-word set as-is.
        this.stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }

    /**
     * Builds the token stream for a field: letter tokenizer, then lower-case
     * filter, then stop-word filter.
     *
     * @param fieldname name of the field being analyzed (not used here)
     * @param reader    source of the text to tokenize
     * @return the filtered token stream
     */
    @Override
    public TokenStream tokenStream(String fieldname, Reader reader) {
        // Assemble the tokenizer and the filter chain for this analyzer.
        TokenStream letters = new LetterTokenizer(Version.LUCENE_35, reader);
        TokenStream lowerCased = new LowerCaseFilter(Version.LUCENE_35, letters);
        return new StopFilter(Version.LUCENE_35, lowerCased, stops);
    }
}
// lucene Analyzer 分词 一
// 猜你喜欢
// 转载自hbiao68.iteye.com/blog/2108307
// 今日推荐
// 周排行