Lucene 同义词分词器

/**
 * Analyzer that tokenizes input with {@code AnsjTokenizer} and then expands
 * tokens with synonyms taken from {@code DataCache.SYNONYMS}.
 *
 * <p>Filter chain, in order: {@code AnsjTokenizer} -> {@code StandardFilter}
 * -> {@code LowerCaseFilter} -> {@code SynonymFilter}.
 */
public class SynonymAnalyzer extends Analyzer{

	/**
	 * Builds the tokenizer and filter chain for one field.
	 *
	 * @param fieldname name of the field being analyzed (not used by this implementation)
	 * @param reader    source of the text to tokenize
	 * @return the tokenizer together with the end of the filter chain
	 * @throws IllegalStateException if the synonym map cannot be built
	 */
	@Override
	protected TokenStreamComponents createComponents(String fieldname, Reader reader) {
		// Build the synonym map first so a failure surfaces before any tokenizer is created.
		SynonymMap synonymMap = buildSynonymMap();

		Tokenizer source = new AnsjTokenizer(new ToAnalysis(reader), reader, null, true);

		// Order matters: tokens are lower-cased before synonym matching, and the
		// SynonymFilter is created with ignoreCase=false.
		// NOTE(review): synonym keys containing upper-case letters would presumably
		// never match the lower-cased tokens — confirm against the data in DataCache.
		TokenStream filter = new StandardFilter(Version.LUCENE_40, source);
		filter = new LowerCaseFilter(Version.LUCENE_40, filter);
		filter = new SynonymFilter(filter, synonymMap, false);

		// Add any further filters here, being mindful of order.
		return new TokenStreamComponents(source, filter);
	}

	/**
	 * Creates the synonym map from one hard-coded pair plus every entry of
	 * {@code DataCache.SYNONYMS} (key -> each word in its array).
	 *
	 * @return the compiled synonym map, never {@code null}
	 * @throws IllegalStateException if {@code SynonymMap.Builder.build()} fails with an I/O error
	 */
	private SynonymMap buildSynonymMap() {
		// The constructor flag is "dedup"; be sure it is the value you want, there are tradeoffs.
		SynonymMap.Builder builder = new SynonymMap.Builder(true);

		// Hard-coded baseline pair. The last argument (true) is passed for every rule;
		// per the Lucene API it is "includeOrig", i.e. keep the original token as well.
		builder.add(new CharsRef("中华"), new CharsRef("华夏"), true);

		for (String key : DataCache.SYNONYMS.keySet()) {
			String[] sameWords = DataCache.SYNONYMS.get(key);
			if (sameWords == null) {
				// A key without a synonym array contributes no rules; skip it instead of failing.
				continue;
			}
			for (String word : sameWords) {
				builder.add(new CharsRef(key), new CharsRef(word), true);
			}
		}

		try {
			return builder.build();
		} catch (IOException e) {
			// Previously this was only printed and a null map was handed to SynonymFilter,
			// which then failed later with an unrelated error. Fail fast with the real cause.
			throw new IllegalStateException("Failed to build synonym map", e);
		}
	}

}

猜你喜欢

转载自itace.iteye.com/blog/2023123