一、关系
AttributeSource→TokenStream→Tokenizer
↓
TokenFilter
=============================================
Analyzer中的一个抽象方法是
//成员属性
private final ReuseStrategy reuseStrategy;
========================================
//TokenStreamComponents
//保存了tokenizer和tokenStream
//也可以设置Reader
protected abstract TokenStreamComponents createComponents(String fieldName, Reader reader);
========================================
//得到TokenStream
public final TokenStream tokenStream(final String fieldName, final Reader reader) throws IOException {
    //ReuseStrategy这个内部类是干吗的?
    // private CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<Object>();
    //内部抽象类 GlobalReuseStrategy 存放: TokenStreamComponents
    //         PerFieldReuseStrategy 存放: Map<String, TokenStreamComponents>
    //(注: reuseStrategy 是 Analyzer 的成员属性,并不是在方法体内声明的)
    TokenStreamComponents components = reuseStrategy.getReusableComponents(fieldName);
    final Reader r = initReader(fieldName, reader);
    if (components == null) {
        components = createComponents(fieldName, r);
        reuseStrategy.setReusableComponents(fieldName, components);
    } else {
        components.setReader(r);
    }
    return components.getTokenStream();
}
分词输出例子:
Analyzer a = new WhitespaceAnalyzer(Version.LUCENE_43);
TokenStream tokenStream = a.tokenStream("CESHI", new StringReader("I LOVE YOU!"));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset(); //不先调用reset()会抛 java.lang.ArrayIndexOutOfBoundsException
while (tokenStream.incrementToken()) {
    System.out.print("[" + termAttribute.toString() + "]");
}
tokenStream.end();
tokenStream.close();
二、TokenStream的一些方法和属性
//对于Reader的解析,Token的不断输出 public abstract boolean incrementToken() throws IOException; public void reset() throws IOException {}
三、Tokenizer的属性和方法
//声明Tokenizer的时候必须传入Reader(父类持有字段: protected Reader input;)
四、CharTokenizer
/**
 * Base class for tokenizers that decide token membership one Unicode code
 * point at a time via {@link #isTokenChar(int)}, optionally transforming
 * each accepted code point via {@link #normalize(int)} (e.g. lower-casing).
 */
public abstract class CharTokenizer extends Tokenizer {

  // The Reader is held by the Tokenizer superclass (the "input" field).
  public CharTokenizer(Version matchVersion, Reader input) {
    super(input);
    charUtils = CharacterUtils.getInstance(matchVersion);
  }

  public CharTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
    super(factory, input);
    charUtils = CharacterUtils.getInstance(matchVersion);
  }

  // note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()
  // reset() must be called before use: it sets bufferIndex back to 0. On the
  // first incrementToken() call, (bufferIndex >= dataLen) is true, which is
  // what triggers the initial fill of ioBuffer from the reader.
  private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
  private static final int MAX_WORD_LEN = 255;    // maximum allowed token length
  private static final int IO_BUFFER_SIZE = 4096; // chars pulled from the reader per fill

  // Attributes this tokenizer populates on the shared AttributeSource.
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  // Version-dependent helper providing codePointAt() and fill(); subclasses
  // typically combine it with java.lang.Character to classify characters
  // (letter, digit, ...) and to convert case.
  private final CharacterUtils charUtils;

  // Reusable raw-char I/O buffer (fields: char[] buffer; int offset; int length).
  private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);

  /**
   * Returns true if the given code point belongs inside a token.
   * (WhitespaceTokenizer, for example, accepts every non-whitespace char.)
   */
  protected abstract boolean isTokenChar(int c);

  /**
   * Hook to transform each accepted code point; identity by default.
   * Lower-casing tokenizers override this to do the case conversion here.
   */
  protected int normalize(int c) { return c; }

  @Override
  public final boolean incrementToken() throws IOException {
    // Clear all attributes on the AttributeSource before emitting a new token
    // (this resets the shared per-token state held by AttributeSource).
    clearAttributes();
    int length = 0;
    int start = -1; // this variable is always initialized
    int end = -1;
    char[] buffer = termAtt.buffer();
    while (true) {
      // Refill ioBuffer from the reader whenever it is exhausted.
      if (bufferIndex >= dataLen) {
        offset += dataLen;
        // Fill ioBuffer from the Tokenizer's input Reader.
        if (!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0) {
            // A token was in progress when input ended: emit it below.
            break;
          } else {
            // No pending token and no more input: we are done.
            finalOffset = correctOffset(offset);
            return false;
          }
        }
        // Fill succeeded: dataLen is the number of chars now available.
        dataLen = ioBuffer.getLength();
        bufferIndex = 0;
      }
      // Code point at the current position (handles surrogate pairs).
      final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
      // Number of char units (1 or 2) this code point occupies.
      final int charCount = Character.charCount(c);
      bufferIndex += charCount;
      if (isTokenChar(c)) { // if it's a token char
        if (length == 0) { // start of token
          assert start == -1;
          start = offset + bufferIndex - charCount;
          end = start;
        } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
          buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
        }
        end += charCount;
        length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
        if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
          break;
      } else if (length > 0) // at non-Letter w/ chars
        break; // return 'em
    }
    termAtt.setLength(length);
    assert start != -1;
    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
    return true;
  }

  @Override
  public final void end() {
    // Set final offset after the last token has been consumed.
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  // Reset all buffer/offset state so this tokenizer can consume a new reader.
  @Override
  public void reset() throws IOException {
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
    finalOffset = 0;
    ioBuffer.reset(); // make sure to reset the IO buffer!!
  }
}