






// Strategy consulted by tokenStream() to fetch and store the reusable TokenStreamComponents
// for a given field name.
private final ReuseStrategy reuseStrategy;
  /**
   * Creates the TokenStreamComponents for the given field.
   * tokenStream() calls this only when the reuse strategy returns no cached components
   * for {@code fieldName}, and then stores the result back into the reuse strategy.
   *
   * @param fieldName name of the field being analyzed
   * @param reader    reader supplying the text (tokenStream() passes the reader returned by initReader)
   * @return the components to use for this field
   */
  protected abstract TokenStreamComponents createComponents(String fieldName,Reader reader);

/**
 * Returns the TokenStream for the given field, reusing previously created components
 * when the reuse strategy has them cached and creating (and caching) new ones otherwise.
 *
 * NOTE(review): this excerpt is truncated - the body of the else branch and the closing
 * braces are not shown here.
 *
 * @param fieldName name of the field being analyzed
 * @param reader    reader supplying the text; it is passed through initReader first
 * @return the token stream of the (possibly reused) components
 * @throws IOException declared by the signature
 */
public final TokenStream tokenStream(final String fieldName,
                                       final Reader reader) throws IOException {
// private CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<Object>();
// Note on the reuse strategies (inner abstract class): GlobalReuseStrategy stores a single
// TokenStreamComponents in storedValue, while PerFieldReuseStrategy stores a
// Map<String, TokenStreamComponents>.
    // Look up components cached for this field; null means none have been created yet.
    TokenStreamComponents components = reuseStrategy.getReusableComponents(fieldName);
    final Reader r = initReader(fieldName, reader);
    if (components == null) {
      // First use: build the components and hand them to the reuse strategy for later calls.
      components = createComponents(fieldName, r);
      reuseStrategy.setReusableComponents(fieldName, components);
    } else {
    return components.getTokenStream();


// Usage example: obtain a TokenStream from an analyzer and prepare it for consumption.
Analyzer a=new WhitespaceAnalyzer(Version.LUCENE_43);
TokenStream tokenStream=a.tokenStream("CESHI", new StringReader("I LOVE YOU!"));
 // Attribute through which the text of the current token is read.
 CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);

 // reset() has to be called before consuming tokens: CharTokenizer starts with
 // bufferIndex = -1 and only reset() sets it to 0, so skipping this call leads to
 // java.lang.ArrayIndexOutOfBoundsException.
 tokenStream.reset();  //java.lang.ArrayIndexOutOfBoundsException



/**
 * Advances the stream to the next token; implemented by concrete streams.
 * In the CharTokenizer implementation shown in this file, {@code true} means a token was
 * produced and {@code false} means the input is exhausted.
 *
 * @throws IOException declared by the signature
 */
public abstract boolean incrementToken() throws IOException;

/**
 * Resets the stream before it is consumed. This default implementation does nothing;
 * subclasses with per-stream state (such as CharTokenizer) override it.
 *
 * @throws IOException declared by the signature
 */
public void reset() throws IOException {}




/**
 * Tokenizer that groups consecutive characters accepted by {@link #isTokenChar(int)} into
 * tokens, passing each character through {@link #normalize(int)} before buffering it.
 *
 * NOTE(review): this is an abridged excerpt - closing braces and some statements of the
 * original class are not shown, so it does not compile as-is.
 */
public abstract class CharTokenizer extends Tokenizer {
  /** Creates a tokenizer reading from {@code input}, with version-dependent character utilities. */
  public CharTokenizer(Version matchVersion, Reader input) {
    charUtils = CharacterUtils.getInstance(matchVersion);
  /** Same as above, but with an explicit AttributeFactory passed to the superclass. */
  public CharTokenizer(Version matchVersion, AttributeFactory factory,
      Reader input) {
    super(factory, input);
    charUtils = CharacterUtils.getInstance(matchVersion);
  // note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()
  // reset() must be called before these fields are used, so that bufferIndex becomes 0.
  // Reason (as far as the code below shows): on the first pass the refill check
  // `if (bufferIndex >= dataLen)` must be true, otherwise the reader is never read into the buffer.
  private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
  private static final int MAX_WORD_LEN = 255; // maximum allowed token length
  private static final int IO_BUFFER_SIZE = 4096;// size of the I/O buffer, i.e. max chars read per fill
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  //codePointAt fill
  // Helper used below for codePointAt() and fill(). Presumably built on java.lang.Character,
  // which provides methods to determine a character's category (lowercase letter, digit, etc.)
  // and to convert characters between upper and lower case - TODO confirm.
  private final CharacterUtils charUtils;
  //char[] buffer; int offset; int length;
  private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
  /** Returns true if the code point {@code c} belongs to a token; decided by subclasses. */
  protected abstract boolean isTokenChar(int c);

  // Normalization hook applied to every token character before it is buffered; e.g. a
  // lowercasing tokenizer would convert the character to lower case here.
  // This default implementation returns the character unchanged.
  protected int normalize(int c) {
    return c;
  /**
   * Reads characters from the input until a complete token has been collected.
   * Returns true after setting the token's offsets; returns false (after recording
   * finalOffset) when the input is exhausted and no token characters are pending.
   */
  public final boolean incrementToken() throws IOException {
    int length = 0;
    int start = -1; // this variable is always initialized
    int end = -1;
    char[] buffer = termAtt.buffer();
    while (true) {
      // The I/O buffer has been fully consumed: account for it in offset and refill.
      if (bufferIndex >= dataLen) {
        offset += dataLen;
        // Side note: the method every Analyzer must implement returns TokenStreamComponents;
        // that class needs a tokenizer (with its reader) and a TokenStream.
        if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0) {
          } else {
            finalOffset = correctOffset(offset);
            return false;
        dataLen = ioBuffer.getLength();
        bufferIndex = 0;
     // After the buffer is filled, inspect the character at the current index:
     // codePointAt returns the Unicode code point at the given index.
      final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
     // Number of char values needed to represent this code point (2 for supplementary
     // characters, 1 otherwise).
      final int charCount = Character.charCount(c);
      bufferIndex += charCount;
      // Subclass decision; e.g. WhitespaceTokenizer checks whether the character is whitespace.
      if (isTokenChar(c)) {               // if it's a token char
          // length == 0 means this character starts a new token
    	  if (length == 0) {                // start of token
          assert start == -1;
          start = offset + bufferIndex - charCount;
          end = start;
        } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
          buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
        end += charCount;
        length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
        if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
      } else if (length > 0)             // at non-Letter w/ chars
        break;                           // return 'em
    assert start != -1;
   // Publish the token's corrected start/end offsets and remember the end as finalOffset.
   offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
    return true;
  /** Sets both the start and end offset to finalOffset once the stream is finished. */
  public final void end() {
    offsetAtt.setOffset(finalOffset, finalOffset);
  /** Restores the initial reading state; in particular bufferIndex becomes 0 instead of -1. */
  public void reset() throws IOException {
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
    finalOffset = 0;
    ioBuffer.reset(); // make sure to reset the IO buffer!!

