SQLite源码学习(39) balance的一些准备工作

1.balance的第一个判断条件

if( pPage->nOverflow==0 && pPage->nFree<=nMin )

pPage->nOverflow不为0表示btree节点满了、有cell放不下,需要分裂;pPage->nFree<=nMin表示当前节点的空闲空间没有超过上限,即cell使用量没有低于最小值,不需要与兄弟节点合并。因此当nOverflow==0且nFree<=nMin同时成立时,该节点无需做平衡。

2.cell数量溢出和长度溢出的区别

2.1 长度溢出

长度溢出指单个cell的内容太长了,需要把超出的部分移到溢出页里,主要处理在fillInCell()函数,只有下面这个条件不满足才往下走

  if( nPayload<=pPage->maxLocal ){
    
    
    .......
    return SQLITE_OK;
  }

如果太长,那么当前结点只存储spaceLeft长度的内容,其他移到overflow页里

  n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
  spaceLeft = n;
    /* Write the payload into the local Cell and any extra into overflow pages */
  while( 1 ){
    
    
    n = nPayload;
    if( n>spaceLeft ) n = spaceLeft;
    ... ...
    memcpy(pPayload, pSrc, n);
  }

在上面这个循环里还会分配溢出页

rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);

把溢出页的页码加到当前已经写完的部分cell的结尾

  mn = pPage->minLocal;
  n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
  pPrior = &pCell[nHeader+n];
  put4byte(pPrior, pgnoOvfl);

继续循环往下写

pPayload = &pOvfl->aData[4];

溢出页的前4字节记录下一个溢出页的页号,如果没有则写0

      pPrior = pOvfl->aData;
      put4byte(pPrior, 0);

那么在读取存在溢出的cell的时候,怎么才能知道保存在当前结点(本地)的那部分内容的长度是多少呢?
答案是这个长度不需要单独保存,sqlite在代码中用固定的公式根据nPayload把它重新计算出来,和写入时用的是同一个公式,看下面代码

static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
  MemPage *pPage,         /* Page containing the cell */
  u8 *pCell,              /* Pointer to the cell text. */
  CellInfo *pInfo         /* Fill in this structure */
){
    
    
  int minLocal;  /* Minimum amount of payload held locally */
  int maxLocal;  /* Maximum amount of payload held locally */
  int surplus;   /* Overflow payload available for local storage */

  minLocal = pPage->minLocal;
  maxLocal = pPage->maxLocal;
  surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
  ... ...
  }

2.2 数量溢出

数量溢出指的是cell的数量超过单个节点的最大容量,需要按照btree算法进行分裂以保持平衡,从而加快查询速度。判断条件在insertCell()函数里,这个cell不会插入到当前节点,而是先放在pPage->apOvfl[j]里

  if( pPage->nOverflow || sz+2>pPage->nFree ){
    
    
    if( pTemp ){
    
    
      memcpy(pTemp, pCell, sz);
      pCell = pTemp;
    }
    if( iChild ){
    
    
      put4byte(pCell, iChild);
    }
    j = pPage->nOverflow++;
    /* Comparison against ArraySize-1 since we hold back one extra slot
    ** as a contingency.  In other words, never need more than 3 overflow
    ** slots but 4 are allocated, just to be safe. */
    assert( j < ArraySize(pPage->apOvfl)-1 );
    pPage->apOvfl[j] = pCell;
    pPage->aiOvfl[j] = (u16)i;
  }

那么pPage->nOverflow什么时候重新设为0呢?以根节点满了为例,这个时候会在balance_deeper()里新建一个子节点,把所有内容都拷贝到子节点

  if( rc==SQLITE_OK ){
    
    
    rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
    copyNodeContent(pRoot, pChild, &rc);
  }
  pChild->nOverflow = pRoot->nOverflow;
  /* Zero the contents of pRoot. Then install pChild as the right-child. */
  zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);

循环之后用子结点代替当前结点

  do {
    
    
    int iPage;
    MemPage *pPage = pCur->pPage;
   ...  ....
        rc = balance_deeper(pPage, &pCur->apPage[1]);
        if( rc==SQLITE_OK ){
    
    
          pCur->apPage[0] = pPage;
          pCur->pPage = pCur->apPage[1];
          assert( pCur->pPage->nOverflow );
        }

子节点又会在balance_quick()里新建一个兄弟节点,把溢出的cell放到兄弟节点里,然后从函数里出来时,把pPage->nOverflow清0

rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
pPage->nOverflow = 0;

3.子节点把一个 cell拷贝到父节点时为什么不拷贝内容

在balance_quick()函数里,把溢出的cell拷贝到兄弟结点后,接下来会把一个用于分隔的cell(divider cell)拷贝到父结点,但是只拷贝了cell头部的key值,而没有拷贝cell的内容。

    /* Create a divider cell to insert into pParent. The divider cell
    ** consists of a 4-byte page number (the page number of pPage) and
    ** a variable length key value (which must be the same value as the
    ** largest key on pPage).
    **
    ** To find the largest key value on pPage, first find the right-most 
    ** cell on pPage. The first two fields of this cell are the 
    ** record-length (a variable length integer at most 32-bits in size)
    ** and the key value (a variable length integer, may have any value).
    ** The first of the while(...) loops below skips over the record-length
    ** field. The second while(...) loop copies the key value from the
    ** cell on pPage into the pSpace buffer.
    */
    pCell = findCell(pPage, pPage->nCell-1);
    pStop = &pCell[9];
    while( (*(pCell++)&0x80) && pCell<pStop );
    pStop = &pCell[9];
    while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );

    /* Insert the new divider cell into pParent. */
    if( rc==SQLITE_OK ){
    
    
      insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
                   0, pPage->pgno, &rc);
    }

    /* Set the right-child pointer of pParent to point to the new page. */
    put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);

这里的原因是SQLite的table tree使用了b+ tree的算法,B+tree的内部结点并没有指向关键字具体信息的指针。因此其内部结点相对B 树更小。如果把所有同一内部结点的关键字存放在同一盘块中,那么盘块所能容纳的关键字数量也越多。一次性读入内存中的需要查找的关键字也就越多。相对来说IO读写次数也就降低了。

4.CellInfo结构体的作用

CellInfo info是BtCursor的一个成员,在调用getCellInfo获取cell信息时会用到

/*
** Make sure pCur->info describes the cell the cursor currently points at.
**
** When pCur->info.nSize is 0 the cached info is treated as not yet
** populated: the BTCF_ValidNKey flag is set on the cursor and
** btreeParseCell() is called for cell pCur->ix of pCur->pPage, with
** &pCur->info passed as the structure to fill in.  Otherwise the
** existing info is left untouched and only handed to assertCellInfo().
*/
static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
    
    
  if( pCur->info.nSize==0 ){
    
    
    /* Flag is raised before parsing; the parse below supplies info.nKey.
    ** NOTE(review): exact meaning of BTCF_ValidNKey is defined elsewhere
    ** -- confirm. */
    pCur->curFlags |= BTCF_ValidNKey;
    btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
  }else{
    
    
    /* NOTE(review): presumably a debug-only consistency check of the
    ** cached info -- confirm against its definition. */
    assertCellInfo(pCur);
  }
}

最开始出现在虚拟机的OP_Rowid指令里

v = sqlite3BtreeIntegerKey(pC->uc.pCursor);

5. cell的key值在哪里排序

如果没有设置PRIMARY KEY,那么key值(rowid)默认从小到大递增,不需要排序。新的key值在NewRowid指令里设置,此时游标会通过下面的moveToRightmost()移到最右边的叶子结点,pCur->ix指向其中最后一个cell

/*
** Move the cursor down to the right-most leaf entry reachable from the
** page it is currently on.
**
** While the current page is not a leaf, the 4-byte page number stored at
** aData[hdrOffset+8] is read and the cursor descends into that page via
** moveToChild().  On reaching a leaf, pCur->ix is set to the index of
** the last cell on that leaf (nCell-1).
**
** Returns SQLITE_OK on success, or the non-zero code returned by
** moveToChild() if a descent step fails.  The cursor must already be in
** state CURSOR_VALID (asserted).
*/
static int moveToRightmost(BtCursor *pCur){
    
    
  Pgno pgno;
  int rc = SQLITE_OK;
  MemPage *pPage = 0;

  assert( cursorOwnsBtShared(pCur) );
  assert( pCur->eState==CURSOR_VALID );
  /* pPage is re-read from the cursor at the top of every iteration.
  ** NOTE(review): this relies on moveToChild() updating pCur->pPage --
  ** confirm. */
  while( !(pPage = pCur->pPage)->leaf ){
    
    
    /* Page number stored at header offset 8 of the interior page. */
    pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
    /* On an interior page the index is set to nCell, i.e. one past the
    ** last cell, before descending. */
    pCur->ix = pPage->nCell;
    rc = moveToChild(pCur, pgno);
    if( rc ) return rc;
  }
  /* On the leaf: point at the last cell. */
  pCur->ix = pPage->nCell-1;
  /* The cached cell info is expected to be unpopulated at this point. */
  assert( pCur->info.nSize==0 );
  assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
  return SQLITE_OK;
}

有主键的时候才排序。在虚拟机的NotExists指令里会执行sqlite3BtreeMovetoUnpacked函数,用二分法确定key值在cell的位置

        getVarint(pCell, (u64*)&nCellKey);
        if( nCellKey<intKey ){
    
    
          lwr = idx+1;
          if( lwr>upr ){
    
     c = -1; break; }
        }else if( nCellKey>intKey ){
    
    
          upr = idx-1;
          if( lwr>upr ){
    
     c = +1; break; }
        }else{
    
    
          assert( nCellKey==intKey );
          pCur->ix = (u16)idx;
          if( !pPage->leaf ){
    
    
            lwr = idx;
            goto moveto_next_layer;
          }else{
    
    
            pCur->curFlags |= BTCF_ValidNKey;
            pCur->info.nKey = nCellKey;
            pCur->info.nSize = 0;
            *pRes = 0;
            return SQLITE_OK;
          }
        }
        assert( lwr+upr>=0 );
        idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */

然后保存在pCur->ix里

pCur->ix = (u16)idx;

之后插入时会用到

idx = pCur->ix;
insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);

6. 分裂时如何保证key值的顺序

一开始root结点满了的时候,会在balance_deeper()里新建一个子结点把所有内容移到这个子结点,然后在balance_quick()里把子结点最大的cell的key值移到作为父结点的root结点,这个cell的左孩子页号设为该子结点页号,然后再新建一个兄弟结点,把溢出的cell(pPage->apOvfl[0])移到兄弟结点,父结点的最右孩子设为该兄弟结点的页号

    /* Set the right-child pointer of pParent to point to the new page. */
    put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);

那么为什么能保证这个溢出cell的key值比左孩子节点的所有key值都大呢?

这个可以从balance_quick()的名字里看出来是一种快速平衡,也就是cell值默认从小到大排列的时候才会进balance_quick(),否则进balance_nonroot()

      if( rc==SQLITE_OK ){
    
    
#ifndef SQLITE_OMIT_QUICKBALANCE
        if( pPage->intKeyLeaf
         && pPage->nOverflow==1
         && pPage->aiOvfl[0]==pPage->nCell //新cell的key值默认从小到大排列
         && pParent->pgno!=1
         && pParent->nCell==iIdx
        ){
    
    
          rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
        }else
#endif
        {
    
    
          rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
                               pCur->hints&BTREE_BULKLOAD);
         }

从下面代码中可以看到pPage->aiOvfl[0]的来源

static void insertCell(
  MemPage *pPage,   /* Page into which we are copying */
  int i,            //第i个节点  /* New cell becomes the i-th cell of the page */
  u8 *pCell,        /* Content of the new cell */
  int sz,           /* Bytes of content in pCell */
  u8 *pTemp,        /* Temp storage space for pCell, if needed */
  Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
  int *pRC          /* Read and write return code from here */
){
    
    
    ... ...
    pPage->apOvfl[j] = pCell;
    pPage->aiOvfl[j] = (u16)i;

猜你喜欢

转载自blog.csdn.net/pfysw/article/details/109520875