1.balance的第一个判断条件
if( pPage->nOverflow==0 && pPage->nFree<=nMin )
pPage->nOverflow不为0表示btree节点满了(有cell放不下),需要分裂;pPage->nFree<=nMin表示当前节点的空闲空间没有超过上限,也就是cell使用量没有低于最小值,不需要和兄弟节点合并。两个条件同时满足(nOverflow==0且nFree<=nMin)时,说明节点既没有过满也没有过空,不需要平衡
2.cell数量溢出和长度溢出的区别
2.1 长度溢出(内容溢出)
长度溢出指单个cell的内容太长了,需要把超出的部分移到溢出页里,主要处理在fillInCell()函数,只有下面这个条件不满足才往下走
if( nPayload<=pPage->maxLocal ){
.......
return SQLITE_OK;
}
如果太长,那么当前结点只存储spaceLeft长度的内容,其他移到overflow页里
n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
spaceLeft = n;
/* Write the payload into the local Cell and any extra into overflow pages */
while( 1 ){
n = nPayload;
if( n>spaceLeft ) n = spaceLeft;
... ...
memcpy(pPayload, pSrc, n);
}
在上面这个循环里还会分配溢出页
rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
把溢出页的页码加到当前已经写完的部分cell的结尾
mn = pPage->minLocal;
n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
pPrior = &pCell[nHeader+n];
put4byte(pPrior, pgnoOvfl);
继续循环往下写
pPayload = &pOvfl->aData[4];
溢出页的前4字节记录下一个溢出页的页号,如果没有则写0
pPrior = pOvfl->aData;
put4byte(pPrior, 0);
那么在读取存在溢出的cell的时候怎么才能知道第一个节点的长度是多少呢?
答案是这个长度已经被sqlite在代码中固定死了,看下面代码
static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
MemPage *pPage, /* Page containing the cell */
u8 *pCell, /* Pointer to the cell text. */
CellInfo *pInfo /* Fill in this structure */
){
int minLocal; /* Minimum amount of payload held locally */
int maxLocal; /* Maximum amount of payload held locally */
int surplus; /* Overflow payload available for local storage */
minLocal = pPage->minLocal;
maxLocal = pPage->maxLocal;
surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
... ...
}
2.2 数量溢出
数量溢出指的是cell的数量超过单个节点的最大容量,需要按照btree算法进行分裂以保持平衡,从而加快查询速度。判断条件在insertCell()函数里:条件成立时,这个cell不会插入到当前节点,而是先放在pPage->apOvfl[j]里
if( pPage->nOverflow || sz+2>pPage->nFree ){
if( pTemp ){
memcpy(pTemp, pCell, sz);
pCell = pTemp;
}
if( iChild ){
put4byte(pCell, iChild);
}
j = pPage->nOverflow++;
/* Comparison against ArraySize-1 since we hold back one extra slot
** as a contingency. In other words, never need more than 3 overflow
** slots but 4 are allocated, just to be safe. */
assert( j < ArraySize(pPage->apOvfl)-1 );
pPage->apOvfl[j] = pCell;
pPage->aiOvfl[j] = (u16)i;
}
那么pPage->nOverflow什么时候重新设为0呢?以根节点满了为例,这个时候会在balance_deeper()里新建一个子节点,把所有内容拷贝到子节点
if( rc==SQLITE_OK ){
rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
copyNodeContent(pRoot, pChild, &rc);
}
pChild->nOverflow = pRoot->nOverflow;
/* Zero the contents of pRoot. Then install pChild as the right-child. */
zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
循环之后用子结点代替当前结点
do {
int iPage;
MemPage *pPage = pCur->pPage;
... ....
rc = balance_deeper(pPage, &pCur->apPage[1]);
if( rc==SQLITE_OK ){
pCur->apPage[0] = pPage;
pCur->pPage = pCur->apPage[1];
assert( pCur->pPage->nOverflow );
}
子节点又会在balance_quick()里新建一个兄弟节点,把溢出的cell放到兄弟节点里,然后从函数里出来时,把pPage->nOverflow清0
rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
pPage->nOverflow = 0;
3.子节点把一个 cell拷贝到父节点时为什么不拷贝内容
在balance_quick()函数里,发现把溢出的cell拷贝到兄弟结点后,接下来把分裂的一个cell拷贝到父结点,发现只拷贝了cell的头部,而没有拷贝cell的内容。
/* Create a divider cell to insert into pParent. The divider cell
** consists of a 4-byte page number (the page number of pPage) and
** a variable length key value (which must be the same value as the
** largest key on pPage).
**
** To find the largest key value on pPage, first find the right-most
** cell on pPage. The first two fields of this cell are the
** record-length (a variable length integer at most 32-bits in size)
** and the key value (a variable length integer, may have any value).
** The first of the while(...) loops below skips over the record-length
** field. The second while(...) loop copies the key value from the
** cell on pPage into the pSpace buffer.
*/
pCell = findCell(pPage, pPage->nCell-1);
pStop = &pCell[9];
while( (*(pCell++)&0x80) && pCell<pStop );
pStop = &pCell[9];
while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
/* Insert the new divider cell into pParent. */
if( rc==SQLITE_OK ){
insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
0, pPage->pgno, &rc);
}
/* Set the right-child pointer of pParent to point to the new page. */
put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
这里的原因是SQLite的table tree使用了B+tree的算法:B+tree的内部结点只保存关键字,并没有指向关键字具体信息的指针,因此其内部结点相对B树更小。如果把同一内部结点的所有关键字存放在同一盘块中,那么盘块所能容纳的关键字数量就更多,一次性读入内存的待查找关键字也就更多,相对来说IO读写次数也就降低了。
4.CellInfo结构体的作用
CellInfo info是BtCursor的一个成员,在调用getCellInfo获取cell信息时会用到
static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
if( pCur->info.nSize==0 ){
pCur->curFlags |= BTCF_ValidNKey;
btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
}else{
assertCellInfo(pCur);
}
}
最开始出现在虚拟机的OP_Rowid指令里
v = sqlite3BtreeIntegerKey(pC->uc.pCursor);
5. cell的key值在哪里排序
如果没有设置PRIMARY KEY,那么key值(rowid)默认从小到大递增,不需要排序;NewRowid指令里会通过下面的moveToRightmost()把游标位置pCur->ix移到最右边的cell
static int moveToRightmost(BtCursor *pCur){
Pgno pgno;
int rc = SQLITE_OK;
MemPage *pPage = 0;
assert( cursorOwnsBtShared(pCur) );
assert( pCur->eState==CURSOR_VALID );
while( !(pPage = pCur->pPage)->leaf ){
pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
pCur->ix = pPage->nCell;
rc = moveToChild(pCur, pgno);
if( rc ) return rc;
}
pCur->ix = pPage->nCell-1;
assert( pCur->info.nSize==0 );
assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
return SQLITE_OK;
}
有主键的时候才排序。在虚拟机的NotExists指令里会执行sqlite3BtreeMovetoUnpacked函数,用二分法确定key值在cell的位置
getVarint(pCell, (u64*)&nCellKey);
if( nCellKey<intKey ){
lwr = idx+1;
if( lwr>upr ){
c = -1; break; }
}else if( nCellKey>intKey ){
upr = idx-1;
if( lwr>upr ){
c = +1; break; }
}else{
assert( nCellKey==intKey );
pCur->ix = (u16)idx;
if( !pPage->leaf ){
lwr = idx;
goto moveto_next_layer;
}else{
pCur->curFlags |= BTCF_ValidNKey;
pCur->info.nKey = nCellKey;
pCur->info.nSize = 0;
*pRes = 0;
return SQLITE_OK;
}
}
assert( lwr+upr>=0 );
idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */
然后保存在pCur->ix里
pCur->ix = (u16)idx;
之后插入时会用到
idx = pCur->ix;
insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
6. 分裂时如何保证key值的顺序
一开始root结点满了的时候,会在balance_deeper()里新建一个子结点,把所有内容移到这个子结点。然后在balance_quick()里把子结点最大的cell的key值移到作为父结点的root结点,这个cell的左孩子页号设为该子结点的页号;接着再新建一个兄弟结点,把溢出的cell(pPage->apOvfl[0])移到兄弟结点,父结点的最右孩子设为该兄弟结点的页号
/* Set the right-child pointer of pParent to point to the new page. */
put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
那么为什么能保证这个溢出cell的key值比左孩子节点的所有key值都大呢?
这个可以从balance_quick()的名字里看出来是一种快速平衡,也就是cell值默认从小到大排列的时候才会进balance_quick(),否则进balance_nonroot()
if( rc==SQLITE_OK ){
#ifndef SQLITE_OMIT_QUICKBALANCE
if( pPage->intKeyLeaf
&& pPage->nOverflow==1
&& pPage->aiOvfl[0]==pPage->nCell //新cell的key值默认从小到大排列
&& pParent->pgno!=1
&& pParent->nCell==iIdx
){
rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
}else
#endif
{
rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
pCur->hints&BTREE_BULKLOAD);
}
从下面代码中可以看到pPage->aiOvfl[0]的来源
static void insertCell(
MemPage *pPage, /* Page into which we are copying */
int i, //第i个节点 /* New cell becomes the i-th cell of the page */
u8 *pCell, /* Content of the new cell */
int sz, /* Bytes of content in pCell */
u8 *pTemp, /* Temp storage space for pCell, if needed */
Pgno iChild, /* If non-zero, replace first 4 bytes with this value */
int *pRC /* Read and write return code from here */
){
... ...
pPage->apOvfl[j] = pCell;
pPage->aiOvfl[j] = (u16)i;