阅读顺序
《Postgresql源码(30)Postgresql索引基础B-linked-tree》
《Postgresql源码(31)Btree索引相关系统表和整体结构
《Postgresql源码(32)Btree索引分裂前后结构差异对比
《Postgresql源码(33)Btree索引阅读-整个过程&_bt_first》
《Postgresql源码(34)Btree索引读——_bt_first搜索部分分析
《Postgresql源码(36)Btree索引读——_bt_next搜索部分分析
继续上一篇https://blog.csdn.net/jackgo73/article/details/122875493
本文重点关注双key场景拼scankey的特点 和 _first_key搜索部分的流程和加锁特性。
场景结构:双key跨页搜索索引
当前索引形式
root: 412 branch: 3, 115, 227, 338, 449, 560, 671, 782, 893, 1004 / \ \ \ \ / \ \ leaf: 1 2 4 5 6 ... 111 112 113 114 ...
表结构:
create table t81(id int, info text); create index idx_t81_id_info on t81(id,info); postgres=# select * from bt_metap('idx_t81_id_info'); magic | version | root | level | fastroot | fastlevel -------- --------- ------ ------- ---------- ----------- 340322 | 2 | 116 | 2 | 116 | 2 select itemoffset,ctid,itemlen,nulls,vars from bt_page_items('idx_t81_id_info', 116); itemoffset | ctid | itemlen | nulls | vars ------------ ---------- --------- ------- ------ 1 | (3,1) | 8 | f | f 2 | (115,1) | 48 | f | t 3 | (227,1) | 48 | f | t 4 | (338,1) | 48 | f | t 5 | (449,1) | 48 | f | t 6 | (560,1) | 48 | f | t 7 | (671,1) | 48 | f | t 8 | (782,1) | 48 | f | t 9 | (893,1) | 48 | f | t 10 | (1004,1) | 48 | f | t select itemoffset,ctid,itemlen,nulls,vars from bt_page_items('idx_t81_id_info', 2) limit 10; itemoffset | ctid | itemlen | nulls | vars ------------ -------- --------- ------- ------ 1 | (2,41) | 48 | f | t 2 | (1,21) | 48 | f | t 3 | (1,22) | 48 | f | t 4 | (1,23) | 48 | f | t 5 | (1,24) | 48 | f | t 6 | (1,25) | 48 | f | t 7 | (1,26) | 48 | f | t 8 | (1,27) | 48 | f | t 9 | (1,28) | 48 | f | t 10 | (1,29) | 48 | f | t select itemoffset,ctid,itemlen,nulls,vars from bt_page_items('idx_t81_id_info', 5) limit 10; itemoffset | ctid | itemlen | nulls | vars ------------ -------- --------- ------- ------ 1 | (4,81) | 48 | f | t 2 | (3,61) | 48 | f | t 3 | (3,62) | 48 | f | t 4 | (3,63) | 48 | f | t 5 | (3,64) | 48 | f | t 6 | (3,65) | 48 | f | t 7 | (3,66) | 48 | f | t 8 | (3,67) | 48 | f | t 9 | (3,68) | 48 | f | t 10 | (3,69) | 48 | f | t select * from t81 where ctid='(1,23)'; id | info ----- ---------------------------------- 143 | 8a5f7d9783b38755e1df35c7ff7456c6 select * from t81 where ctid='(3,63)'; id | info ----- ---------------------------------- 423 | c985618de291db89879d2c8589e39449
用于分析的SQL 预期1:索引会扫描1、2、4、54leaf页面,访问3号branch页面,访问116号root页面 预期2:搜索条件将减少到4个,定位初始搜索点的条件是id>143 and info>‘8a’
select * from t81 where id>143 and info>'8a' and id<423 and info<'9f' -- 正常条件 and id>100 and id<500 and info>'00' and info<'ff'; -- 冗余条件
结果
id | info
-----+----------------------------------
149 | 950ff540153cef9af64b53ca17fa3d1e
162 | 99980baf92019a558251089685444d7f
175 | 9e70997ba78c8458bfa8f78439fa6664
178 | 8bc6444cc5e80a6c29407ae55053bcfc
193 | 9a6fe7fcfe2191134424f1471e44287d
208 | 9a598f8859cd3b48fb691f1a1b61b4eb
209 | 91e1f34f0594ea1df35efae36e385e1f
238 | 9a1242106efdc75be7c4f19975509052
243 | 8e5ba26c5dfa9127756d3224cdb4397b
248 | 98dd36696506d990fcf3e6e9e39181d6
255 | 8de35b6524b7780c155c6dfd51baa4fb
268 | 9d2c8bee035881f0944ea596ad080cef
275 | 8c0c22ec820d248e574fbde6441da2fc
295 | 8c855108642c6be76bff4f451de41404
300 | 998dfbe7f18a86d3fe8d6965ce836cfc
327 | 8da63af1288433da429d272f5087096c
328 | 8b649f2659d1a049f62f6b77fdd553e9
335 | 9bd93e5a2033e24926305cc1f5fd7aec
338 | 91b271b1e4011ecd07e8e3d1105db64d
350 | 9106fa32fc398321e8dff200ce4f2aef
356 | 9e154f50faa64931d34fe86639e7c986
358 | 8daf29713228d27188949d2092ba8498
360 | 9b003ad7b5169a20d4d21f084749ee84
395 | 99060b7ae0af3fb48aae1770fb98a844
413 | 961c6e221306ee733559773bd6bb522f
415 | 930235728f96601ab13e0d1726669e7a
420 | 93b764104fbcb1d2bdd23e378c9222ce
计划
explain analyze select * from t81
where id>143 and info>'8a' and id<423 and info<'9f' -- 正常条件
and id>100 and id<500 and info>'00' and info<'ff'; -- 冗余条件
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Index Only Scan using idx_t81_id_info on t81 (cost=0.42..62.35 rows=23 width=37) (actual time=0.028..0.062 rows=27 loops=1)
Index Cond: ((id > 143) AND (id < 423) AND (id > 100) AND (id < 500) AND (info > '8a'::text) AND (info < '9f'::text) AND (info > '00'::text) AND (info < 'ff'::text))
Heap Fetches: 27
Planning time: 0.195 ms
Execution time: 0.091 ms
下面开始分析
_first_key
直奔主题_first_key。ps. 避免断到其他堆栈里:break _bt_first if scan->numberOfKeys==8
按上一篇的流程来看:
第一步:_bt_preprocess_keys开始处理冗余key,结果记录到so中 第二步:开始构造startKeys(ScanKey数组)(找到能用的scankey) 第三步:比较函数转换为通用的row comparisons 第四步:确定临界值判断规则 第五步:scankeys&边界规则准备完毕,开始扫描
第一步:_bt_preprocess_keys开始处理冗余key,结果记录到so中
预处理 _bt_preprocess_keys(scan)前后对比,冗余key被合并。 输入:scan->keyData
输出: ((BTScanOpaque)scan->opaque)->keyData
执行前:scan->numberOfKeys
8
执行后:p ((BTScanOpaque)scan->opaque)->numberOfKeys
4
--
执行前:sk_argument(sk_func): 143(int4gt) 423(int4lt) 100(int4gt) 500(int4lt) 25572760(text_gt) 25573760(text_lt) 25575200(text_gt) 25575760(text_lt)
执行后:sk_argument(sk_func): 143(int4gt) 423(int4lt) 25572760(text_gt) 25573760(text_lt)
-- command
p scan->keyData[0] ... scan->keyData[7]
p ((BTScanOpaque)scan->opaque)->numberOfKeys
p ((BTScanOpaque)scan->opaque)->keyData[0]
第二步:开始构造startKeys(ScanKey数组)(找到能用的scankey)
select * from t81
where id>143 and info>'8a' and id<423 and info<'9f' -- 正常条件
and id>100 and id<500 and info>'00' and info<'ff'; -- 冗余条件
找起始点所需的所有key保存在startKeys中。
输入:so->keyData 输出:startKeys数组(根据规则选择ScanKey chosen,然后保存到startKeys中)
遍历so->keyData
第一轮循环:(id>143)大于号走BTGreaterStrategyNumber分支,chosen = cur
第二轮循环:(id<423)小于号不能当做定位起点,do nothing
第三轮循环:(info>'8a')新一列会触发startKeys保存chosen,并且已经找到 > 直接跳出循环。
for (cur = so->keyData, i = 0;; cur++, i++)
if (i >= so->numberOfKeys || cur->sk_attno != curattr) // 已经到下一列了,可以保存上一轮的结果
结果:startKeys保存了一条id>143
(gdb) p *startKeys[0]
$34 = {sk_flags = 131072, sk_attno = 1, sk_strategy = 5, sk_subtype = 23, sk_collation = 0, sk_func = {fn_addr = 0x8f65ac <int4gt>,
fn_oid = 147, fn_nargs = 2, fn_strict = 1 '\001', fn_retset = 0 '\000', fn_stats = 2 '\002', fn_extra = 0x0, fn_mcxt = 0x187d1b0,
fn_expr = 0x0}, sk_argument = 143}
(gdb) p keysCount
$35 = 1
第三步:比较函数转换为通用的row comparisons
比较函数替换int4gt转换为btint4cmp
输入:遍历startKeys 输出:scankeys
(gdb) p *startKeys[0]
$44 = {sk_flags = 131072, sk_attno = 1, sk_strategy = 5, sk_subtype = 23, sk_collation = 0, sk_func = {fn_addr = 0x8f65ac <int4gt>}, sk_argument = 143}
(gdb) p scankeys[0]
$43 = {sk_flags = 131072, sk_attno = 1, sk_strategy = 0, sk_subtype = 23, sk_collation = 0, sk_func = {fn_addr = 0x4f2f5d <btint4cmp>}, sk_argument = 143}
函数区别:返回值变了,可以用于二分查找
Datum
int4gt(PG_FUNCTION_ARGS)
{
int32 arg1 = PG_GETARG_INT32(0);
int32 arg2 = PG_GETARG_INT32(1);
PG_RETURN_BOOL(arg1 > arg2);
}
Datum
btint4cmp(PG_FUNCTION_ARGS)
{
int32 a = PG_GETARG_INT32(0);
int32 b = PG_GETARG_INT32(1);
if (a > b)
PG_RETURN_INT32(A_GREATER_THAN_B);
else if (a == b)
PG_RETURN_INT32(0);
else
PG_RETURN_INT32(A_LESS_THAN_B);
}
第四步:确定临界值判断规则
走BTGreaterStrategyNumber分支nextkey = true; goback = false;
第五步:scankeys&边界规则准备完毕,找到起始leaf页面
逻辑都在_bt_search中,分几部分分析:
第五步分解_bt_binsrch
这里先看下比较独立的二分搜索的函数
OffsetNumber
_bt_binsrch(Relation rel,
Buffer buf,
int keysz,
ScanKey scankey,
bool nextkey)
{
Page page;
BTPageOpaque opaque;
OffsetNumber low,
high;
int32 result,
cmpval;
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
low / high 相当于数组索引值,后面判断还是使用指向的数据值来判断。
- low就是页面第一条记录的offset,如果是最右页面就是第一条,如果是非最右页面是第二条
- high用页面指针算出来
low = P_FIRSTDATAKEY(opaque);
high = PageGetMaxOffsetNumber(page);
/* * If there are no keys on the page, return the first available slot. Note * this covers two cases: the page is really empty (no keys), or it * contains only a high key. The latter case is possible after vacuuming. * This can never happen on an internal page, however, since they are * never empty (an internal page must have children). */
if (high < low)
return low;
/* * Binary search to find the first key on the page >= scan key, or first * key > scankey when nextkey is true. * * For nextkey=false (cmpval=1), the loop invariant is: all slots before * 'low' are < scan key, all slots at or after 'high' are >= scan key. * * For nextkey=true (cmpval=0), the loop invariant is: all slots before * 'low' are <= scan key, all slots at or after 'high' are > scan key. * * We can fall out when high == low. */
nextkey决定了对边界值得处理方式:
nextkey=true:if (result == 1 || result == 0) low = mid + 1 else high = mid;
- 当遇到相等情况时,移动左边界,且移动后右半部分包含相等的那个值,达到只要>scankey的效果
nextkey=false: if(result == 1) low = mid + 1 else high = mid;
- 当遇到相等情况时,移动右边界,让左半部分包含相等的那个值,达到要>=scankey的效果
上面解释太抽象,看下面例子:
例如有这样的数据:
scankey = 5
index 0 1 2 3 4 5 6 7 8
value 1 2 3 5 5 5 6 7 9
分别来看:
next=true(1,0移动左边界)(-1移动右边界) | next=false(1移动左边界)(0,-1移动右边界)
【第一轮】注意因为low=mid+1,意味如果把相等值划到左半部分,右区间就不包含这个值了,达到>的效果。
mid = low + ((high - low) / 2) = 4 mid = low + ((high - low) / 2) = 4
value[4]=5 value[4]=5
low = mid + 1 = 5 high = 8 low = 0 high = 4
【第二轮】
mid = low + ((high - low) / 2) = 6 mid = low + ((high - low) / 2) = 2
value[6]=6 value[2]=3
low = 5 high = mid + 1 = 6 low = mid + 1 = 4 high = 4
结束:low = 3 (value[3] = 5,找到第一个大于等于5的值)
【第三轮】
mid = low + ((high - low) / 2) = 5
values[5]=5
low = mid + 1 = 6 high = 8
结束:low = 6 (values[6]=6,找到第一个大于5的值)
high++; /* establish the loop invariant for high */
cmpval = nextkey ? 0 : 1; /* select comparison value */
while (high > low)
{
OffsetNumber mid = low + ((high - low) / 2);
/* We have low <= mid < high, so mid points at a real slot */
result = _bt_compare(rel, keysz, scankey, page, mid);
if (result >= cmpval)
low = mid + 1; // 关键在这一行,这里+1后会排除一个相等元素
else
high = mid;
}
/* * At this point we have high == low, but be careful: they could point * past the last slot on the page. * * On a leaf page, we always return the first key >= scan key (resp. > * scan key), which could be the last slot + 1. */
if (P_ISLEAF(opaque))
return low;
/* * On a non-leaf page, return the last key < scan key (resp. <= scan key). * There must be one if _bt_compare() is playing by the rules. */
Assert(low > P_FIRSTDATAKEY(opaque));
return OffsetNumberPrev(low);
}
第五步分解_bt_getroot
总结:通过0号页面meta找到root,没有就分配一个,然后返回带读锁的root。(meta会缓存避免反复读)
步骤:简单buffer操作,不贴代码了。
【1】从rel->rd_amcache缓存读meta,然后直接读root(_bt_getbuf(rel, rootblkno, BT_READ)),如果root页面可用直接返回。
【2】如果没有缓存,_bt_getbuf读0号页面,如果发现root是空的,创建root,记录XLOG(meta、root两个页面),释放meta锁,加上root读锁返回。
【3】如果没有缓存,_bt_getbuf读0号页面,如果发现root可用,把meta缓存到rel->rd_amcache,释放meta锁,加上root读锁返回。
第五步分解_bt_moveright
总结:用opaque->btpo_next向右读,直到找到highkey>scankey的第一个页面(nextkey=1)或直到找到highkey>=scankey的第一个页面(nextkey=0)。
Buffer
_bt_moveright(Relation rel,
Buffer buf,
int keysz,
ScanKey scankey,
bool nextkey,
bool forupdate,
BTStack stack,
int access,
Snapshot snapshot)
{
Page page;
BTPageOpaque opaque;
int32 cmpval;
/* * When nextkey = false (normal case): if the scan key that brought us to * this page is > the high key stored on the page, then the page has split * and we need to move right. (If the scan key is equal to the high key, * we might or might not need to move right; have to scan the page first * anyway.) * * When nextkey = true: move right if the scan key is >= page's high key. * * The page could even have split more than once, so scan as far as * needed. * * We also have to move right if we followed a link that brought us to a * dead page. */
nextkey=1 表示 tuple>scankey nextkey=0 表示 tuple>=scankey
cmpval = nextkey ? 0 : 1;
for (;;)
{
page = BufferGetPage(buf);
TestForOldSnapshot(snapshot, rel, page);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_RIGHTMOST(opaque))
break;
/* * Finish any incomplete splits we encounter along the way. */
if (forupdate && P_INCOMPLETE_SPLIT(opaque))
{
BlockNumber blkno = BufferGetBlockNumber(buf);
/* upgrade our lock if necessary */
if (access == BT_READ)
{
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBuffer(buf, BT_WRITE);
}
if (P_INCOMPLETE_SPLIT(opaque))
_bt_finish_split(rel, buf, stack);
else
_bt_relbuf(rel, buf);
/* re-acquire the lock in the right mode, and re-check */
buf = _bt_getbuf(rel, blkno, access);
continue;
}
关键的比较逻辑在这里: if(_bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval) 没找到继续下页 else 找到了
两种情况
-
nextkey=1 表示 tuple>scankey
if(_bt_compare(rel, keysz, scankey, page, P_HIKEY) >= 0) 没找到继续下页 else 找到了
- _bt_compare>=0 : scankey >= tuple
- else分支表示scankey < tuple
-
nextkey=0 表示 tuple>=scankey
if(_bt_compare(rel, keysz, scankey, page, P_HIKEY) >= 1) 没找到继续下页 else 找到了
- _bt_compare>=1 : scankey > tuple
- else分支表示scankey <= tuple
找到第一个highkey比scankey大的,scankey就在当前页面中。
scankey=15,nextkey=1时,会找到第三页
scankey=15,nextkey=0时,会找到第二页
scankey=16,nextkey=0/1时,会找到第三页
[7] [15] [20]
1 7 15 20
2 9 16 22
3 10 17
5 12 18
6 13 19
if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
{
/* step right one page */
buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
continue;
}
else
break;
}
if (P_IGNORE(opaque))
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
return buf;
}
第五步整体流程_bt_search
总结: 【1】_bt_getroot拿到root页面带读锁 【2】进入循环:右移动直到找到第一个符合要求的页面(符合要求的定义见上一节) 【3】如果是叶子页面结束搜索返回 【4】如果不是叶子页面(那就是branch页面)二分搜索页面,找到符合要求的tuple,取出指向的页面 【5】释放当前页面读锁,加下一个页面读锁,继续【2】,直到找到叶子页面为止
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
Buffer *bufP, int access, Snapshot snapshot)
{
BTStack stack_in = NULL;
*bufP = _bt_getroot(rel, access);
if (!BufferIsValid(*bufP))
return (BTStack) NULL;
for (;;)
{
Page page;
BTPageOpaque opaque;
OffsetNumber offnum;
ItemId itemid;
IndexTuple itup;
BlockNumber blkno;
BlockNumber par_blkno;
BTStack new_stack;
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
(access == BT_WRITE), stack_in,
BT_READ, snapshot);
/* if this is a leaf page, we're done */
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_ISLEAF(opaque))
break;
offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
itemid = PageGetItemId(page, offnum);
itup = (IndexTuple) PageGetItem(page, itemid);
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
par_blkno = BufferGetBlockNumber(*bufP);
new_stack = (BTStack) palloc(sizeof(BTStackData));
new_stack->bts_blkno = par_blkno;
new_stack->bts_offset = offnum;
memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
new_stack->bts_parent = stack_in;
/* drop the read lock on the parent page, acquire one on the child */
*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
/* okay, all set to move down a level */
stack_in = new_stack;
}
return stack_in;
}
第六步:从起始leaf中找到所需ctid
继续看_bt_first
最后一部分代码
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
...
// 上一节已经分析
stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
scan->xs_snapshot);
/* don't need to keep the stack around... */
_bt_freestack(stack);
if (!BufferIsValid(buf))
{
/* * We only get here if the index is completely empty. Lock relation * because nothing finer to lock exists. */
PredicateLockRelation(rel, scan->xs_snapshot);
/* * mark parallel scan as done, so that all the workers can finish * their scan */
_bt_parallel_done(scan);
BTScanPosInvalidate(so->currPos);
return false;
}
else
PredicateLockPage(rel, BufferGetBlockNumber(buf),
scan->xs_snapshot);
_bt_initialize_more_data(so, dir);
/* position to the precise item on the page */
上面已经用scankey定位到leaf页面,这里继续在leaf页面中二分定位到起始tuple:
offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
/* * If nextkey = false, we are positioned at the first item >= scan key, or * possibly at the end of a page on which all the existing items are less * than the scan key and we know that everything on later pages is greater * than or equal to scan key. * * If nextkey = true, we are positioned at the first item > scan key, or * possibly at the end of a page on which all the existing items are less * than or equal to the scan key and we know that everything on later * pages is greater than scan key. * * The actually desired starting point is either this item or the prior * one, or in the end-of-page case it's the first item on the next page or * the last item on this page. Adjust the starting offset if needed. (If * this results in an offset before the first item or after the last one, * _bt_readpage will report no items found, and then we'll step to the * next page as needed.) */
if (goback)
offnum = OffsetNumberPrev(offnum);
/* remember which buffer we have pinned, if any */
Assert(!BTScanPosIsValid(so->currPos));
so->currPos.buf = buf;
/* * Now load data from the first page of the scan. */
这里查到的offnum=5,也就是这一条数据:
select * from t81 where ctid='(1,23)';
id | info
-----+----------------------------------
143 | 8a5f7d9783b38755e1df35c7ff7456c6
select itemoffset,ctid,itemlen,nulls,vars from bt_page_items('idx_t81_id_info', 2) limit 10;
itemoffset | ctid | itemlen | nulls | vars
------------+--------+---------+-------+------
1 | (2,41) | 48 | f | t
2 | (1,21) | 48 | f | t
3 | (1,22) | 48 | f | t
4 | (1,23) | 48 | f | t <--------143
5 | (1,24) | 48 | f | t <--------144 查询条件是id>143,起始位置在这里
6 | (1,25) | 48 | f | t
7 | (1,26) | 48 | f | t
8 | (1,27) | 48 | f | t
9 | (1,28) | 48 | f | t
10 | (1,29) | 48 | f | t
_bt_readpage函数会缓存结果:
_bt_readpage
so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
so->currPos.nextPage = opaque->btpo_next;
while (offnum <= maxoff)
itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
if (itup != NULL)
_bt_saveitem(so, itemIndex, offnum, itup); // 把ctid缓存到本地变量中
读取完成后 1、currItem指向第一条数据(已本地缓存) 2、scan->xs_ctup.t_self保存第一条的ctid。
结束。
if (!_bt_readpage(scan, dir, offnum))
{
/* * There's no actually-matching data on this page. Try to advance to * the next page. Return false if there's no matching data at all. */
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
if (!_bt_steppage(scan, dir))
return false;
}
else
{
/* Drop the lock, and maybe the pin, on the current page */
_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
}
readcomplete:
/* OK, itemIndex says what to return */
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
if (scan->xs_want_itup)
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
return true;
}
_bt_next
下一篇35继续展开分析