阅读顺序
《Postgresql源码(30)Postgresql索引基础B-linked-tree》
《Postgresql源码(31)Btree索引相关系统表和整体结构
《Postgresql源码(32)Btree索引分裂前后结构差异对比
《Postgresql源码(33)Btree索引阅读-整个过程&_bt_first》
《Postgresql源码(34)Btree索引读——_bt_first搜索部分分析
《Postgresql源码(36)Btree索引读——_bt_next搜索部分分析
总结
-
BTScanPosData会在so->currPos->items当前查询页面的缓存ctid。每次查询一个页面,都会使用_bt_steppage更新currPos的内容。
-
叶页加锁特点:_bt_steppage会使用
_bt_readnextpage
打开下一个页面,加开始检查数据,满足要求heap tid记录到so->currPos->items中。同时只有一把锁。 -
branch页面加锁特点:btree爬非叶节点时,从root到leaf的过程是用_bt_relandgetbuf加锁,也就是先放一把旧锁,再加一个页面的锁。同时只有一把锁。(
_bt_search
)
页面结构及预期
继续分析34篇文章中提到的这篇文章SQL
用于分析的SQL预期:_bt_next会扫过1、2、4三次leaf页面
-- 起始 > (1,19) 终止 < (3,59) -- PAGE1:扫描1条:itemoffset 141-141 -- PAGE2:扫描140条:itemoffset 2-141 -- PAGE4:扫描139条:itemoffset 2-140 select * from t81 where id>139 and id<419;
数据构建
create table t81(id int, info text); create index idx_t81_id_info on t81(id,info); insert into t81 select generate_series(1,149370), md5(random()::text); select * from bt_metap('idx_t81_id_info'); magic | version | root | level | fastroot | fastlevel -------- --------- ------ ------- ---------- ----------- 340322 | 2 | 116 | 2 | 116 | 2 select itemoffset,ctid from bt_page_items('idx_t81_id_info', 116); itemoffset | ctid ------------ ---------- 1 | (3,1) 2 | (115,1) 3 | (227,1)
4 | (338,1)
5 | (449,1)
6 | (560,1)
7 | (671,1)
8 | (782,1)
9 | (893,1)
10 | (1004,1)
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 1);
itemoffset | ctid
------------+---------
1 | (1,21)
2 | (0,1)
3 | (0,2)
4 | (0,3)
5 | (0,4)
...
140 | (1,19) <------------------------ start
141 | (1,20)
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 2);
itemoffset | ctid
------------+---------
1 | (2,41)
2 | (1,21)
3 | (1,22)
4 | (1,23)
5 | (1,24)
6 | (1,25)
...
140 | (2,39)
141 | (2,40)
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 4);
itemoffset | ctid
------------+---------
1 | (3,61)
2 | (2,41)
3 | (2,42)
4 | (2,43)
5 | (2,44)
6 | (2,45)
7 | (2,46)
...
140 | (3,59) <------------------------ end
141 | (3,60)
select * from t81 where ctid='(1,19)';
id | info
-----+----------------------------------
143 | 2c33d634b63de9c16306ac81abd2dcf2
select * from t81 where ctid='(3,59)';
id | info
-----+----------------------------------
423 | d13e311ad061155067e44af2c655d5b2
-- 起始 > (1,19) 终止 < (3,59)
-- PAGE1:扫描1条:itemoffset 141-141
-- PAGE2:扫描140条:itemoffset 2-141
-- PAGE4:扫描139条:itemoffset 2-140
select * from t81 where id>139 and id<419;
前面《Postgresql源码(33)Btree索引读——整体流程&_bt_first》
已经对定位初始页做了分析,这里已经拿到了初始ctid,继续分析后面的搜索流程。
重点关注scan过程用到的数据结构。
_bt_next
b PortalRun
b _bt_next
在_bt_first执行后,BTScanOpaque的数据已经完整,这里缓存了查询下一条所需要的全部数据。
函数流程比较简单:
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
BTScanPosItem *currItem;
/* * Advance to next tuple on current page; or if there's no more, try to * step to the next page with data. */
if (ScanDirectionIsForward(dir))
{
if (++so->currPos.itemIndex > so->currPos.lastItem)
{
if (!_bt_steppage(scan, dir))
return false;
}
}
else
{
if (--so->currPos.itemIndex < so->currPos.firstItem)
{
if (!_bt_steppage(scan, dir))
return false;
}
}
/* OK, itemIndex says what to return */
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
if (scan->xs_want_itup)
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
return true;
}
这里重点看下BTScanOpaque用到的currPos数据结构和当前内容。
typedef struct BTScanOpaqueData
{
...
BTScanPosData currPos; /* current position data */
...
} BTScanOpaqueData;
BTScanPosData
typedef struct BTScanPosData
{
Buffer buf; /* if valid, the buffer is pinned */
XLogRecPtr lsn; /* pos in the WAL stream when page was read */
BlockNumber currPage; /* page referenced by items array */
BlockNumber nextPage; /* page's right link when we scanned it */
/* * moreLeft and moreRight track whether we think there may be matching * index entries to the left and right of the current page, respectively. * We can clear the appropriate one of these flags when _bt_checkkeys() * returns continuescan = false. */
bool moreLeft;
bool moreRight;
/* * If we are doing an index-only scan, nextTupleOffset is the first free * location in the associated tuple storage workspace. */
int nextTupleOffset;
/* * The items array is always ordered in index order (ie, increasing * indexoffset). When scanning backwards it is convenient to fill the * array back-to-front, so we start at the last slot and fill downwards. * Hence we need both a first-valid-entry and a last-valid-entry counter. * itemIndex is a cursor showing which entry was last returned to caller. */
int firstItem; /* first valid index in items[] */
int lastItem; /* last valid index in items[] */
int itemIndex; /* current index in items[] */
BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
} BTScanPosData;
typedef struct BTScanPosItem /* what we remember about each match */
{
ItemPointerData heapTid; /* TID of referenced heap item */
OffsetNumber indexOffset; /* index item's location within page */
LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */
} BTScanPosItem;
执行SQL后,第一次_bt_next
-- 起始 > (1,19) 终止 < (3,59)
-- PAGE1:扫描1条:itemoffset 141-141
-- PAGE2:扫描140条:itemoffset 2-141
-- PAGE4:扫描139条:itemoffset 2-140
select * from t81 where id>139 and id<419;
这里应该扫描PAGE1的最后一条:
(gdb) p so->currPos
{
buf = 150,
lsn = 128202491872,
currPage = 1,
nextPage = 2,
moreLeft = 0 '\000',
moreRight = 1 '\001',
nextTupleOffset = 48,
firstItem = 0,
lastItem = 0,
itemIndex = 0,
items = {
{
heapTid = {
ip_blkid = {
bi_hi = 0, bi_lo = 1}, ip_posid = 20}, indexOffset = 141, tupleOffset = 0}}
}
注意第一次执行后,会进入_bt_steppage
缓存下一页的数据,下一次执行_bt_next:
(gdb) p so->currPos
{
buf = 152,
lsn = 128202499320,
currPage = 2,
nextPage = 4,
moreLeft = 1 '\001',
moreRight = 1 '\001',
nextTupleOffset = 6720,
firstItem = 0,
lastItem = 139,
itemIndex = 0,
items = {
{
heapTid = {
ip_blkid = {
bi_hi = 0, bi_lo = 1}, ip_posid = 21}, indexOffset = 2, tupleOffset = 0},
{
heapTid = {
ip_blkid = {
bi_hi = 0, bi_lo = 1}, ip_posid = 22}, indexOffset = 3, tupleOffset = 48},
{
heapTid = {
ip_blkid = {
bi_hi = 0, bi_lo = 1}, ip_posid = 23}, indexOffset = 4, tupleOffset = 96},
...
}
取数循环
ExecutePlan:外层循环,每次拿一条
/*
* Loop until we've processed the proper number of tuples from the plan.
*/
for (;;)
ExecProcNode
ExecIndexOnlyScan
ExecScan
ExecScanFetch
IndexOnlyNext:内部拼TupleTableSlot返回
index_getnext_tid
btgettuple
_bt_next
(1)按so->currPos.items找到当前缓存的heaptid
(2)记录scan->xs_itup:{t_tid = {ip_blkid = {bi_hi = 0, bi_lo = 1}, ip_posid = 22}, t_info = 16432}
index_fetch_heap
StoreIndexTuple