PostgreSQL ring buffer策略
When running a query that needs to access a large number of pages just once, such as VACUUM or a large sequential scan, a different strategy is used. A page that has been touched only by such a scan is unlikely to be needed again soon, so instead of running the normal clock sweep algorithm and blowing out the entire buffer cache, a small ring of buffers is allocated using the normal clock sweep algorithm and those buffers are reused for the whole scan. This also implies that much of the write traffic caused by such a statement will be done by the backend itself and not pushed off onto other processes.
引用了src/backend/storage/buffer/README中对ring buffer的介绍,ring buffer就是在当需要大量访问页面的情况下如vacuum或者大量的全表扫描时采用的一种特殊的策略。不会像正常的时钟扫描算法交换出整个缓冲区,而是在一小块缓冲区上使用时钟扫描算法,并会重用缓冲区完成整个扫描。这样就会避免大量全表扫描带来的缓冲区命中率的下降。
一、使用ring buffer策略的场景
/src/backend/storage/buffer/freelist.c/GetAccessStrategy
/*
 * Create a BufferAccessStrategy object of the requested type, or return
 * NULL for BAS_NORMAL (callers treat NULL as "use the default clock-sweep
 * behavior over the whole shared buffer pool").
 *
 * NOTE(review): this excerpt omits the "BufferAccessStrategy" return-type
 * line that precedes the function name in freelist.c.
 */
GetAccessStrategy(BufferAccessStrategyType btype)
{
BufferAccessStrategy strategy;
int ring_size;		/* number of buffers (BLCKSZ-sized blocks) in the ring */
/*
 * Select ring size to use. See buffer/README for rationales.
 *
 * Note: if you change the ring size for BAS_BULKREAD, see also
 * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
 */
switch (btype)
{
case BAS_NORMAL:
/* if someone asks for NORMAL, just give 'em a "default" object */
return NULL;
case BAS_BULKREAD:
/* 256KB ring: small enough to fit in CPU L2 cache */
ring_size = 256 * 1024 / BLCKSZ;
break;
case BAS_BULKWRITE:
/* 16MB ring: a smaller ring would force overly frequent WAL flushes */
ring_size = 16 * 1024 * 1024 / BLCKSZ;
break;
case BAS_VACUUM:
/* 256KB ring, same sizing rationale as sequential scans */
ring_size = 256 * 1024 / BLCKSZ;
break;
default:
elog(ERROR, "unrecognized buffer access strategy: %d",
(int) btype);
return NULL; /* keep compiler quiet */
}
/* Make sure ring isn't an undue fraction of shared buffers */
ring_size = Min(NBuffers / 8, ring_size);
/* Allocate the object and initialize all elements to zeroes */
/* flexible-array allocation: header plus ring_size Buffer slots */
strategy = (BufferAccessStrategy)
palloc0(offsetof(BufferAccessStrategyData, buffers) +
ring_size * sizeof(Buffer));
/* Set fields that don't start out zero */
strategy->btype = btype;
strategy->ring_size = ring_size;
return strategy;
}
1.BAS_BULKREAD
批量读的情况,会分配256KB的内存区域。README中解释了选择这个大小的原因:它刚好能放进CPU的L2缓存,这会让OS缓存到共享缓存的传输效率更高。那什么情况才算批量读呢?
src/backend/access/heap/heapam.c/
/*
 * Fragment from heapam.c (enclosing function not shown in this excerpt).
 * Decide whether a heap scan may use the BAS_BULKREAD strategy and/or
 * synchronized scanning: only for non-local (non-temp) relations whose
 * size exceeds 1/4 of shared_buffers.
 */
if (!RelationUsesLocalBuffers(scan->rs_rd) &&
scan->rs_nblocks > NBuffers / 4)
{
allow_strat = scan->rs_allow_strat;
allow_sync = scan->rs_allow_sync;
}
else
allow_strat = allow_sync = false;
if (allow_strat)
{
/* During a rescan, keep the previous strategy object. */
if (scan->rs_strategy == NULL)
scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
}
这个意思就是说如果表不是临时表并且扫描的块数大于shared_buffer的1/4的块数,就会使用BAS_BULKREAD策略。
2.BAS_BULKWRITE
批量写的情况下,会分配16MB的空间,原因是较小的ring buffer会频繁的进行wal flush,降低写入的效率。批量写的情况为如下场景:
- COPY FROM 命令
- CREATE TABLE AS 命令
- CREATE MATERIALIZED VIEW或者 REFRESH MATERIALIZED VIEW 命令
- ALTER TABLE 命令
3.BAS_VACUUM
vacuum会使用256KB的内存区域,类似顺序扫描。如果有脏页,wal必须被写入才能重用这个buffer。
二、测试
pg_buffercache可以帮助我们观察shared_buffer中使用的情况
1.BAS_BULKREAD测试
#创建pg_buffercache插件
postgres=# create extension pg_buffercache ;
CREATE EXTENSION
#查看shared_buffer大小
postgres=# show shared_buffers ;
shared_buffers
----------------
128MB
(1 row)
#创建表
postgres=# create table test as select generate_series(1,1000000);
SELECT 1000000
#查看表大小,表大小需要大于shared_buffer的1/4
postgres=# \dt+ test
List of relations
Schema | Name | Type | Owner | Size | Description
--------+------+-------+----------+-------+-------------
public | test | table | postgres | 35 MB |
(1 row)
#重启数据库,清空shared_buffer
pg_ctl restart
#关闭并行
postgres=# set max_parallel_workers_per_gather =0;
SET
#查询shared_buffer中test表中的块的信息
postgres=# select * from pg_buffercache where relfilenode ='test'::regclass;
bufferid | relfilenode | reltablespace | reldatabase | relforknumber | relblocknumber | isdirty | usagecount | pinning_backends
----------+-------------+---------------+-------------+---------------+----------------+---------+------------+------------------
(0 rows)
#进行全部扫描
postgres=# select count(1) from test;
count
---------
1000000
(1 row)
#查询shared_buffer中test表中的块的信息
postgres=# select count(1) from pg_buffercache where relfilenode ='test'::regclass;
count
-------
32
(1 row)
发现有32个块,刚好与256KB大小对应(32块 × 8KB/块 = 256KB)
2.BAS_BULKWRITE
#导出数据文件
postgres=# copy test to '/home/postgres/test.csv' with csv;
COPY 1000000
#重启数据库
pg_ctl restart
#查询shared_buffer中test表中的块的信息
postgres=# select count(1) from pg_buffercache where relfilenode ='test'::regclass;
count
-------
0
(1 row)
#copy from 导入test表中
postgres=# copy test from '/home/postgres/test.csv';
COPY 1000000
#查询shared_buffer中test表中的块的信息
postgres=# select count(1) from pg_buffercache where relfilenode='test'::regclass;
count
-------
2051
(1 row)
批量写入分配的大小为16MB,预想的块数应该为2048(16MB ÷ 8KB/块 = 2048),发现与预想有点差别,下面我们看下原因
pg_buffercache中有个relforknumber这个字段,这个定义如下
/src/include/common/relpath.h
/*
 * ForkNumber -- identifies which physical file ("fork") of a relation a
 * buffer page belongs to (matches pg_buffercache's relforknumber column).
 */
typedef enum ForkNumber
{
InvalidForkNumber = -1,
MAIN_FORKNUM = 0,			/* main data file */
FSM_FORKNUM,				/* free space map (1) */
VISIBILITYMAP_FORKNUM,		/* visibility map (2) */
INIT_FORKNUM				/* init fork for unlogged relations (3) */
/*
 * NOTE: if you add a new fork, change MAX_FORKNUM and possibly
 * FORKNAMECHARS below, and update the forkNames array in
 * src/common/relpath.c
 */
} ForkNumber;
0就是main数据文件,1就是fsm文件,2就是vm文件。
我们在查下pg_buffercache
postgres=# select relforknumber,count(1) from pg_buffercache where relfilenode='test'::regclass group by relforknumber;
relforknumber | count
---------------+-------
0 | 2048
1 | 3
(2 rows)
发现数据文件buffer块与我们之前预想的结果是一致的,多的三个块是访问fsm文件的块。
postgres=# select relfilenode,relforknumber,relblocknumber,isdirty,usagecount,pinning_backends from pg_buffercache where relfilenode='test'::regclass and relforknumber!='0';
relfilenode | relforknumber | relblocknumber | isdirty | usagecount | pinning_backends
-------------+---------------+----------------+---------+------------+------------------
16437 | 1 | 3 | f | 5 | 0
16437 | 1 | 0 | f | 5 | 0
16437 | 1 | 2 | f | 5 | 0
(3 rows)
访问fsm的调用关系是CopyFrom->CopyFromInsertBatch->heap_multi_insert->RelationGetBufferForTuple->GetPageWithFreeSpace->fsm_search。如果缓冲的元组数达到1000(MAX_BUFFERED_TUPLES)或者行的大小加起来大于64KB就会触发一次刷写,这时就会去访问fsm文件寻找空余的空间,具体怎么访问fsm文件,这里就不具体展开讨论了。
src/backend/commands/copy.c/CopyFrom
/*
 * Fragment from CopyFrom() in copy.c (enclosing function not shown).
 * Accumulate incoming tuples into a batch buffer, flushing the batch via
 * heap_multi_insert() when either the tuple count or total size limit is
 * reached.
 */
if (useHeapMultiInsert)
{
/* Add this tuple to the tuple buffer */
if (nBufferedTuples == 0)
firstBufferedLineNo = cstate->cur_lineno;
bufferedTuples[nBufferedTuples++] = tuple;
bufferedTuplesSize += tuple->t_len;
/*
 * If the buffer filled up, flush it. Also flush if the
 * total size of all the tuples in the buffer becomes
 * large, to avoid using large amounts of memory for the
 * buffer when the tuples are exceptionally wide.
 */
/* MAX_BUFFERED_TUPLES is 1000; 65535 bytes is the ~64KB size cap */
if (nBufferedTuples == MAX_BUFFERED_TUPLES ||
bufferedTuplesSize > 65535)
{
CopyFromInsertBatch(cstate, estate, mycid, hi_options,
resultRelInfo, myslot, bistate,
nBufferedTuples, bufferedTuples,
firstBufferedLineNo);
nBufferedTuples = 0;
bufferedTuplesSize = 0;
}
}
3.BAS_VACUUM
#查询shared_buffer中test表中的块的信息
postgres=# select count(1) from pg_buffercache where relfilenode ='test'::regclass;
count
-------
0
(1 row)
#vacuum操作
postgres=# vacuum test ;
VACUUM
postgres=# select count(1) from pg_buffercache where relfilenode ='test'::regclass;
count
-------
37
(1 row)
预想结果应该是32,结果是37,我们再看下原因
postgres=# select relfilenode,relforknumber,relblocknumber,isdirty,usagecount,pinning_backends from pg_buffercache where relfilenode='test'::regclass and relforknumber!='0';
relfilenode | relforknumber | relblocknumber | isdirty | usagecount | pinning_backends
-------------+---------------+----------------+---------+------------+------------------
16437 | 2 | 0 | t | 2 | 0
16437 | 1 | 2 | f | 5 | 0
16437 | 1 | 3 | t | 5 | 0
16437 | 1 | 0 | t | 1 | 0
16437 | 1 | 1 | t | 1 | 0
(5 rows)
这里可以看到有四次是对fsm文件不同块进行的访问,有1次对vm文件进行的访问,减掉5刚好就是32个块。