PostgreSQL checkpoint中用于刷一個臟page的函數是什么

137次閱讀

共計 11661 個字符，預計需要花費 30 分鐘才能閱讀完成。

這篇文章主要講解了“PostgreSQL checkpoint 中用于刷一個臟 page 的函數是什么”，文中的講解內容簡單清晰，易于學習與理解，下面請大家跟著丸趣 TV 小編的思路慢慢深入，一起來研究和學習“PostgreSQL checkpoint 中用于刷一個臟 page 的函數是什么”吧！

一、數據結構

宏定義
checkpoints request flag bits, 檢查點請求標記位定義.

/*
 * OR-able request flag bits for checkpoints. The "cause" bits are used only
 * for logging purposes. Note: the flags must be defined so that it's
 * sensible to OR together request flags arising from different requestors.
 *
 * Each flag below occupies its own distinct bit (0x0001 through 0x0100).
 */
/* These directly affect the behavior of CreateCheckPoint and subsidiaries */
#define CHECKPOINT_IS_SHUTDOWN 0x0001 /* Checkpoint is for shutdown */
#define CHECKPOINT_END_OF_RECOVERY 0x0002 /* Like shutdown checkpoint, but
 * issued at end of WAL recovery */
#define CHECKPOINT_IMMEDIATE 0x0004 /* Do it without delays */
#define CHECKPOINT_FORCE 0x0008 /* Force even if no activity */
#define CHECKPOINT_FLUSH_ALL 0x0010 /* Flush all pages, including those
 * belonging to unlogged tables */
/* These are important to RequestCheckpoint */
#define CHECKPOINT_WAIT 0x0020 /* Wait for completion */
#define CHECKPOINT_REQUESTED 0x0040 /* Checkpoint request has been made */
/* These indicate the cause of a checkpoint request */
#define CHECKPOINT_CAUSE_XLOG 0x0080 /* XLOG consumption */
#define CHECKPOINT_CAUSE_TIME 0x0100 /* Elapsed time */

二、源碼解讀

SyncOneBuffer, 在 syncing 期間處理一個 buffer, 其主要處理邏輯如下:
1. 獲取 buffer 描述符
2. 鎖定 buffer
3. 根據 buffer 狀態和輸入參數執行相關判斷 / 處理
4. 釘住臟頁, 上共享鎖, 調用 FlushBuffer 刷盤
5. 解鎖 / 解釘和其他收尾工作

/*
 * SyncOneBuffer -- process a single buffer during syncing.
 *
 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
 * buffers marked recently used, as these are not replacement candidates.
 *
 * Returns a bitmask containing the following flag bits:
 *  BUF_WRITTEN: we wrote the buffer.
 *  BUF_REUSABLE: buffer is available for replacement, ie, it has
 *      pin count 0 and usage count 0.
 *
 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
 * after locking it, but we don't care all that much.)
 *
 * Note: caller must have done ResourceOwnerEnlargeBuffers.
 */
static int
SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
{
    BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    int         result = 0;
    uint32      buf_state;
    BufferTag   tag;

    ReservePrivateRefCountEntry();

    /*
     * Check whether buffer needs writing.
     *
     * We can make this check without taking the buffer content lock so long
     * as we mark pages dirty in access methods *before* logging changes with
     * XLogInsert(): if someone marks the buffer dirty just after our check we
     * don't worry because our checkpoint.redo points before log record for
     * upcoming changes and so we are not required to write such dirty buffer.
     */
    buf_state = LockBufHdr(bufHdr);

    if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
        BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
    {
        /* Unpinned and not recently used: a replacement candidate. */
        result |= BUF_REUSABLE;
    }
    else if (skip_recently_used)
    {
        /* Caller told us not to write recently-used buffers */
        UnlockBufHdr(bufHdr, buf_state);
        return result;
    }

    if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
    {
        /* It's clean (or not valid), so nothing to do */
        UnlockBufHdr(bufHdr, buf_state);
        return result;
    }

    /*
     * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
     * buffer is clean by the time we've locked it.)
     */
    PinBuffer_Locked(bufHdr);
    LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);

    /* We hold no smgr reference for the buffer's relation, so pass NULL. */
    FlushBuffer(bufHdr, NULL);

    LWLockRelease(BufferDescriptorGetContentLock(bufHdr));

    /* Copy the tag while the buffer is still pinned; it is used after unpin. */
    tag = bufHdr->tag;

    UnpinBuffer(bufHdr, true);

    ScheduleBufferTagForWriteback(wb_context, &tag);

    return result | BUF_WRITTEN;
}

FlushBuffer
FlushBuffer 函數物理上把共享緩存刷盤, 主要實現函數還是 smgrwrite(storage manager write).

/*
 * FlushBuffer
 *      Physically write out a shared buffer.
 *
 * NOTE: this actually just passes the buffer contents to the kernel; the
 * real write to disk won't happen until the kernel feels like it. This
 * is okay from our point of view since we can redo the changes from WAL.
 * However, we will need to force the changes to disk via fsync before
 * we can checkpoint WAL.
 *
 * The caller must hold a pin on the buffer and have share-locked the
 * buffer contents. (Note: a share-lock does not prevent updates of
 * hint bits in the buffer, so the page could change while the write
 * is in progress, but we assume that that will not invalidate the data
 * written.)
 *
 * If the caller has an smgr reference for the buffer's relation, pass it
 * as the second parameter. If not, pass NULL.
 */
static void
FlushBuffer(BufferDesc *buf, SMgrRelation reln)
{
    XLogRecPtr  recptr;
    ErrorContextCallback errcallback;
    instr_time  io_start,
                io_time;
    Block       bufBlock;
    char       *bufToWrite;
    uint32      buf_state;

    /*
     * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
     * false, then someone else flushed the buffer before we could, so we need
     * not do anything.
     */
    if (!StartBufferIO(buf, false))
        return;

    /* Setup error traceback support for ereport() */
    errcallback.callback = shared_buffer_write_error_callback;
    errcallback.arg = (void *) buf;
    errcallback.previous = error_context_stack;
    error_context_stack = &errcallback;

    /* Find smgr relation for buffer */
    if (reln == NULL)
        reln = smgropen(buf->tag.rnode, InvalidBackendId);

    TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
                                        buf->tag.blockNum,
                                        reln->smgr_rnode.node.spcNode,
                                        reln->smgr_rnode.node.dbNode,
                                        reln->smgr_rnode.node.relNode);

    buf_state = LockBufHdr(buf);

    /*
     * Run PageGetLSN while holding header lock, since we don't have the
     * buffer locked exclusively in all cases.
     */
    recptr = BufferGetLSN(buf);

    /* To check if block content changes while flushing. - vadim 01/17/97 */
    buf_state &= ~BM_JUST_DIRTIED;
    UnlockBufHdr(buf, buf_state);

    /*
     * Force XLOG flush up to buffer's LSN. This implements the basic WAL
     * rule that log updates must hit disk before any of the data-file changes
     * they describe do.
     *
     * However, this rule does not apply to unlogged relations, which will be
     * lost after a crash anyway. Most unlogged relation pages do not bear
     * LSNs since we never emit WAL records for them, and therefore flushing
     * up through the buffer LSN would be useless, but harmless. However,
     * GiST indexes use LSNs internally to track page-splits, and therefore
     * unlogged GiST pages bear "fake" LSNs generated by
     * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
     * LSN counter could advance past the WAL insertion point; and if it did
     * happen, attempting to flush WAL through that location would fail, with
     * disastrous system-wide consequences. To make sure that can't happen,
     * skip the flush if the buffer isn't permanent.
     */
    if (buf_state & BM_PERMANENT)
        XLogFlush(recptr);

    /*
     * Now it's safe to write buffer to disk. Note that no one else should
     * have been able to write it while we were busy with log flushing because
     * we have the io_in_progress lock.
     */
    bufBlock = BufHdrGetBlock(buf);

    /*
     * Update page checksum if desired. Since we have only shared lock on the
     * buffer, other processes might be updating hint bits in it, so we must
     * copy the page to private storage if we do checksumming.
     */
    bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);

    if (track_io_timing)
        INSTR_TIME_SET_CURRENT(io_start);

    /*
     * bufToWrite is either the shared buffer or a copy, as appropriate.
     */
    smgrwrite(reln,
              buf->tag.forkNum,
              buf->tag.blockNum,
              bufToWrite,
              false);

    if (track_io_timing)
    {
        /* Elapsed write time = now - io_start; add it to both stat sinks. */
        INSTR_TIME_SET_CURRENT(io_time);
        INSTR_TIME_SUBTRACT(io_time, io_start);
        pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
        INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
    }

    pgBufferUsage.shared_blks_written++;

    /*
     * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
     * end the io_in_progress state.
     */
    TerminateBufferIO(buf, true, 0);

    TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
                                       buf->tag.blockNum,
                                       reln->smgr_rnode.node.spcNode,
                                       reln->smgr_rnode.node.dbNode,
                                       reln->smgr_rnode.node.relNode);

    /* Pop the error context stack */
    error_context_stack = errcallback.previous;
}

三、跟蹤分析

測試腳本

testdb=# update t_wal_ckpt set c2 = 'C4#'||substr(c2,4,40);
UPDATE 1
testdb=# checkpoint;

跟蹤分析

(gdb) handle SIGINT print nostop pass
SIGINT is used by the debugger.
Are you sure you want to change it? (y or n) y
Signal Stop Print Pass to program Description
SIGINT No Yes Yes Interrupt
(gdb) b SyncOneBuffer
Breakpoint 1 at 0x8a7167: file bufmgr.c, line 2357.
(gdb) c
Continuing.
Program received signal SIGINT, Interrupt.
Breakpoint 1, SyncOneBuffer (buf_id=0, skip_recently_used=false, wb_context=0x7fff27f5ae00) at bufmgr.c:2357
2357 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
(gdb) n
2358 int result = 0;
(gdb) p *bufHdr
$1 = {tag = {rnode = {spcNode = 1663, dbNode = 16384, relNode = 221290}, forkNum = MAIN_FORKNUM, blockNum = 0}, buf_id = 0, 
 state = {value = 3548905472}, wait_backend_pid = 0, freeNext = -2, content_lock = {tranche = 53, state = { value = 536870912}, waiters = {head = 2147483647, tail = 2147483647}}}
(gdb) n
2362 ReservePrivateRefCountEntry();
(gdb) 
2373 buf_state = LockBufHdr(bufHdr);
(gdb) 
2375 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
(gdb) 
2376 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
(gdb) 
2375 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
(gdb) 
2380 else if (skip_recently_used)
(gdb) 
2387 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
(gdb) 
2398 PinBuffer_Locked(bufHdr);
(gdb) p buf_state
$2 = 3553099776
(gdb) n
2399 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
(gdb) 
2401 FlushBuffer(bufHdr, NULL);
(gdb) step
FlushBuffer (buf=0x7fedc4a68300, reln=0x0) at bufmgr.c:2687
2687 if (!StartBufferIO(buf, false))
(gdb) n
2691 errcallback.callback = shared_buffer_write_error_callback;
(gdb) 
2692 errcallback.arg = (void *) buf;
(gdb) 
2693 errcallback.previous = error_context_stack;
(gdb) 
2694 error_context_stack = &errcallback;
(gdb) 
2697 if (reln == NULL)
(gdb) 
2698 reln = smgropen(buf->tag.rnode, InvalidBackendId);
(gdb) 
2700 TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
(gdb) 
2706 buf_state = LockBufHdr(buf);
(gdb) 
2712 recptr = BufferGetLSN(buf);
(gdb) 
2715 buf_state &= ~BM_JUST_DIRTIED;
(gdb) p recptr
$3 = 16953421760
(gdb) n
2716 UnlockBufHdr(buf, buf_state);
(gdb) 
2735 if (buf_state & BM_PERMANENT)
(gdb) 
2736 XLogFlush(recptr);
(gdb) 
2743 bufBlock = BufHdrGetBlock(buf);
(gdb) 
2750 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
(gdb) p bufBlock
$4 = (Block) 0x7fedc4e68300
(gdb) n
2752 if (track_io_timing)
(gdb) 
2758 smgrwrite(reln,
(gdb) 
2764 if (track_io_timing)
(gdb) 
2772 pgBufferUsage.shared_blks_written++;
(gdb) 
2778 TerminateBufferIO(buf, true, 0);
(gdb) 
2780 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
(gdb) 
2787 error_context_stack = errcallback.previous;
(gdb) 
2788 }
(gdb) 
SyncOneBuffer (buf_id=0, skip_recently_used=false, wb_context=0x7fff27f5ae00) at bufmgr.c:2403
2403 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
(gdb) 
2405 tag = bufHdr->tag;
(gdb) 
2407 UnpinBuffer(bufHdr, true);
(gdb) 
2409 ScheduleBufferTagForWriteback(wb_context, &tag);
(gdb) 
2411 return result | BUF_WRITTEN;
(gdb) 
2412 }
(gdb)

感謝各位的閱讀，以上就是“PostgreSQL checkpoint 中用于刷一個臟 page 的函數是什么”的內容了，經過本文的學習后，相信大家對 PostgreSQL checkpoint 中用于刷一個臟 page 的函數是什么這一問題有了更深刻的體會，具體使用情況還需要大家實踐驗證。這里是丸趣 TV，丸趣 TV 小編將為大家推送更多相關知識點的文章，歡迎關注！

正文完