5_PostgreSQL WAL内部结构解析_PageHeader
内容概述
PG WAL段文件默认大小为16MB,内部被划分为8KB大小的page,每个page由1个PageHeader和若干条XLOG record组成。WAL文件中第一个page的PageHeader结构为XLogLongPageHeaderData,其余page(第N个,N>=2)的PageHeader结构为XLogPageHeaderData。本文解析XLogLongPageHeaderData/XLogPageHeaderData这两个结构。
图解PG WAL内部结构

PageHeader结构体定义
通过分析结构体定义可知,XLogLongPageHeaderData内嵌了完整的XLogPageHeaderData结构(std字段),并在其后增加了xlp_sysid/xlp_seg_size/xlp_xlog_blcksz 3个字段。
/*
* Each page of XLOG file has a header like this:
*
* This is the "short" header. XLogLongPageHeaderData embeds it as its
* first member, so both header kinds start with the same fields.
*/
#define XLOG_PAGE_MAGIC 0xD110 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
uint16 xlp_magic; /* magic value for correctness checks */
uint16 xlp_info; /* flag bits, see below */
TimeLineID xlp_tli; /* TimeLineID of first record on page */
XLogRecPtr xlp_pageaddr; /* XLOG address of this page */
/*
* When there is not enough space on current page for whole record, we
* continue on the next page. xlp_rem_len is the number of bytes
* remaining from a previous page; it tracks xl_tot_len in the initial
* header. Note that the continuation data isn't necessarily aligned.
*/
uint32 xlp_rem_len; /* total len of remaining data for record */
} XLogPageHeaderData;
/* Size of the short header, rounded up with MAXALIGN. */
#define SizeOfXLogShortPHD MAXALIGN(sizeof(XLogPageHeaderData))
/* Pointer type used to address the start of a page buffer as a header. */
typedef XLogPageHeaderData *XLogPageHeader;
/*
* When the XLP_LONG_HEADER flag is set, we store additional fields in the
* page header. (This is ordinarily done just in the first page of an
* XLOG file.) The additional fields serve to identify the file accurately.
*
* The short header is the first member, which is what allows a page
* pointer of type XLogPageHeader to be cast to XLogLongPageHeader.
*/
typedef struct XLogLongPageHeaderData
{
XLogPageHeaderData std; /* standard header fields */
uint64 xlp_sysid; /* system identifier from pg_control */
uint32 xlp_seg_size; /* just as a cross-check */
uint32 xlp_xlog_blcksz; /* just as a cross-check */
} XLogLongPageHeaderData;
/* Size of the long header, rounded up with MAXALIGN. */
#define SizeOfXLogLongPHD MAXALIGN(sizeof(XLogLongPageHeaderData))
/* Pointer type used to address a page buffer as a long header. */
typedef XLogLongPageHeaderData *XLogLongPageHeader;
/*
* Flag bits stored in xlp_info. Each flag occupies its own bit, so several
* flags can be OR-ed together in one header.
*/
/* When record crosses page boundary, set this flag in new page's header */
#define XLP_FIRST_IS_CONTRECORD 0x0001
/* This flag indicates a "long" page header */
#define XLP_LONG_HEADER 0x0002
/* This flag indicates backup blocks starting in this page are optional */
#define XLP_BKP_REMOVABLE 0x0004
/* Replaces a missing contrecord; see CreateOverwriteContrecordRecord */
#define XLP_FIRST_IS_OVERWRITE_CONTRECORD 0x0008
/*
* All defined flag bits in xlp_info (used for validity checking of header).
* Equals the bitwise OR of the four flags above.
*/
#define XLP_ALL_FLAGS 0x000F
源码位置:src/include/access/xlog_internal.h
解析XLogLongPageHeaderData结构
物理结构生成
### step1. 创建表t1,切换日志,并查询当前日志文件
create table t1(a1 int);
select pg_switch_wal();
SELECT pg_walfile_name(pg_current_wal_lsn());
### step2. 插入一条记录,并生成新的日志文件
insert into t1 values(99);
select pg_switch_wal();
SELECT pg_walfile_name(pg_current_wal_lsn());
二进制解析
[postgres@enmo pgwal]$ dd if=00000001000000020000008B bs=8192 skip=0 count=1 |hexdump -C
1+0 records in
1+0 records out
8192 bytes (8.2 kB) copied, 3.9047e-05 s, 210 MB/s
00000000 10 d1 06 00 01 00 00 00 00 00 00 8b 02 00 00 00 |................|
00000010 00 00 00 00 00 00 00 00 7b be 24 97 1b 53 57 63 |........{.$..SWc|
00000020 00 00 00 01 00 20 00 00 32 00 00 00 00 00 00 00 |..... ..2.......|
00000030 78 1d 00 8a 02 00 00 00 10 08 00 00 ba dd c6 25 |x..............%|
00000040 ff 18 00 00 00 00 00 00 00 00 00 43 b4 9a 6a 05 |...........C..j.|
00000050 00 00 6a 05 00 00 69 05 00 00 00 00 00 00 00 00 |..j...i.........|
00000060 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00002000
[postgres@enmo pgwal]$
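### XLogPageHeaderData 结构体(该环境为小端序存储,括号内为按小端序解读后的值)
uint16 xlp_magic: 10 d1 (0xD110,即XLOG_PAGE_MAGIC)
uint16 xlp_info: 06 00 (0x0006 = XLP_LONG_HEADER | XLP_BKP_REMOVABLE)
TimeLineID xlp_tli: 01 00 00 00 (时间线1)
XLogRecPtr xlp_pageaddr: 00 00 00 8b 02 00 00 00 (0x000000028B000000,即LSN 2/8B000000,与文件名00000001000000020000008B对应)
uint32 xlp_rem_len: 00 00 00 00 (0,本page开头没有上一page遗留的跨页记录数据)
对齐填充: 00 00 00 00 (偏移0x14~0x17,不属于任何字段;填充后XLogPageHeaderData共占24字节,xlp_sysid从偏移0x18开始)
### XLogLongPageHeaderData结构体
uint64 xlp_sysid: 7b be 24 97 1b 53 57 63 (0x6357531B9724BE7B)
uint32 xlp_seg_size: 00 00 00 01 (0x01000000 = 16777216,即16MB)
uint32 xlp_xlog_blcksz: 00 20 00 00 (0x00002000 = 8192,即8KB)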
执行函数解析
通过跟踪及调试程序可知,src/backend/access/transam/xlog.c 中的函数AdvanceXLInsertBuffer负责初始化WAL新page的PageHeader;当新page是WAL段文件的第一个page时,该函数会把PageHeader初始化为XLogLongPageHeaderData。核心代码如下所示:
/*
* Initialize XLOG buffers, writing out old buffers if they still contain
* unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
* true, initialize as many pages as we can without having to write out
* unwritten data. Any new pages are initialized to zeros, with pages headers
* initialized properly.
*/
static void
AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
int nextidx;
XLogRecPtr OldPageRqstPtr;
XLogwrtRqst WriteRqst;
XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
XLogRecPtr NewPageBeginPtr;
XLogPageHeader NewPage;
int npages = 0;
LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
.........
/*
* Now the next buffer slot is free and we can set it up to be the
* next output page.
*/
NewPageBeginPtr = XLogCtl->InitializedUpTo;
NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
/*
* Be sure to re-zero the buffer so that bytes beyond what we've
* written will look like zeroes and not valid XLOG records...
*/
MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
/*
* Fill the new page's header
*/
NewPage->xlp_magic = XLOG_PAGE_MAGIC;
/* NewPage->xlp_info = 0; */ /* done by memset */
NewPage->xlp_tli = tli;
NewPage->xlp_pageaddr = NewPageBeginPtr;
/* NewPage->xlp_rem_len = 0; */ /* done by memset */
/*
* If online backup is not in progress, mark the header to indicate
* that WAL records beginning in this page have removable backup
* blocks. This allows the WAL archiver to know whether it is safe to
* compress archived WAL data by transforming full-block records into
* the non-full-block format. It is sufficient to record this at the
* page level because we force a page switch (in fact a segment
* switch) when starting a backup, so the flag will be off before any
* records can be written during the backup. At the end of a backup,
* the last page will be marked as all unsafe when perhaps only part
* is unsafe, but at worst the archiver would miss the opportunity to
* compress a few records.
*/
if (!Insert->forcePageWrites)
NewPage->xlp_info |= XLP_BKP_REMOVABLE;
/*
* If first page of an XLOG segment file, make it a long header.
*/
if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
{
XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
NewLongPage->xlp_sysid = ControlFile->system_identifier;
NewLongPage->xlp_seg_size = wal_segment_size;
NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
NewPage->xlp_info |= XLP_LONG_HEADER;
}
.................
解析XLogPageHeaderData结构
生成WAL日志多个pages
insert into t1
select n from generate_series(1,1000) as n;
二进制解析
通过hexdump验证,第一个PageHeader为:XLogLongPageHeaderData结构,第N(N>=2)个PageHeader为:XLogPageHeaderData
[postgres@enmo pgwal]$ dd if=00000001000000020000008B bs=8192 skip=0 count=1 |hexdump -C|head -3
1+0 records in
1+0 records out
8192 bytes (8.2 kB) copied, 3.2369e-05 s, 253 MB/s
00000000 10 d1 06 00 01 00 00 00 00 00 00 8b 02 00 00 00 |................|
00000010 00 00 00 00 00 00 00 00 7b be 24 97 1b 53 57 63 |........{.$..SWc|
00000020 00 00 00 01 00 20 00 00 32 00 00 00 00 00 00 00 |..... ..2.......|
[postgres@enmo pgwal]$ dd if=00000001000000020000008B bs=8192 skip=1 count=1 |hexdump -C|head -3
1+0 records in
1+0 records out
8192 bytes (8.2 kB) copied, 3.5012e-05 s, 234 MB/s
00000000 10 d1 05 00 01 00 00 00 00 20 00 8b 02 00 00 00 |......... ......|
00000010 23 00 00 00 00 00 00 00 00 20 0a 00 7f 06 00 00 |#........ ......|
00000020 05 00 00 00 04 40 00 00 02 00 00 00 ff 03 01 00 |.....@..........|
[postgres@enmo pgwal]$ dd if=00000001000000020000008B bs=8192 skip=2 count=1 |hexdump -C|head -3
1+0 records in
1+0 records out
8192 bytes (8.2 kB) copied, 3.4905e-05 s, 235 MB/s
00000000 10 d1 04 00 01 00 00 00 00 40 00 8b 02 00 00 00 |.........@......|
00000010 00 00 00 00 00 00 00 00 3b 00 00 00 6a 05 00 00 |........;...j...|
00000020 c0 3f 00 8b 02 00 00 00 00 0a 00 00 a1 cf ae c9 |.?..............|
[postgres@enmo pgwal]$
小结
通过源码调试、二进制dump等方式对PG WAL的PageHeader结构进行了深入分析,整体结构还是非常清晰的,如果对文章内容有疑问欢迎留言讨论。
最后修改时间:2022-10-31 08:52:19
「喜欢这篇文章,您的关注和赞赏是给作者最好的鼓励」
关注作者
【版权声明】本文为墨天轮用户原创内容,转载时必须标注文章的来源(墨天轮),文章链接,文章作者等基本信息,否则作者和墨天轮有权追究责任。如果您发现墨天轮中有涉嫌抄袭或者侵权的内容,欢迎发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。




