作者:杨向博

平安科技DBA,开源PostgreSQL爱好者。三年数据库运维经验,目前主要负责平安云数据库运维。对SQL优化和源代码分析存在浓厚兴趣。
一、Pathman简介
二、问题背景
postgres=#select * from qsump_pacloud_oscginfo_activity_detail_info_day where id=10;ERROR: constraint"pathman_qsump_pacloud_oscginfo_activity_detail_info_day_10_check"ofpartition "qsump_pacloud_oscginfo_activity_detail_info_day_10"does not existHINT: pg_pathman will be disabled to allow you toresolve this issue
三、问题分析
postgres=#select conname,contype,convalidated from pg_constraint whereconname='pathman_qsump_pacloud_oscginfo_activity_detail_info_day_10_check';-[ RECORD 1]+---------------------------------------------------------conname |pathman_qsump_pacloud_oscginfo_activity_detail_info_day_10_checcontype |cconvalidated | t
get_partition_constraint_expr(Oid partition){Oid conid; * constraint Oid */char *conname; * constraint name */HeapTuple con_tuple;Datum conbin_datum;bool conbin_isnull;Expr *expr; * expression tree for constraint */conname =build_check_constraint_name_relid_internal(partition);conid =get_relation_constraint_oid(partition, conname, true);if (!OidIsValid(conid)){DisablePathman(); * disable pg_pathman since config is broken */ereport(ERROR,(errmsg("constraint \"%s\" ofpartition \"%s\" does not exist",conname,get_rel_name_or_relid(partition)),errhint(INIT_ERROR_HINT)));}…}
char *build_check_constraint_name_relid_internal(Oid relid){AssertArg(OidIsValid(relid));return build_check_constraint_name_relname_internal(get_rel_name(relid));}/** Generatecheck constraint name for a partition.* NOTE: thisfunction does not perform sanity checks at all.*/char *build_check_constraint_name_relname_internal(const char *relname){return psprintf("pathman_%s_check", relname);}
可以看到conname是在build_check_constraint_name_relname_internal函数中拼接的,%s传递的是表名,从报错来看,拼接的conname为:
"pathman_qsump_pacloud_oscginfo_activity_detail_info_day_10_check"
Oidget_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok){Relation pg_constraint;HeapTuple tuple;SysScanDescscan;ScanKeyDataskey[1];Oid conOid= InvalidOid;/** Fetch the constraint tuple frompg_constraint. There may be more than* one match, because constraints are notrequired to have unique names;* if so, error out.*/pg_constraint = heap_open(ConstraintRelationId, AccessShareLock);ScanKeyInit(&skey[0],Anum_pg_constraint_conrelid,BTEqualStrategyNumber,F_OIDEQ,ObjectIdGetDatum(relid));scan =systable_beginscan(pg_constraint, ConstraintRelidIndexId, true,NULL, 1, skey);while(HeapTupleIsValid(tuple = systable_getnext(scan))){Form_pg_constraint con =(Form_pg_constraint) GETSTRUCT(tuple);if (strcmp(NameStr(con->conname), conname)== 0){if(OidIsValid(conOid))ereport(ERROR,(errcode(ERRCODE_DUPLICATE_OBJECT),errmsg("table \"%s\" hasmultiple constraints named \"%s\"",get_rel_name(relid),conname)));conOid= HeapTupleGetOid(tuple);}}systable_endscan(scan);/* If nosuch constraint exists, complain */if(!OidIsValid(conOid) && !missing_ok)ereport(ERROR,(errcode(ERRCODE_UNDEFINED_OBJECT),errmsg("constraint \"%s\" fortable \"%s\" does not exist",conname,get_rel_name(relid))));heap_close(pg_constraint,AccessShareLock);return conOid;}
NameData conname; /* name of this constraint */typedef struct nameData{char data[NAMEDATALEN];} NameData;typedef NameData *Name;#define NAMEDATALEN 64postgres=# selectchar_length('pathman_qsump_pacloud_oscginfo_activity_detail_info_day_10_chec');char_length-------------63(1 row)
可以看到conname的最大长度为64,而pathman_qsump_pacloud_oscginfo_activity_detail_info_day_10_chec的长度为63,再加结束符‘\0’,刚好是64。
6.初步的代码走读,证明了是约束名conname长度超过字段最大长度了,导致存储的约束名不完整,在查询校验约束时报错了。后边的GDB调试也证实了确实是这样,见文章最后。
四、解决方案
1.方案讨论
1)约束名过长是由于表名太长,可以压缩一下表名,将某些地方缩写。这个方案看起来最简单快捷,但是开发兄弟不一定认可接受,可能对于整个应用来说并不是修改一个表名那么简单
站在应用的角度,可以认为是数据库给定的字段长度不够,那么就得考虑是否可以从数据库自身去修改:
2)修改字段最大长度,从64调整为128,但是NAMEDATALEN这个宏,涉及到所有的表、函数、触发器等对象的名称,牵一发则动全身,这个方案风险大,不可行。
3)继续走读代码,发现约束名是在创建分区表的同时创建的,也是通过调用build_check_constraint_name_relname_internal函数,拼接为pathman_%s_check,%s为表名。不难发现仅pathman和check合起来就占了12个字符长度,为了保证可读性check保持不变,可以将pathman缩写为pm,这样能够节省出5个字符长度,那么以现在的表名长度,可以容纳99万个分区表,足够了。

2.方案实施
经过讨论,考虑到改动数据库的风险较大,最终还是选择了压缩表名,把几个单词全拼做了简写,并重建了分区。
3. 其他测试
作为dba,数据库侧的修改,我们还是要自己玩一玩的,验证下其余两个方案
方案2)修改后查询成功,未出现报错。篇幅有限这里不讨论细节
方案3)如下:
修改代码
/** Generatecheck constraint name for a partition.* NOTE: thisfunction does not perform sanity checks at all.*/char *build_check_constraint_name_relname_internal(constchar *relname){/*Modified Begin pathman to pm */returnpsprintf("pm_%s_check",relname);/* End2020-01-10 */}
编译
cd $pg_pathmansrcdir
Make && make install
登录数据库执行 drop extensionpg_pathman ;
重启数据库并create extensionpg_pathman ;
创建分区表,插入数据,查询成功未报错
postgres=# createtable qsump_pacloud_oscginfo_activity_detail_info_day(id intprimarykey,add_time timestamp without time zone not null);CREATE TABLEpostgres=# selectcreate_range_partitions('qsump_pacloud_oscginfo_activity_detail_info_day'::REGCLASS,postgres(# 'add_time',postgres(# '2020-01-01 00:00:00'::timestampwithouttime zone ,postgres(# interval '7 days' ,postgres(# 9,postgres(# true );create_range_partitions-------------------------9(1 row)postgres=# insertinto qsump_pacloud_oscginfo_activity_detail_info_day values(10,'2020-03-1200:00:00'::timestamp without time zone);INSERT 0 1postgres=# select* from qsump_pacloud_oscginfo_activity_detail_info_day whereid=10;id | add_time----+---------------------10 |2020-03-12 00:00:00(1 row)postgres=# \d+qsump_pacloud_oscginfo_activity_detail_info_day_10Table "public.qsump_pacloud_oscginfo_activity_detail_info_day_10"Column | Type | Modifiers |Storage | Stats target | Description----------+-----------------------------+-----------+---------+--------------+-------------id | integer | not null | plain | |add_time |timestamp without time zone | not null |plain | |Indexes:"qsump_pacloud_oscginfo_activity_detail_info_day_10_pkey"PRIMARY KEY,btree (id)Check constraints:"pm_qsump_pacloud_oscginfo_activity_detail_info_day_10_check"CHECK(add_time >= '2020-03-04 00:00:00'::timestamp without time zone ANDadd_time <'2020-03-11 00:00:00'::timestamp without time zone)Inherits:qsump_pacloud_oscginfo_activity_detail_info_daypostgres=#select char_length('pm_qsump_pacloud_oscginfo_activity_detail_info_day_10_check');char_length-------------59(1 row)postgres=#
五、GDB调试
Session1:
建立一个连接,查询pid
[postgres@postgres_zabbix~]$ psqlpsql (9.6.6)Type"help" for help.postgres=#select pg_backend_pid();pg_backend_pid----------------4240
Session2:
gdb调试session
[postgres@postgres_zabbix~]$ gdbGNU gdb (GDB)Red Hat Enterprise Linux 7.6.1-114.el7Copyright (C)2013 Free Software Foundation, Inc.License GPLv3+:GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>This is freesoftware: you are free to change and redistribute it.There is NOWARRANTY, to the extent permitted by law. Type "show copying"and "showwarranty" for details.This GDB wasconfigured as "x86_64-redhat-linux-gnu".For bugreporting instructions, please see:<http://www.gnu.org/software/gdb/bugs/>.(gdb) attach 4240 attachsession 1Attaching toprocess 4240Reading symbolsfrom /home/postgres/postgresql-9.6.6/pg9debug/bin/postgres...done.Reading symbolsfrom /lib64/librt.so.1...(no debugging symbols found)...done.Loaded symbolsfor /lib64/librt.so.1Reading symbolsfrom /lib64/libdl.so.2...(no debugging symbols found)...done.Loaded symbols for/lib64/libdl.so.2Reading symbolsfrom /lib64/libm.so.6...(no debugging symbols found)...done.Loaded symbolsfor /lib64/libm.so.6Reading symbolsfrom /lib64/libc.so.6...(no debugging symbols found)...done.Loaded symbolsfor /lib64/libc.so.6Reading symbolsfrom /lib64/libpthread.so.0...(no debugging symbols found)...done.[Threaddebugging using libthread_db enabled]Using hostlibthread_db library "/lib64/libthread_db.so.1".Loaded symbolsfor /lib64/libpthread.so.0Reading symbolsfrom /lib64/ld-linux-x86-64.so.2...(no debugging symbols found)...done.Loaded symbolsfor /lib64/ld-linux-x86-64.so.2Reading symbolsfrom /home/postgres/postgresql-9.6.6/pg9debug/lib/pg_pathman.so...done.Loaded symbolsfor /home/postgres/postgresql-9.6.6/pg9debug/lib/pg_pathman.soReading symbolsfrom /lib64/libnss_files.so.2...(no debugging symbols found)...done.Loaded symbolsfor /lib64/libnss_files.so.20x00007f995cd6a5e3in __epoll_wait_nocancel () from /lib64/libc.so.6Missing separatedebuginfos, use: debuginfo-install glibc-2.17-260.el7_6.6.x86_64(gdb) b get_partition_constraint_expr //设置断点Breakpoint 1 at0x7f99562fc3f0: file src/relation_info.c, line 1283.(gdb) b get_relation_constraint_oid //设置断点Breakpoint 2 at0x544991: file pg_constraint.c, line 771.
Session2:
postgres=# select* from qsump_pacloud_oscginfo_activity_detail_info_daywhere id=10; //执行查询
(gdb) n //开始调试//省略了一些过程,从第一个分区表开始Breakpoint 1,get_partition_constraint_expr (partition=16589) atsrc/relation_info.c:12831283 conname =build_check_constraint_name_relid_internal(partition);(gdb)1284 conid =get_relation_constraint_oid(partition, conname, true);(gdb)Breakpoint 2, get_relation_constraint_oid(relid=16589,conname=0x13997e8"pathman_qsump_pacloud_oscginfo_activity_detail_info_day_1_check",missing_ok=1 '\001') at pg_constraint.c:771 //传入拼接的conname771 Oid conOid = InvalidOid;(gdb)778 pg_constraint =heap_open(ConstraintRelationId, AccessShareLock);(gdb)780 ScanKeyInit(&skey[0],(gdb)785 scan =systable_beginscan(pg_constraint, ConstraintRelidIndexId, true,(gdb)788 while (HeapTupleIsValid(tuple =systable_getnext(scan)))(gdb)790 Form_pg_constraint con =(Form_pg_constraint)GETSTRUCT(tuple); //获取pg_constraint中的数据(gdb)(gdb) n792 if(strcmp(NameStr(con->conname), conname) == 0)(gdb) p *con$2 = {conname ={data ="pathman_qsump_pacloud_oscginfo_activity_detail_info_day_1_check"},connamespace = 2200, contype = 99 'c', condeferrable = 0 '\000', condeferred =0 '\000', convalidated = 1'\001', conrelid = 16589, contypid = 0, conindid = 0,confrelid = 0, confupdtype = 32 ' ',confdeltype = 32 ' ', confmatchtype = 32' ', conislocal = 1 '\001', coninhcount= 0,connoinherit = 0 '\000'} //pg_constraint中的conname和查询时拼接的一致(gdb) n794 if(OidIsValid(conOid))(gdb) n799 conOid =HeapTupleGetOid(tuple);(gdb)788 while (HeapTupleIsValid(tuple =systable_getnext(scan)))(gdb)790 Form_pg_constraint con =(Form_pg_constraint)GETSTRUCT(tuple);(gdb)792 if(strcmp(NameStr(con->conname), conname) == 0)(gdb)788 while (HeapTupleIsValid(tuple =systable_getnext(scan)))(gdb)803 systable_endscan(scan);(gdb)806 if (!OidIsValid(conOid) &&!missing_ok)(gdb)812 heap_close(pg_constraint,AccessShareLock);(gdb)814 return conOid;(gdb) p conOid$5 = 16788 //返回的conoid为16788//省略一些过程,直接看报错的这个分区表(gdb)Breakpoint 1,get_partition_constraint_expr (partition=16643) atsrc/relation_info.c:12831283 conname =build_check_constraint_name_relid_internal(partition);(gdb)1284 conid =get_relation_constraint_oid(partition, conname, true);(gdb)Breakpoint 2, get_relation_constraint_oid(relid=16643,conname=0x13997e8"pathman_qsump_pacloud_oscginfo_activity_detail_info_day_10_check",missing_ok=1 '\001') at pg_constraint.c:771 //传入拼接的conname771 Oid conOid = InvalidOid;(gdb)778 pg_constraint =heap_open(ConstraintRelationId, AccessShareLock);(gdb)780 ScanKeyInit(&skey[0],(gdb)785 scan =systable_beginscan(pg_constraint, ConstraintRelidIndexId, true,(gdb)788 while (HeapTupleIsValid(tuple =systable_getnext(scan)))(gdb)790 Form_pg_constraint con =(Form_pg_constraint)GETSTRUCT(tuple); //读取pg_constraint的数据(gdb)792 if(strcmp(NameStr(con->conname), conname) == 0)(gdb) p *con$4 = {conname ={data ="pathman_qsump_pacloud_oscginfo_activity_detail_info_day_10_chec"},connamespace = 2200, contype = 99 'c', condeferrable = 0 '\000', condeferred =0 '\000', convalidated = 1'\001', conrelid = 16643, contypid = 0, conindid = 0,confrelid = 0, confupdtype = 32 ' ',confdeltype = 32 ' ', confmatchtype = 32' ', conislocal = 1 '\001',coninhcount = 0,connoinherit = 0 '\000'}//pg_constraint中读取的conname和查询时拼接的不同(gdb) n788 while (HeapTupleIsValid(tuple =systable_getnext(scan)))(gdb)790 Form_pg_constraint con =(Form_pg_constraint)GETSTRUCT(tuple);(gdb)792 if(strcmp(NameStr(con->conname), conname) == 0)(gdb)788 while (HeapTupleIsValid(tuple =systable_getnext(scan)))(gdb)803 systable_endscan(scan);(gdb)806 if (!OidIsValid(conOid) &&!missing_ok)(gdb)812 heap_close(pg_constraint,AccessShareLock);(gdb)814 return conOid;(gdb) p conOid$5 = 0 //因此返回的conoid为0即InvalidOid(gdb) n815 }(gdb)
ERROR: constraint"pathman_qsump_pacloud_oscginfo_activity_detail_info_day_10_check"of partition"qsump_pacloud_oscginfo_activity_detail_info_day_10" does not existHINT: pg_pathman will be disabled to allow you toresolve this issuepostgres=#
关于我们
中国开源软件推进联盟PostgreSQL分会(简称:PG分会)于2017年成立,由国内多家PG生态企业所共同发起,业务上接受工信部产业发展研究院指导。PG分会致力于构建PG产业生态,推动PG产学研用发展。





