PG 函数:逐行处理衍生重复计算、非预期返回结果
PG中的函数处理是逐行处理的,会衍生出重复计算、非预期返回结果的问题,使用时需要注意:
mysql、达梦等也有类似问题
测试表(200W数据)
drop table t1;
create table t1(id number primary key,msg1 varchar(3),dt datetime,msg3 varchar(30),msg4 varchar(30),msg5 varchar(30),msg6 varchar(30));
\set SQLTERM /
begin
for i in 1..200 loop
insert into t1
select (i-1)*10000+rownum,DBMS_RANDOM.string('A',3),current_timestamp-((i-1)*10000+rownum)*1.0/1440,DBMS_RANDOM.string('A',20),DBMS_RANDOM.string('A',20),DBMS_RANDOM.string('A',20),DBMS_RANDOM.string('A',20)
from dual connect by rownum<=10000;
commit;
end loop;
end;
/
\set SQLTERM ;
create index idx_t1_msg on t1(msg1);
create index idx_t1_dt on t1(dt);
逐行计算导致非意料结果:
函数逐行计算,消耗资源放大
-- 基本函数
-- 在等式左边使用函数:
-- 每行计算一次:to_char()运算 执行了200W次
select * from t1 where to_char(dt,'yyyy-mm-dd')='2023-08-18'
-- 对每一行计算一次,则等价于
select * from (select t1.*,to_char(dt,'yyyy-mm-dd') fun_value from t1) where fun_value='2023-08-18'
-- 在等式右边使用函数: 每一行仍然会计算一次
-- 对下面的查询,等式右边并不是执行1次,而是执行200W次,结果就是
select * from t1 where dt=add_months(current_timestamp,-12)
-- 对每一行计算一次,则等价于
select * from (select t1.*,add_months(current_timestamp,-12) fun_value from t1) where dt=fun_value
主键等值查询,返回多个值
-- 随机函数、自定义函数默认VOLATILE属性时,在条件右边不能使用索引
-- 自定义函数需要特别注意
select id from t1 where id=round(random()*2000000,0)::integer;
id
---------
220924
1853722
(2 rows)
-- 如果对于以上返回多个值和行为感到迷惑,改写之后(逐行处理)将会理解生产结果的原因
-- 每一行都计算一次,则等价于
test=# select id from ( select id,round(random()*2000000,0)::integer r_id from t1 ) where id=r_id;
id
---------
36155
1773445
(2 rows)
优化
select t1.* from t1,(select round(random()*2000000,0)::integer random_value) tmp where id=random_value;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------
Nested Loop (cost=0.30..8.36 rows=1 width=102) (actual time=0.120..0.121 rows=1 loops=1)
-> Result (cost=0.00..0.02 rows=1 width=4) (actual time=0.019..0.020 rows=1 loops=1)
-> Index Scan using t1_pkey on t1 (cost=0.30..8.32 rows=1 width=102) (actual time=0.094..0.094 rows=1 loops=1)
Index Cond: (id = (((round(((random() * '2000000'::double precision))::numeric, 0))::integer))::numeric)
Planning Time: 0.235 ms
Execution Time: 0.154 ms
(6 rows)
重复计算,-- 在有索引的情况下的重复计算
-- 确定值-不走索引,只需要376毫秒
-- 使用函数,需要480.611毫秒
-- 尝试优化: 物化使函数转为确定变量 801.455毫秒
-- 确定值-不走索引,只需要376毫秒
explain analyze select * from t1 where dt between '2019-10-28 16:31:34' and '2023-08-19 13:49:10';
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Seq Scan on t1 (cost=0.00..38584.00 rows=2000000 width=102) (actual time=0.025..376.370 rows=2000000 loops=1)
Filter: (((dt)::timestamp without time zone >= '2019-10-28 16:31:34'::timestamp without time zone) AND ((dt)::timestamp without time zone <= '2023-08-19 13:49:10'::timestam
p without time zone))
Planning Time: 0.243 ms
Execution Time: 456.570 ms
(4 rows)
-- 使用函数,需要480.611毫秒
explain analyze select * from t1 where dt between add_months(current_timestamp,-72) and add_months(current_timestamp,1);
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Index Scan using idx_t1_dt on t1 (cost=0.93..54047.93 rows=2000000 width=102) (actual time=0.156..480.611 rows=2000000 loops=1)
Index Cond: (((dt)::timestamp without time zone >= (add_months(CURRENT_TIMESTAMP, '-72'::integer))::timestamp without time zone) AND ((dt)::timestamp without time zone <= (
add_months(CURRENT_TIMESTAMP, 1))::timestamp without time zone))
Planning Time: 0.474 ms
Execution Time: 557.478 ms
(4 rows)
-- 在子查询展开的情况下优化不效,必须要使用物化
explain analyze
select * from t1,(select add_months(current_timestamp,-72)::datetime start_time,add_months(current_timestamp,1)::datetime end_time ) tmp
where dt between start_time and end_time;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather (cost=1000.94..495047.94 rows=2000000 width=118) (actual time=0.556..13701.358 rows=2000000 loops=1)
Workers Planned: 4
Workers Launched: 0
-> Parallel Index Scan using idx_t1_dt on t1 (cost=0.94..294047.94 rows=500000 width=118) (actual time=0.281..13509.406 rows=2000000 loops=1)
Index Cond: (((dt)::timestamp without time zone >= (add_months(CURRENT_TIMESTAMP, '-72'::integer))::timestamp without time zone) AND ((dt)::timestamp without time zon
e <= (add_months(CURRENT_TIMESTAMP, 1))::timestamp without time zone))
Planning Time: 0.544 ms
Execution Time: 13808.107 ms
(7 rows)
-- 尝试优化: 物化使函数转为确定变量 801.455毫秒
explain analyze
with tmp as MATERIALIZED (select add_months(current_timestamp,-72)::datetime start_time,add_months(current_timestamp,1)::datetime end_time
) select * from t1,tmp where dt between start_time and end_time;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Nested Loop (cost=0.95..8232.63 rows=222222 width=118) (actual time=0.216..801.455 rows=2000000 loops=1)
CTE tmp
-> Result (cost=0.00..0.52 rows=1 width=16) (actual time=0.122..0.122 rows=1 loops=1)
-> CTE Scan on tmp (cost=0.00..0.02 rows=1 width=16) (actual time=0.124..0.126 rows=1 loops=1)
-> Index Scan using idx_t1_dt on t1 (cost=0.43..6009.87 rows=222222 width=102) (actual time=0.086..413.187 rows=2000000 loops=1)
Index Cond: (((dt)::timestamp without time zone >= (tmp.start_time)::timestamp without time zone) AND ((dt)::timestamp without time zone <= (tmp.end_time)::timestamp
without time zone))
Planning Time: 0.270 ms
Execution Time: 881.570 ms
(8 rows)
重复计算,无索引的全扫描情况下-导致性能问题
-- 确定值,只需要374.263毫秒
-- 使用函数,需要13秒
-- 尝试优化: 物化使函数转为确定变量 684.215毫秒
drop index idx_t1_dt;
-- 确定值,只需要374.263毫秒
explain analyze select * from t1 where dt between '2019-10-28 16:31:34' and '2023-08-19 13:49:10';
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Seq Scan on t1 (cost=0.00..38584.00 rows=1999600 width=102) (actual time=0.029..374.263 rows=2000000 loops=1)
Filter: (((dt)::timestamp without time zone >= '2019-10-28 16:31:34'::timestamp without time zone) AND ((dt)::timestamp without time zone <= '2023-08-19 13:49:10'::timestam
p without time zone))
Planning Time: 0.141 ms
Execution Time: 453.942 ms
(4 rows)
-- 使用函数,需要13秒
explain analyze select * from t1 where dt between add_months(current_timestamp,-72) and add_months(current_timestamp,1);
test=# explain analyze select * from t1 where dt between add_months(current_timestamp,-72) and add_months(current_timestamp,1);
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather (cost=1000.00..469544.00 rows=1999600 width=102) (actual time=0.400..12975.549 rows=2000000 loops=1)
Workers Planned: 4
Workers Launched: 0
-> Parallel Seq Scan on t1 (cost=0.00..268584.00 rows=499900 width=102) (actual time=0.153..12779.753 rows=2000000 loops=1)
Filter: (((dt)::timestamp without time zone >= (add_months(CURRENT_TIMESTAMP, '-72'::integer))::timestamp without time zone) AND ((dt)::timestamp without time zone <=
(add_months(CURRENT_TIMESTAMP, 1))::timestamp without time zone))
Planning Time: 0.344 ms
Execution Time: 13077.545 ms
(7 rows)
-- 尝试优化: 物化使函数转为确定变量 684.215毫秒
explain analyze
with tmp as MATERIALIZED (select add_months(current_timestamp,-72)::datetime start_time,add_months(current_timestamp,1)::datetime end_time
) select * from t1,tmp where dt between start_time and end_time;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Nested Loop (cost=0.52..58584.54 rows=222222 width=118) (actual time=0.175..684.215 rows=2000000 loops=1)
Join Filter: (((t1.dt)::timestamp without time zone >= (tmp.start_time)::timestamp without time zone) AND ((t1.dt)::timestamp without time zone <= (tmp.end_time)::timestamp
without time zone))
CTE tmp
-> Result (cost=0.00..0.52 rows=1 width=16) (actual time=0.139..0.140 rows=1 loops=1)
-> CTE Scan on tmp (cost=0.00..0.02 rows=1 width=16) (actual time=0.142..0.144 rows=1 loops=1)
-> Seq Scan on t1 (cost=0.00..28584.00 rows=2000000 width=102) (actual time=0.028..189.824 rows=2000000 loops=1)
Planning Time: 0.255 ms
Execution Time: 761.686 ms
(8 rows)
最后修改时间:2023-08-22 11:28:19
「喜欢这篇文章,您的关注和赞赏是给作者最好的鼓励」
关注作者
【版权声明】本文为墨天轮用户原创内容,转载时必须标注文章的来源(墨天轮),文章链接,文章作者等基本信息,否则作者和墨天轮有权追究责任。如果您发现墨天轮中有涉嫌抄袭或者侵权的内容,欢迎发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。




