PG 函数：逐行处理衍生重复计算、非预期返回结果

刘洪 2023-08-18

292

PG中的函数处理是逐行处理的，会衍生出重复计算、非预期返回结果的问题，使用时需要注意：

mysql、达梦等也有类似问题

测试表（200W数据）

drop table t1;
create table t1(id number primary key,msg1 varchar(3),dt datetime,msg3 varchar(30),msg4 varchar(30),msg5 varchar(30),msg6 varchar(30));

\set SQLTERM /
begin
for i in 1..200 loop
insert into t1 
 select (i-1)*10000+rownum,DBMS_RANDOM.string('A',3),current_timestamp-((i-1)*10000+rownum)*1.0/1440,DBMS_RANDOM.string('A',20),DBMS_RANDOM.string('A',20),DBMS_RANDOM.string('A',20),DBMS_RANDOM.string('A',20) 
 from dual connect by rownum<=10000;
commit;
end loop;
end;
/
\set SQLTERM ;

create index idx_t1_msg on t1(msg1);
create index idx_t1_dt on t1(dt);

逐行计算导致非意料结果：

函数逐行计算，消耗资源放大

-- 基本函数
-- 在等式左边使用函数：  
-- 每行计算一次:to_char()运算 执行了200W次
select * from t1 where to_char(dt,'yyyy-mm-dd')='2023-08-18'
-- 对每一行计算一次，则等价于
select * from (select t1.*,to_char(dt,'yyyy-mm-dd') fun_value from t1) where fun_value='2023-08-18'


-- 在等式右边使用函数：  每一行仍然会计算一次
-- 对下面的查询，等式右边并不是执行1次，而是执行200W次，结果就是
select * from t1 where dt=add_months(current_timestamp,-12)
-- 对每一行计算一次，则等价于
select * from (select t1.*,add_months(current_timestamp,-12) fun_value from t1) where dt=fun_value

主键等值查询，返回多个值

-- 随机函数、自定义函数默认VOLATILE属性时，在条件右边不能使用索引
-- 自定义函数需要特别注意
select id from t1 where id=round(random()*2000000,0)::integer;
   id    
---------
  220924
 1853722
(2 rows)

-- 如果对于以上返回多个值和行为感到迷惑，改写之后(逐行处理)将会理解生产结果的原因
-- 每一行都计算一次，则等价于
test=# select id from ( select id,round(random()*2000000,0)::integer r_id from t1 ) where id=r_id;
   id    
---------
   36155
 1773445
(2 rows)

优化

select t1.* from t1,(select round(random()*2000000,0)::integer random_value) tmp where id=random_value;
                                                     QUERY PLAN                                                     
--------------------------------------------------------------------------------------------------------------------
 Nested Loop  (cost=0.30..8.36 rows=1 width=102) (actual time=0.120..0.121 rows=1 loops=1)
   ->  Result  (cost=0.00..0.02 rows=1 width=4) (actual time=0.019..0.020 rows=1 loops=1)
   ->  Index Scan using t1_pkey on t1  (cost=0.30..8.32 rows=1 width=102) (actual time=0.094..0.094 rows=1 loops=1)
         Index Cond: (id = (((round(((random() * '2000000'::double precision))::numeric, 0))::integer))::numeric)
 Planning Time: 0.235 ms
 Execution Time: 0.154 ms
(6 rows)

重复计算，-- 在有索引的情况下的重复计算

-- 确定值-不走索引，只需要376毫秒
-- 使用函数，需要480.611毫秒
-- 尝试优化: 物化使函数转为确定变量  801.455毫秒

-- 确定值-不走索引，只需要376毫秒
explain analyze select * from t1 where dt between '2019-10-28 16:31:34' and '2023-08-19 13:49:10';  
                                                                                             QUERY PLAN                                                                        
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Seq Scan on t1  (cost=0.00..38584.00 rows=2000000 width=102) (actual time=0.025..376.370 rows=2000000 loops=1)
   Filter: (((dt)::timestamp without time zone >= '2019-10-28 16:31:34'::timestamp without time zone) AND ((dt)::timestamp without time zone <= '2023-08-19 13:49:10'::timestam
p without time zone))
 Planning Time: 0.243 ms
 Execution Time: 456.570 ms
(4 rows)


-- 使用函数，需要480.611毫秒
explain analyze select * from t1 where dt between add_months(current_timestamp,-72) and add_months(current_timestamp,1);
                                                                                                                   QUERY PLAN                                                  
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Index Scan using idx_t1_dt on t1  (cost=0.93..54047.93 rows=2000000 width=102) (actual time=0.156..480.611 rows=2000000 loops=1)
   Index Cond: (((dt)::timestamp without time zone >= (add_months(CURRENT_TIMESTAMP, '-72'::integer))::timestamp without time zone) AND ((dt)::timestamp without time zone <= (
add_months(CURRENT_TIMESTAMP, 1))::timestamp without time zone))
 Planning Time: 0.474 ms
 Execution Time: 557.478 ms
(4 rows)


-- 在子查询展开的情况下优化不效，必须要使用物化
explain analyze
select * from t1,(select add_months(current_timestamp,-72)::datetime start_time,add_months(current_timestamp,1)::datetime end_time ) tmp
 where dt between start_time and end_time;
                                                                                                                      QUERY PLAN                                               
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Gather  (cost=1000.94..495047.94 rows=2000000 width=118) (actual time=0.556..13701.358 rows=2000000 loops=1)
   Workers Planned: 4
   Workers Launched: 0
   ->  Parallel Index Scan using idx_t1_dt on t1  (cost=0.94..294047.94 rows=500000 width=118) (actual time=0.281..13509.406 rows=2000000 loops=1)
         Index Cond: (((dt)::timestamp without time zone >= (add_months(CURRENT_TIMESTAMP, '-72'::integer))::timestamp without time zone) AND ((dt)::timestamp without time zon
e <= (add_months(CURRENT_TIMESTAMP, 1))::timestamp without time zone))
 Planning Time: 0.544 ms
 Execution Time: 13808.107 ms
(7 rows)


-- 尝试优化: 物化使函数转为确定变量  801.455毫秒
explain analyze
with tmp as MATERIALIZED  (select add_months(current_timestamp,-72)::datetime start_time,add_months(current_timestamp,1)::datetime end_time 
) select * from t1,tmp where dt between start_time and end_time;

                                                                                            QUERY PLAN                                                                         
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Nested Loop  (cost=0.95..8232.63 rows=222222 width=118) (actual time=0.216..801.455 rows=2000000 loops=1)
   CTE tmp
     ->  Result  (cost=0.00..0.52 rows=1 width=16) (actual time=0.122..0.122 rows=1 loops=1)
   ->  CTE Scan on tmp  (cost=0.00..0.02 rows=1 width=16) (actual time=0.124..0.126 rows=1 loops=1)
   ->  Index Scan using idx_t1_dt on t1  (cost=0.43..6009.87 rows=222222 width=102) (actual time=0.086..413.187 rows=2000000 loops=1)
         Index Cond: (((dt)::timestamp without time zone >= (tmp.start_time)::timestamp without time zone) AND ((dt)::timestamp without time zone <= (tmp.end_time)::timestamp 
without time zone))
 Planning Time: 0.270 ms
 Execution Time: 881.570 ms
(8 rows)

重复计算，无索引的全扫描情况下-导致性能问题

-- 确定值，只需要374.263毫秒
-- 使用函数，需要13秒
-- 尝试优化: 物化使函数转为确定变量  684.215毫秒

drop index idx_t1_dt;

-- 确定值，只需要374.263毫秒
explain analyze select * from t1 where dt between '2019-10-28 16:31:34' and '2023-08-19 13:49:10';  
                                                                                             QUERY PLAN                                                                        
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Seq Scan on t1  (cost=0.00..38584.00 rows=1999600 width=102) (actual time=0.029..374.263 rows=2000000 loops=1)
   Filter: (((dt)::timestamp without time zone >= '2019-10-28 16:31:34'::timestamp without time zone) AND ((dt)::timestamp without time zone <= '2023-08-19 13:49:10'::timestam
p without time zone))
 Planning Time: 0.141 ms
 Execution Time: 453.942 ms
(4 rows)

-- 使用函数，需要13秒
explain analyze select * from t1 where dt between add_months(current_timestamp,-72) and add_months(current_timestamp,1);
test=# explain analyze select * from t1 where dt between add_months(current_timestamp,-72) and add_months(current_timestamp,1);
                                                                                                                    QUERY PLAN                                                 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Gather  (cost=1000.00..469544.00 rows=1999600 width=102) (actual time=0.400..12975.549 rows=2000000 loops=1)
   Workers Planned: 4
   Workers Launched: 0
   ->  Parallel Seq Scan on t1  (cost=0.00..268584.00 rows=499900 width=102) (actual time=0.153..12779.753 rows=2000000 loops=1)
         Filter: (((dt)::timestamp without time zone >= (add_months(CURRENT_TIMESTAMP, '-72'::integer))::timestamp without time zone) AND ((dt)::timestamp without time zone <=
 (add_months(CURRENT_TIMESTAMP, 1))::timestamp without time zone))
 Planning Time: 0.344 ms
 Execution Time: 13077.545 ms
(7 rows)


-- 尝试优化: 物化使函数转为确定变量  684.215毫秒
explain analyze
with tmp as MATERIALIZED  (select add_months(current_timestamp,-72)::datetime start_time,add_months(current_timestamp,1)::datetime end_time 
) select * from t1,tmp where dt between start_time and end_time;

                                                                                             QUERY PLAN                                                                        
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Nested Loop  (cost=0.52..58584.54 rows=222222 width=118) (actual time=0.175..684.215 rows=2000000 loops=1)
   Join Filter: (((t1.dt)::timestamp without time zone >= (tmp.start_time)::timestamp without time zone) AND ((t1.dt)::timestamp without time zone <= (tmp.end_time)::timestamp
 without time zone))
   CTE tmp
     ->  Result  (cost=0.00..0.52 rows=1 width=16) (actual time=0.139..0.140 rows=1 loops=1)
   ->  CTE Scan on tmp  (cost=0.00..0.02 rows=1 width=16) (actual time=0.142..0.144 rows=1 loops=1)
   ->  Seq Scan on t1  (cost=0.00..28584.00 rows=2000000 width=102) (actual time=0.028..189.824 rows=2000000 loops=1)
 Planning Time: 0.255 ms
 Execution Time: 761.686 ms
(8 rows)

pg函数函数逐行处理函数优化

最后修改时间：2023-08-22 11:28:19

「喜欢这篇文章，您的关注和赞赏是给作者最好的鼓励」

关注作者

PG 函数：逐行处理衍生重复计算、非预期返回结果

测试表（200W数据）

逐行计算导致非意料结果：

重复计算，-- 在有索引的情况下的重复计算

重复计算，无索引的全扫描情况下-导致性能问题

评论