作者
digoal
日期
2021-09-28
标签
PostgreSQL , GiST , 排序 , 过滤
1、产品的问题点
- PG GiST距离排序操作符和过滤无法同时使用索引
2、问题点背后涉及的技术原理
- PG GiST索引支持空间距离排序, 例如按距离某个经纬度点的距离排序, 返回表里面的经纬度点.
- PG 支持距离操作符, 支持排序功能, 同时支持返回距离值.
- 但是距离过滤不能使用索引
例子:
- 《PostgreSQL GiST Order by 距离 + 距离范围判定 + limit 骤变优化与背景原因》
create extension btree_gist;
postgres=# \do
List of operators
Schema | Name | Left arg type | Right arg type | Result type | Description
--------+------+-----------------------------+-----------------------------+------------------+-------------
public | <-> | bigint | bigint | bigint |
public | <-> | date | date | integer |
public | <-> | double precision | double precision | double precision |
public | <-> | integer | integer | integer |
public | <-> | interval | interval | interval |
public | <-> | money | money | money |
public | <-> | oid | oid | oid |
public | <-> | real | real | real |
public | <-> | smallint | smallint | smallint |
public | <-> | time without time zone | time without time zone | interval |
public | <-> | timestamp with time zone | timestamp with time zone | interval |
public | <-> | timestamp without time zone | timestamp without time zone | interval |
create table t_age(id int, age int);
insert into t_age select generate_series(1,10000000), random()*120;
create index idx_t_age_1 on t_age using gist (age);
select * from t_age
where
(age <-> 25) < 1
order by age <-> 25
limit 100000;
Limit (cost=0.42..10245.61 rows=100000 width=12) (actual time=0.161..8126.988 rows=83248 loops=1)
Output: id, age, ((age <-> 25))
Buffers: shared hit=9523157
-> Index Scan using idx_t_age_1 on public.t_age (cost=0.42..341506.11 rows=3333326 width=12) (actual time=0.160..8115.150 rows=83248 loops=1)
Output: id, age, (age <-> 25)
Order By: (t_age.age <-> 25)
Filter: ((t_age.age <-> 25) < 1)
Rows Removed by Filter: 9916752
Buffers: shared hit=9523157
Planning Time: 0.077 ms
Execution Time: 8133.808 ms
(11 rows)
postgres=# set enable_seqscan=off;
SET
postgres=# explain select * from t_age where (age <-> 25) <1 limit 100000;
QUERY PLAN
-------------------------------------------------------------------------------------
Limit (cost=10000000000.00..10000005827.44 rows=100000 width=8)
-> Seq Scan on t_age (cost=10000000000.00..10000194247.66 rows=3333326 width=8)
Filter: ((age <-> 25) < 1)
(3 rows)
create extension postgis;
create table t_pos(
id int primary key,
pos geometry
);
insert into t_pos
select * from (
select id,
ST_SetSRID(
ST_Point( round((random()*(135.085831-73.406586)+73.406586)::numeric,6),
round((random()*(53.880950-3.408477)+3.408477)::numeric,6)
),
4326
) as pos
from generate_series(1,1000000000) t(id)
) t
order by st_geohash(pos,15);
create index idx_t_pos_1 on t_pos using gist(pos);
select *,
st_distancespheroid(pos, st_setsrid(st_makepoint(120,50),4326), 'SPHEROID["WGS84",6378137,298.257223563]') as dist
from t_pos
where
st_distancespheroid(pos, st_setsrid(st_makepoint(120,50),4326), 'SPHEROID["WGS84",6378137,298.257223563]') < 5000
order by pos <-> st_setsrid(st_makepoint(120,50),4326)
limit 100;
骤变发生在limit超过所有符合条件的记录数时.
postgres=# explain analyze select * from t_age where (age <-> 25) < 1 order by age <-> 25 limit 825;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------
Limit (cost=0.28..80.98 rows=825 width=12) (actual time=0.074..40.244 rows=824 loops=1)
-> Index Scan using idx_t_age_1 on t_age (cost=0.28..3260.91 rows=33333 width=12) (actual time=0.073..40.119 rows=824 loops=1)
Order By: (age <-> 25)
Filter: ((age <-> 25) < 1)
Rows Removed by Filter: 99176
Planning Time: 0.068 ms
Execution Time: 40.340 ms
(7 rows)
postgres=# explain analyze select * from t_age where (age <-> 25) < 1 order by age <-> 25 limit 824;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------
Limit (cost=0.28..80.88 rows=824 width=12) (actual time=0.075..0.798 rows=824 loops=1)
-> Index Scan using idx_t_age_1 on t_age (cost=0.28..3260.91 rows=33333 width=12) (actual time=0.074..0.696 rows=824 loops=1)
Order By: (age <-> 25)
Filter: ((age <-> 25) < 1)
Planning Time: 0.066 ms
Execution Time: 0.880 ms
(6 rows)
因为排序它不懂距离value, 只知道距离是越来越大(即使排序和计算的操作符一模一样).
3、这个问题将影响哪些行业以及业务场景
- 影响最大的时基于地理位置的互联网业务, 例如社交、O2O、出行等
4、会导致什么问题?
- 当需要搜索附近的N个点, 并且距离M以内的双重需求时, 不能同时使用1个索引来满足. (要么只能用于排序, 要么只能用于where过滤)
- 需要额外的filter计算, 如果满足条件(距离M以内)的记录数不足N条, 则导致扫描整个索引, 性能急剧下降.
5、业务上应该如何避免这个坑
- 可以使用function来解决这个问题, 每次filter判定是否已越界.
- 《PostgreSQL GiST Order by 距离 + 距离范围判定 + limit 骤变优化与背景原因》
6、业务上避免这个坑牺牲了什么, 会引入什么新的问题
- 需要自定义函数, 开发成本增加.
7、数据库未来产品迭代如何修复这个坑
- 希望能直接在内核层面支持, 同一个index既能支持按距离过滤又能支持按距离排序输出.
PostgreSQL 许愿链接
您的愿望将传达给PG kernel hacker、数据库厂商等, 帮助提高数据库产品质量和功能, 说不定下一个PG版本就有您提出的功能点. 针对非常好的提议,奖励限量版PG文化衫、纪念品、贴纸、PG热门书籍等,奖品丰富,快来许愿。开不开森.
9.9元购买3个月阿里云RDS PostgreSQL实例
PostgreSQL 解决方案集合
德哥 / digoal's github - 公益是一辈子的事.





