(openGauss每日一练第 20天)：openGauss全文检索

原创 junzibuyuantian 恩墨学院 2021-12-20

493

1.tsvector

--使用gsql登录openGauss
root@modb:~# su - omm
omm@modb:~$ gsql -r
gsql ((openGauss 2.0.0 build 78689da9) compiled at 2021-03-31 21:03:52 commit 0 last mr  )
Non-SSL connection (SSL connection is recommended when requiring high-security)
Type "help" for help.

omm=# 

--把一个字符串按照空格进行分词，分词的顺序是按照长短和字母排序的, 自动去掉分词中重复的词条
omm=# SELECT 'The Fat Rats'::tsvector;
      tsvector      
--------------------
 'Fat' 'Rats' 'The'
(1 row)

--词条位置常量也可以放到词汇中
omm=# SELECT 'a:1 fat:2 cat:3 sat:4 on:5 a:6 mat:7 and:8 ate:9 a:10 fat:11 rat:12'::tsvector;
                                   tsvector                                    
-------------------------------------------------------------------------------
 'a':1,6,10 'and':8 'ate':9 'cat':3 'fat':2,11 'mat':7 'on':5 'rat':12 'sat':4
(1 row)


--拥有位置的词汇甚至可以用一个权来标记，反映文档结构，这个权可以是A，B，C或D。默认的是D，因此输出中不会出现
omm=# SELECT 'a:1A fat:2B,4C cat:5D'::tsvector;
          tsvector          
----------------------------
 'a':1A 'cat':5 'fat':2B,4C
(1 row)

--to_tsvector函数对这些单词进行规范化处理, 罗列出词条并连同它们文档中的位置
omm=# SELECT to_tsvector('english', 'The Fat Rats');
   to_tsvector   
-----------------
 'fat':2 'rat':3
(1 row)

2.tsquery

omm=# SELECT 'fat & rat'::tsquery;
     tsquery    
---------------
 'fat' & 'rat'
(1 row)

--规范化转为tsquery类型
omm=# SELECT to_tsquery('Fat:ab & Cats');
     to_tsquery    
------------------
 'fat':AB & 'cat'
(1 row)

3.基本文本匹配

--全文检索基于匹配算子@@，当一个tsvector匹配到一个tsquery时，则返回true, tsvector和tsquery两种数据类型可以任意排序。
omm=# SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'cat & rat'::tsquery AS RESULT;
 result 
--------
 t
(1 row)


#### omm=# SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::tsvector AS RESULT;
 result 
--------
 f
(1 row)

--to_tsvector和to_tsquery标准化处理
omm=# SELECT to_tsvector('fat cats ate fat rats') @@ to_tsquery('fat & rat') AS RESULT; 
 result LECT to_tsvector('fat cats ate fat rats') @@ to_tsquery('fat & rat') AS RESULT;
 --------
 t
(1 row)

omm=# SELECT to_tsvector('fat cats ate fat rats') @@ to_tsquery('fat & cow') AS RESULT; 
 result LECT to_tsvector('fat cats ate fat rats') @@ to_tsquery('fat & cow') AS RESULT;
 --------
 f
(1 row)

4.分词器

--查看所有分词器
omm=# \dF
               List of text search configurations
   Schema   |    Name    |              Description              
------------+------------+---------------------------------------
 pg_catalog | danish     | configuration for danish language
 pg_catalog | dutch      | configuration for dutch language
 pg_catalog | english    | configuration for english language
 pg_catalog | finnish    | configuration for finnish language
 pg_catalog | french     | configuration for french language
 pg_catalog | german     | configuration for german language
 pg_catalog | hungarian  | configuration for hungarian language
 pg_catalog | italian    | configuration for italian language
 pg_catalog | ngram      | ngram configuration
 pg_catalog | norwegian  | configuration for norwegian language
 pg_catalog | portuguese | configuration for portuguese language
 pg_catalog | pound      | pound configuration
 pg_catalog | romanian   | configuration for romanian language
 pg_catalog | russian    | configuration for russian language
 pg_catalog | simple     | simple configuration
 pg_catalog | spanish    | configuration for spanish language
 pg_catalog | swedish    | configuration for swedish language
 pg_catalog | turkish    | configuration for turkish language
 pg_catalog | zhparser   | zhparser configuration
(19 rows)


--查看默认分词器

omm=# show default_text_search_config;
 default_text_search_config 
----------------------------
 pg_catalog.english
(1 row)

5.表和索引

omm=# CREATE SCHEMA tsearch;
CREATE SCHEMA

omm=# CREATE TABLE tsearch.pgweb(id int, body text, title text, last_mod_date date);
CREATE TABLE

omm=# INSERT INTO tsearch.pgweb VALUES(1, 'China, officially the People''s Republic of China(PRC), located in Asia, is the world''s most populous state.', 'China', '2010-1-1');
INSERT 0 1

omm=# INSERT INTO tsearch.pgweb VALUES(2, 'America is a rock band, formed in England in 1970 by multi-instrumentalists Dewey Bunnell, Dan Peek, and Gerry Beckley.', 'America', '2010-1-1');
INSERT 0 1

omm=# INSERT INTO tsearch.pgweb VALUES(3, 'England is a country that is part of the United Kingdom. It shares land borders with Scotland to the north and Wales to the west.', 'England','2010-1-1');
INSERT 0 1

--将body字段中包含america的行打印出来

omm=# SELECT id, body, title FROM tsearch.pgweb WHERE to_tsvector(body) @@ to_tsquery('america');
 id |                                                          body                    
                                       |  title  
----+----------------------------------------------------------------------------------
---------------------------------------+---------
  2 | America is a rock band, formed in England in 1970 by multi-instrumentalists Dewey
 Bunnell, Dan Peek, and Gerry Beckley. | America
(1 row)

--检索出在title或者body字段中包含china和asia的行
omm=# SELECT title FROM tsearch.pgweb WHERE to_tsvector(title || ' ' || body) @@ to_tsquery('china & asia');
uery('china & asia');arch.pgweb WHERE to_tsvector(title || ' ' || body) @@ to_tsqu
 title 
-------
 China
(1 row)

--为了加速文本搜索，可以创建GIN索引(指定english配置来解析和规范化字符串)
omm=# CREATE INDEX pgweb_idx_1 ON tsearch.pgweb USING gin(to_tsvector('english', body));
CREATE INDEX

--连接列的索引
omm=# CREATE INDEX pgweb_idx_3 ON tsearch.pgweb USING gin(to_tsvector('english', title || ' ' ||body));
CREATE INDEX

--查看索引定义
omm=# \d+ tsearch.pgweb
                            Table "tsearch.pgweb"
    Column     |  Type   | Modifiers | Storage  | Stats target | Description 
---------------+---------+-----------+----------+--------------+-------------
 id            | integer |           | plain    |              | 
omm=#  body          | text    |           | extended |              | 
 title         | text    |           | extended |              | 
 last_mod_date | date    |           | plain    |              | 
Indexes:
    "pgweb_idx_1" gin (to_tsvector('english'::regconfig, body)) TABLESPACE pg_default
    "pgweb_idx_3" gin (to_tsvector('english'::regconfig, (title || ' '::text) || body)) TABLESPACE pg_default
Has OIDs: no
Options: orientation=row, compression=no

6.清理数据

omm=# drop schema tsearch cascade;
NOTICE:  drop cascades to table tsearch.pgweb
DROP SCHEMA

练习：

课程作业

1.用tsvector @@ tsquery和tsquery @@ tsvector完成两个基本文本匹配

omm=# SELECT 'Hello World'::tsvector;
    tsvector     
-----------------
 'Hello' 'World'
(1 row)

omm=# SELECT 'Hello & World'::tsquery;
      tsquery      
-------------------
 'Hello' & 'World'
(1 row)

omm=# SELECT to_tsquery('Hello:ab & World');
      to_tsquery      
----------------------
 'hello':AB & 'world'
(1 row)

omm=# SELECT 'really a fine day today'::tsvector @@ 'fine & day'::tsquery AS RESULT;
 result 
--------
 t
(1 row)

omm=# SELECT 'fine & day'::tsquery @@ 'really a fine day today'::tsvector AS RESULT;
 result 
--------
 t
(1 row)

2.创建表且至少有两个字段的类型为 text类型，在创建索引前进行全文检索

omm=# CREATE SCHEMA junzi;
CREATE SCHEMA

omm=# CREATE TABLE junzi.tbs_1(id int, name text, home text);
CREATE TABLE

omm=# INSERT INTO junzi.tbs_1 VALUES
(1, 'xiaohong de quan ming jiao dahong','ta de jia zai dong bei song hua jiang shang'),
(2, 'xiaolan de quan ming jiao dalan','ta de jia zai dong bei ha er bin'),
(3, 'xiaofen de quan ming jiao dafen','ta de jia zai dong bei cang chun');
INSERT 0 3


omm=# SELECT name FROM junzi.tbs_1 WHERE to_tsvector(name || ' ' || home) @@ to_tsquery('de & ming');
               name                
-----------------------------------
 xiaohong de quan ming jiao dahong
 xiaolan de quan ming jiao dalan
 xiaofen de quan ming jiao dafen
(3 rows)

3.创建GIN索引

omm=# CREATE INDEX test_idx_1 ON junzi.tbs_1 USING gin(to_tsvector('name'));
CREATE INDEX

omm=# \d+ junzi.tbs_1
                         Table "junzi.tbs_1"
 Column |  Type   | Modifiers | Storage  | Stats target | Description 
--------+---------+-----------+----------+--------------+-------------
 id     | integer |           | plain    |              | 
 name   | text    |           | extended |              | 
 home   | text    |           | extended |              | 
Indexes:
    "test_idx_1" gin (to_tsvector('name'::text)) TABLESPACE pg_default
Has OIDs: no
Options: orientation=row, compression=no

4.清理数据

omm=# drop schema junzi cascade;
NOTICE:  drop cascades to table junzi.tbs_1
DROP SCHEMA

opengauss opengauss训练营

「喜欢这篇文章，您的关注和赞赏是给作者最好的鼓励」

关注作者

(openGauss每日一练第 20天)：openGauss全文检索

1.tsvector

2.tsquery

3.基本文本匹配

4.分词器

5.表和索引

6.清理数据

评论