1.tsvector
--使用gsql登录openGauss
root@modb:~# su - omm
omm@modb:~$ gsql -r
gsql ((openGauss 2.0.0 build 78689da9) compiled at 2021-03-31 21:03:52 commit 0 last mr )
Non-SSL connection (SSL connection is recommended when requiring high-security)
Type "help" for help.
omm=#
--把一个字符串按照空格进行分词,分词的顺序是按照长短和字母排序的, 自动去掉分词中重复的词条
omm=# SELECT 'The Fat Rats'::tsvector;
tsvector
--------------------
'Fat' 'Rats' 'The'
(1 row)
--词条位置常量也可以放到词汇中
omm=# SELECT 'a:1 fat:2 cat:3 sat:4 on:5 a:6 mat:7 and:8 ate:9 a:10 fat:11 rat:12'::tsvector;
tsvector
-------------------------------------------------------------------------------
'a':1,6,10 'and':8 'ate':9 'cat':3 'fat':2,11 'mat':7 'on':5 'rat':12 'sat':4
(1 row)
--拥有位置的词汇甚至可以用一个权来标记,反映文档结构,这个权可以是A,B,C或D。默认的是D,因此输出中不会出现
omm=# SELECT 'a:1A fat:2B,4C cat:5D'::tsvector;
tsvector
----------------------------
'a':1A 'cat':5 'fat':2B,4C
(1 row)
--to_tsvector函数对这些单词进行规范化处理, 罗列出词条并连同它们文档中的位置
omm=# SELECT to_tsvector('english', 'The Fat Rats');
to_tsvector
-----------------
'fat':2 'rat':3
(1 row)
2.tsquery
omm=# SELECT 'fat & rat'::tsquery;
tsquery
---------------
'fat' & 'rat'
(1 row)
--规范化转为tsquery类型
omm=# SELECT to_tsquery('Fat:ab & Cats');
to_tsquery
------------------
'fat':AB & 'cat'
(1 row)
3.基本文本匹配
--全文检索基于匹配算子@@,当一个tsvector匹配到一个tsquery时,则返回true, tsvector和tsquery两种数据类型可以任意排序。
omm=# SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'cat & rat'::tsquery AS RESULT;
result
--------
t
(1 row)
#### omm=# SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::tsvector AS RESULT;
result
--------
f
(1 row)
--to_tsvector和to_tsquery标准化处理
omm=# SELECT to_tsvector('fat cats ate fat rats') @@ to_tsquery('fat & rat') AS RESULT;
result LECT to_tsvector('fat cats ate fat rats') @@ to_tsquery('fat & rat') AS RESULT;
--------
t
(1 row)
omm=# SELECT to_tsvector('fat cats ate fat rats') @@ to_tsquery('fat & cow') AS RESULT;
result LECT to_tsvector('fat cats ate fat rats') @@ to_tsquery('fat & cow') AS RESULT;
--------
f
(1 row)
4.分词器
--查看所有分词器
omm=# \dF
List of text search configurations
Schema | Name | Description
------------+------------+---------------------------------------
pg_catalog | danish | configuration for danish language
pg_catalog | dutch | configuration for dutch language
pg_catalog | english | configuration for english language
pg_catalog | finnish | configuration for finnish language
pg_catalog | french | configuration for french language
pg_catalog | german | configuration for german language
pg_catalog | hungarian | configuration for hungarian language
pg_catalog | italian | configuration for italian language
pg_catalog | ngram | ngram configuration
pg_catalog | norwegian | configuration for norwegian language
pg_catalog | portuguese | configuration for portuguese language
pg_catalog | pound | pound configuration
pg_catalog | romanian | configuration for romanian language
pg_catalog | russian | configuration for russian language
pg_catalog | simple | simple configuration
pg_catalog | spanish | configuration for spanish language
pg_catalog | swedish | configuration for swedish language
pg_catalog | turkish | configuration for turkish language
pg_catalog | zhparser | zhparser configuration
(19 rows)
--查看默认分词器
omm=# show default_text_search_config;
default_text_search_config
----------------------------
pg_catalog.english
(1 row)
5.表和索引
omm=# CREATE SCHEMA tsearch;
CREATE SCHEMA
omm=# CREATE TABLE tsearch.pgweb(id int, body text, title text, last_mod_date date);
CREATE TABLE
omm=# INSERT INTO tsearch.pgweb VALUES(1, 'China, officially the People''s Republic of China(PRC), located in Asia, is the world''s most populous state.', 'China', '2010-1-1');
INSERT 0 1
omm=# INSERT INTO tsearch.pgweb VALUES(2, 'America is a rock band, formed in England in 1970 by multi-instrumentalists Dewey Bunnell, Dan Peek, and Gerry Beckley.', 'America', '2010-1-1');
INSERT 0 1
omm=# INSERT INTO tsearch.pgweb VALUES(3, 'England is a country that is part of the United Kingdom. It shares land borders with Scotland to the north and Wales to the west.', 'England','2010-1-1');
INSERT 0 1
--将body字段中包含america的行打印出来
omm=# SELECT id, body, title FROM tsearch.pgweb WHERE to_tsvector(body) @@ to_tsquery('america');
id | body
| title
----+----------------------------------------------------------------------------------
---------------------------------------+---------
2 | America is a rock band, formed in England in 1970 by multi-instrumentalists Dewey
Bunnell, Dan Peek, and Gerry Beckley. | America
(1 row)
--检索出在title或者body字段中包含china和asia的行
omm=# SELECT title FROM tsearch.pgweb WHERE to_tsvector(title || ' ' || body) @@ to_tsquery('china & asia');
uery('china & asia');arch.pgweb WHERE to_tsvector(title || ' ' || body) @@ to_tsqu
title
-------
China
(1 row)
--为了加速文本搜索,可以创建GIN索引(指定english配置来解析和规范化字符串)
omm=# CREATE INDEX pgweb_idx_1 ON tsearch.pgweb USING gin(to_tsvector('english', body));
CREATE INDEX
--连接列的索引
omm=# CREATE INDEX pgweb_idx_3 ON tsearch.pgweb USING gin(to_tsvector('english', title || ' ' ||body));
CREATE INDEX
--查看索引定义
omm=# \d+ tsearch.pgweb
Table "tsearch.pgweb"
Column | Type | Modifiers | Storage | Stats target | Description
---------------+---------+-----------+----------+--------------+-------------
id | integer | | plain | |
omm=# body | text | | extended | |
title | text | | extended | |
last_mod_date | date | | plain | |
Indexes:
"pgweb_idx_1" gin (to_tsvector('english'::regconfig, body)) TABLESPACE pg_default
"pgweb_idx_3" gin (to_tsvector('english'::regconfig, (title || ' '::text) || body)) TABLESPACE pg_default
Has OIDs: no
Options: orientation=row, compression=no
6.清理数据
omm=# drop schema tsearch cascade;
NOTICE: drop cascades to table tsearch.pgweb
DROP SCHEMA
练习:
课程作业
1.用tsvector @@ tsquery和tsquery @@ tsvector完成两个基本文本匹配
omm=# SELECT 'Hello World'::tsvector;
tsvector
-----------------
'Hello' 'World'
(1 row)
omm=# SELECT 'Hello & World'::tsquery;
tsquery
-------------------
'Hello' & 'World'
(1 row)
omm=# SELECT to_tsquery('Hello:ab & World');
to_tsquery
----------------------
'hello':AB & 'world'
(1 row)
omm=# SELECT 'really a fine day today'::tsvector @@ 'fine & day'::tsquery AS RESULT;
result
--------
t
(1 row)
omm=# SELECT 'fine & day'::tsquery @@ 'really a fine day today'::tsvector AS RESULT;
result
--------
t
(1 row)
2.创建表且至少有两个字段的类型为 text类型,在创建索引前进行全文检索
omm=# CREATE SCHEMA junzi;
CREATE SCHEMA
omm=# CREATE TABLE junzi.tbs_1(id int, name text, home text);
CREATE TABLE
omm=# INSERT INTO junzi.tbs_1 VALUES
(1, 'xiaohong de quan ming jiao dahong','ta de jia zai dong bei song hua jiang shang'),
(2, 'xiaolan de quan ming jiao dalan','ta de jia zai dong bei ha er bin'),
(3, 'xiaofen de quan ming jiao dafen','ta de jia zai dong bei cang chun');
INSERT 0 3
omm=# SELECT name FROM junzi.tbs_1 WHERE to_tsvector(name || ' ' || home) @@ to_tsquery('de & ming');
name
-----------------------------------
xiaohong de quan ming jiao dahong
xiaolan de quan ming jiao dalan
xiaofen de quan ming jiao dafen
(3 rows)
3.创建GIN索引
omm=# CREATE INDEX test_idx_1 ON junzi.tbs_1 USING gin(to_tsvector('name'));
CREATE INDEX
omm=# \d+ junzi.tbs_1
Table "junzi.tbs_1"
Column | Type | Modifiers | Storage | Stats target | Description
--------+---------+-----------+----------+--------------+-------------
id | integer | | plain | |
name | text | | extended | |
home | text | | extended | |
Indexes:
"test_idx_1" gin (to_tsvector('name'::text)) TABLESPACE pg_default
Has OIDs: no
Options: orientation=row, compression=no
4.清理数据
omm=# drop schema junzi cascade;
NOTICE: drop cascades to table junzi.tbs_1
DROP SCHEMA
「喜欢这篇文章,您的关注和赞赏是给作者最好的鼓励」
关注作者
【版权声明】本文为墨天轮用户原创内容,转载时必须标注文章的来源(墨天轮),文章链接,文章作者等基本信息,否则作者和墨天轮有权追究责任。如果您发现墨天轮中有涉嫌抄袭或者侵权的内容,欢迎发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。




