CREATE TABLE movies(ReleaseYear int,Title text,Origin text,Director text,Casting text,Genre text,WikiPage text,Plot text);
\COPY movies(ReleaseYear, Title, Origin, Director, Casting, Genre, WikiPage, Plot)FROM 'wiki_movie_plots_deduped.csv' DELIMITER ',' CSV HEADER;
tsvector和tsquery数据类型 match运算符@@,用于检查tsquery是否与tsvector匹配 用于对每个匹配进行排名的函数(ts_rank、ts_rank_cd) GIN索引类型,用于高效查询tsvector的倒排索引
SELECT * FROM unnest(to_tsvector('english','I''m going to make him an offer he can''t refuse. Refusing is not an option.'));lexeme | positions | weights--------+-----------+---------go | {3} | {D}m | {2} | {D}make | {5} | {D}offer | {8} | {D}option | {17} | {D}refus | {12,13} | {D,D}(6 rows)
SELECT * FROM unnest(to_tsvector('simple','I''m going to make him an offer he can''t refuse. Refusing is not an option.'));lexeme | positions | weights----------+-----------+---------an | {7,16} | {D,D}can | {10} | {D}going | {3} | {D}he | {9} | {D}him | {6} | {D}i | {1} | {D}is | {14} | {D}m | {2} | {D}make | {5} | {D}not | {15} | {D}offer | {8} | {D}option | {17} | {D}refuse | {12} | {D}refusing | {13} | {D}t | {11} | {D}to | {4} | {D}(16 rows)
SELECT cfgname FROM pg_ts_config;
SELECT websearch_to_tsquery('english', 'the dark vader');websearch_to_tsquery----------------------'dark' & 'vader'
SELECT websearch_to_tsquery('english', 'quick OR dog');websearch_to_tsquery----------------------'dark' | 'vader'
SELECT websearch_to_tsquery('english', 'dark vader -wars');websearch_to_tsquery---------------------------'dark' & 'vader' & !'war'
SELECT websearch_to_tsquery('english', '"the dark vader son"');websearch_to_tsquery------------------------------'dark' <-> 'vader' <-> 'son'
SELECT websearch_to_tsquery('english', '"do or do not, there is no try"');websearch_to_tsquery----------------------'tri'(1 row)
SELECT websearch_to_tsquery('simple', '"do or do not, there is no try"');websearch_to_tsquery--------------------------------------------------------------------------'do' <-> 'or' <-> 'do' <-> 'not' <-> 'there' <-> 'is' <-> 'no' <-> 'try'
SELECT websearch_to_tsquery('english', 'dark vader') @@to_tsvector('english','Dark Vader is my father.');?column?----------t
SELECT websearch_to_tsquery('english', 'dark vader -father') @@to_tsvector('english','Dark Vader is my father.');?column?----------f
ALTER TABLE movies ADD search tsvector GENERATED ALWAYS AS(to_tsvector('english', Title) || ' ' ||to_tsvector('english', Plot) || ' ' ||to_tsvector('simple', Director) || ' ' ||to_tsvector('simple', Genre) || ' ' ||to_tsvector('simple', Origin) || ' ' ||to_tsvector('simple', Casting)) STORED;
CREATE INDEX idx_search ON movies USING GIN(search);
SELECT title FROM movies WHERE search @@ websearch_to_tsquery('english','dark vader');title--------------------------------------------------Star Wars Episode IV: A New Hope (aka Star Wars)Return of the JediStar Wars: Episode III – Revenge of the Sith(3 rows)
PostgreSQL提供了两个预定义的排名函数,它们考虑了词汇、接近度和结构信息;也就是说,它们考虑查询词在文档中出现的频率、词项在文档中的接近程度以及它们出现的文档部分的重要性。然而,"相关性"的概念是模糊的,并且非常应用程序特定。不同的应用可能需要额外的信息来进行排名,例如文档的修改时间。内置的排名函数只是示例。您可以编写自己的排名函数和/或将它们的结果与其他因素结合起来,以适应您的特定需求。
这两个提到的排名函数是ts_rank和ts_rank_cd。它们之间的区别在于,虽然它们都考虑了词项的频率,但ts_rank_cd还考虑了匹配词项之间的接近程度。
SELECT title,ts_rank(search, websearch_to_tsquery('english', 'dark vader')) rankFROM moviesWHERE search @@ websearch_to_tsquery('english','dark vader')ORDER BY rank DESCLIMIT 10;title | rank--------------------------------------------------+------------Return of the Jedi | 0.21563873Star Wars: Episode III – Revenge of the Sith | 0.12592985Star Wars Episode IV: A New Hope (aka Star Wars) | 0.05174401
标题中的匹配应该比描述或剧情中的匹配更重要。 更受欢迎的电影可以根据评级和/或收到的投票数进行推广。 考虑到用户偏好,某些类别可以得到更大的提升。例如,如果某个用户喜欢喜剧片,那么这些电影可以优先考虑。 在对搜索结果进行排名时,较新的标题可以被认为比非常老的标题更相关。
SELECT title,ts_rank(search, websearch_to_tsquery('english', 'jedi'))-- numeric booster example+ log(NumberOfVotes)*0.01FROM moviesWHERE search @@ websearch_to_tsquery('english','jedi')ORDER BY rank DESC LIMIT 10;
create function numericBooster(rating numeric, votes numeric, voteThreshold numeric)returns numeric as $$select case when votes < voteThreshold then 0 else rating end;$$ language sql;
SELECT title,ts_rank(search, websearch_to_tsquery('english', 'jedi'))-- numeric booster example+ numericBooster(Rating, NumberOfVotes, 100)*0.005FROM moviesWHERE search @@ websearch_to_tsquery('english','jedi')ORDER BY rank DESC LIMIT 10;
create function valueBooster (col text, val text, factor integer)returns integer as $$select case when col = val then factor else 0 end;$$ language sql;
SELECT title, genre,ts_rank(search, websearch_to_tsquery('english', 'jedi'))-- value booster example+ valueBooster(Genre, 'comedy', 0.05) rankFROM moviesWHERE search @@ websearch_to_tsquery('english','jedi') ORDER BY rank DESC LIMIT 10;title | genre | rank--------------------------------------------------+------------------------------------+---------------------The Men Who Stare at Goats | comedy | 0.1107927106320858Clerks | comedy | 0.1107927106320858Star Wars: The Clone Wars | animation | 0.09513916820287704Star Wars: Episode I – The Phantom Menace 3D | sci-fi | 0.09471701085567474Star Wars: Episode I – The Phantom Menace | space opera | 0.09471701085567474Star Wars: Episode II – Attack of the Clones | science fiction | 0.09285612404346466Star Wars: Episode III – Revenge of the Sith | science fiction, action | 0.09285612404346466Star Wars: The Last Jedi | action, adventure, fantasy, sci-fi | 0.0889768898487091Return of the Jedi | science fiction | 0.07599088549613953Star Wars Episode IV: A New Hope (aka Star Wars) | science fiction | 0.07599088549613953(10 rows)
ALTER TABLE movies ADD search tsvector GENERATED ALWAYS AS(setweight(to_tsvector('english', Title), 'A') || ' ' ||to_tsvector('english', Plot) || ' ' ||to_tsvector('simple', Director) || ' ' ||to_tsvector('simple', Genre) || ' ' ||to_tsvector('simple', Origin) || ' ' ||to_tsvector('simple', Casting)) STORED;
SELECT title, ts_rank(search, websearch_to_tsquery('english', 'jedi')) rankFROM moviesWHERE search @@ websearch_to_tsquery('english','jedi')ORDER BY rank DESC;title | rank--------------------------------------------------+-------------Star Wars: The Clone Wars | 0.09513917Star Wars: Episode I – The Phantom Menace | 0.09471701Star Wars: Episode I – The Phantom Menace 3D | 0.09471701Star Wars: Episode III – Revenge of the Sith | 0.092856124Star Wars: Episode II – Attack of the Clones | 0.092856124Star Wars: The Last Jedi | 0.08897689Return of the Jedi | 0.075990885Star Wars Episode IV: A New Hope (aka Star Wars) | 0.075990885Clerks | 0.06079271The Empire Strikes Back | 0.06079271The Men Who Stare at Goats | 0.06079271How to Deal | 0.06079271(12 rows)
SELECT title, ts_rank(search, websearch_to_tsquery('english', 'jedi')) rankFROM moviesWHERE search @@ websearch_to_tsquery('english','jedi')ORDER BY rank DESC;title | rank--------------------------------------------------+-------------Star Wars: The Last Jedi | 0.6361112Return of the Jedi | 0.6231253Star Wars: The Clone Wars | 0.09513917Star Wars: Episode I – The Phantom Menace | 0.09471701Star Wars: Episode I – The Phantom Menace 3D | 0.09471701Star Wars: Episode III – Revenge of the Sith | 0.092856124Star Wars: Episode II – Attack of the Clones | 0.092856124Star Wars Episode IV: A New Hope (aka Star Wars) | 0.075990885The Empire Strikes Back | 0.06079271Clerks | 0.06079271The Men Who Stare at Goats | 0.06079271How to Deal | 0.06079271(12 rows)
在单独的表中索引内容中的所有词元 对查询中的每个单词,使用相似度或Levenshtein距离在此表中进行搜索 修改查询以包括找到的任何单词 执行搜索
CREATE MATERLIAZED VIEW unique_lexeme ASSELECT word FROM ts_stat('SELECT search FROM movies');
SELECT * FROM unique_lexemeWHERE levenshtein_less_equal(word, 'pregant', 2) < 2;word----------premantpregrantpregnantpaegant

SELECT ReleaseYear/10*10 decade, count(Title) cnt FROM moviesWHERE search @@ websearch_to_tsquery('english','star wars')GROUP BY decade ORDER BY cnt DESC;decade | cnt--------+-----2000 | 392010 | 311990 | 291950 | 281940 | 261980 | 221930 | 131960 | 111970 | 71910 | 31920 | 3(11 rows)
WITH releaseYearFacets AS (SELECT 'Decade' facet, (ReleaseYear/10*10)::text val, count(Title) cntFROM moviesWHERE search @@ websearch_to_tsquery('english','star wars')GROUP BY val ORDER BY cnt DESC),genreFacets AS (SELECT 'Genre' facet, Genre val, count(Title) cnt FROM moviesWHERE search @@ websearch_to_tsquery('english','star wars')GROUP BY val ORDER BY cnt DESC LIMIT 5)SELECT * FROM releaseYearFacets UNION SELECT * FROM genreFacets;facet | val | cnt--------+---------+-----Decade | 1910 | 3Decade | 1920 | 3Decade | 1930 | 13Decade | 1940 | 26Decade | 1950 | 28Decade | 1960 | 11Decade | 1970 | 7Decade | 1980 | 22Decade | 1990 | 29Decade | 2000 | 39Decade | 2010 | 31Genre | comedy | 21Genre | drama | 35Genre | musical | 9Genre | unknown | 13Genre | war | 15(16 rows)
文章转载自云原生数据库,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。




