暂无图片
暂无图片
暂无图片
暂无图片
暂无图片

PostgreSQL SQL 技巧例子 - 网站数据解析和copy LOAD, json处理, 字符串处理, 数组处理, CTE与法, 函数, inline code , 数据ETL (generated from原始数据)

digoal 2021-01-01
410

作者

digoal

日期

2021-01-19

标签

PostgreSQL , 技巧


背景

一系列学习例子, 帮助开发者掌握更多SQL技巧.

网站数据解析和copy LOAD,

json处理,

字符串处理,

数组处理,

CTE与法,

函数,

inline code ,

数据ETL (generated from原始数据)

https://api.pgxn.org/src/pgwaffles/pgwaffles-1.0.0/pgwaffles--1.sql

```
--complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pgwaffles" to load this file. \quit

-- Create the schema
create schema pgwaffles;

-- Create the tables for the extension
-- Data model:
-- INGREDIENT: name
-- NEEDS, 0N INGREDIENT, 0N RECIPE: quantity, unit, yield
-- RECIPE: title, picture, description, keywords, ranking
-- HAS INSTRUCTIONS, 0N STEP, _11 RECIPE
-- STEP: ordinality, description
--
-- http://mocodo.wingi.net/
create table pgwaffles.ingredient (
id integer generated always as identity primary key,
name text unique not null
);

create table pgwaffles.recipe (
id integer generated always as identity primary key,
title text unique not null,
picture text,
description text,
keywords text[] not null,
rating float
);

create table pgwaffles.step (
recipeId integer not null references pgwaffles.recipe (id),
ordinality integer not null,
description text not null,
primary key (recipeId, ordinality)
);

create table pgwaffles.ingredientInRecipe (
recipeId integer not null references pgwaffles.recipe (id),
yield integer not null,
ingredientId integer not null references pgwaffles.ingredient (id),
quantity decimal(5,3) not null,
unit text not null
);

-- Read the search results from allrecipes.com
-- sort=re means we will sort by descending rating
create temporary table allRecipes (html text);
copy allRecipes from program 'curl -s "https://www.allrecipes.com/search/results/?wt=belgium%20waffles&sort=re" | hxnormalize -x | hxselect -i "section.recipe-section > article.fixed-recipe-card > div.fixed-recipe-card__info > a" | tr "\n" " "';

-- Create the temporary table to store the best recipe
create temporary table processedRecipe(
data text,
recipe jsonb,
title text generated always as ((recipe -> 'name')::text) stored,
image text generated always as (
((recipe -> 'image')::jsonb -> 'url')::text
) stored,
description text generated always as (
(recipe -> 'description')::text
) stored,
yield integer generated always as (
regexp_replace((
regexp_split_to_array((recipe -> 'recipeYield')::text,' '))[1],
'"',
'')::integer
) stored,
ingredients jsonb generated always as (recipe -> 'recipeIngredient') stored,
steps jsonb generated always as (recipe -> 'recipeInstructions') stored,
keywords text[],
rating double precision generated always as (
((recipe -> 'aggregateRating')::jsonb -> 'ratingValue')::float
) stored
);

do
$$declare
mylink text;
_stmt text;
begin
-- Forge the copy query to get the html code source of the recipe
with rawHtml(parts) as (
-- Let's get the url links at the begining
select regexp_split_to_table(html, 'href="')
from allRecipes
),
myRecipes(url) as (
-- Let's stop at teh end of the link's url
select split_part(parts, '/">', 1)
from rawHtml
-- The first part does not contain a real link
where rawHtml.parts !~ '^<a data-content-provider-id'
)
select url into mylink
from myRecipes
limit 1;

_stmt := $query$copy processedRecipe (data) from program 'curl -s "$query$;    
_stmt := _stmt || mylink;    
_stmt := _stmt || $query$/" | hxnormalize -x |  hxselect -ci "script[type=application\/ld\+json]" | tr -d "\n" | sed "s=\\\\=\\\\\\\\=g"';$query$;    
execute _stmt;

end;
$$ language plpgsql;

-- Get the json we want
update processedRecipe set recipe = (regexp_match(
data,
'{ "@context": "http://schema.org", "@type": "Recipe",.*}')
)[1]::jsonb,
keywords = array['belgium', 'waffle'];

--Insert into my datamodel
with insertIntoRecipe(id) as (
insert into pgwaffles.recipe(title,
picture,
description,
keywords,
rating)
(select title,
image,
description,
keywords,
rating
from processedRecipe)
on conflict (title) do update set title = excluded.title
returning id
),
ingredients (data) as (
select
/ we'll remove parentethis and what's inside them as it can cause
* problems and those details shouldn't be necessary
/
regexp_replace(jsonb_array_elements_text(ingredients),
' ([^)]+) ',
' '
)
from processedRecipe
),
processedIngredients(data) as (
select
regexp_split_to_array (
/ we'll change those horrible fractions into real numbers /
case
when data ~ ' ½' then regexp_replace(data, ' ½','.5')
when data ~ '½' then regexp_replace(data, '½','0.5')
when data ~ ' ⅓' then regexp_replace(data, ' ⅓','.3')
when data ~ '⅓' then regexp_replace(data, '⅓','0.3')
when data ~ ' ⅕' then regexp_replace(data, ' ⅕','.2')
when data ~ '⅕' then regexp_replace(data, '⅕','0.2')
when data ~ ' ⅙' then regexp_replace(data, ' ⅙','.17')
when data ~ '⅙' then regexp_replace(data, '⅙','0.17')
when data ~ ' ⅛' then regexp_replace(data, ' ⅛','.125')
when data ~ '⅛' then regexp_replace(data, '⅛','0.125')
when data ~ ' ⅔' then regexp_replace(data, ' ⅔','.7')
when data ~ '⅔' then regexp_replace(data, '⅔','0.7')
when data ~ ' ⅖' then regexp_replace(data, ' ⅖','.4')
when data ~ '⅖' then regexp_replace(data, '⅖','0.4')
when data ~ ' ⅚' then regexp_replace(data, ' ⅚','.8')
when data ~ '⅚' then regexp_replace(data, '⅚','0.8')
when data ~ ' ⅜' then regexp_replace(data, ' ⅜','.375')
when data ~ '⅜' then regexp_replace(data, '⅜','0.375')
when data ~ ' ¾' then regexp_replace(data, ' ¾','.75')
when data ~ '¾' then regexp_replace(data, '¾','0.75')
when data ~ ' ⅗' then regexp_replace(data, ' ⅗','.6')
when data ~ '⅗' then regexp_replace(data, '⅗','0.6')
when data ~ ' ⅝' then regexp_replace(data, ' ⅝','.625')
when data ~ '⅝' then regexp_replace(data, '⅝','0.625')
when data ~ ' ⅞' then regexp_replace(data, ' ⅞','.875')
when data ~ ' ⅞' then regexp_replace(data, '⅞','.875')
when data ~ '⅘' then regexp_replace(data, ' ⅘','.8')
when data ~ ' ⅘' then regexp_replace(data, '⅘','0.8')
when data ~ '¼' then regexp_replace(data, ' ¼','.25')
when data ~ ' ¼' then regexp_replace(data, '¼','0.25')
when data ~ '⅐' then regexp_replace(data, ' ⅐','.14')
when data ~ ' ⅐' then regexp_replace(data, '⅐','0.14')
when data ~ '⅑' then regexp_replace(data, ' ⅑','.11')
when data ~ ' ⅑' then regexp_replace(data, '⅑','0.11')
when data ~ '⅒' then regexp_replace(data, ' ⅒','.1')
when data ~ ' ⅒' then regexp_replace(data, '⅒','0.1')
else data
end,
' ')
from ingredients
),
insertIntoIngredient(id) as (
insert into pgwaffles.ingredient (name)
(select lower(array_to_string(array_remove(array_remove(data,data[1]),data[2]),' '))
from processedIngredients
)
on conflict (name) do update set name = excluded.name
returning id, name
),
insertIntoIngredientInRecipe(id) as (
insert into pgwaffles.ingredientInRecipe(
ingredientId,
recipeId,
quantity,
yield,
unit
)
(select insertIntoIngredient.id,
insertIntoRecipe.id,
processedIngredients.data[1]::decimal(5,3) as quantity,
processedRecipe.yield,
processedIngredients.data[2] as unit
from insertIntoIngredient
inner join processedIngredients
on insertIntoIngredient.name = lower(array_to_string(array_remove(array_remove(processedIngredients.data,processedIngredients.data[1]),data[2]),' ')),
insertIntoRecipe,
processedRecipe
)
returning recipeId
)
insert into pgwaffles.step(recipeId,
ordinality,
description)
(select distinct insertIntoIngredientInRecipe.id,
steps_with_ordinality.ordinality,
steps_with_ordinality."text"
from processedRecipe,
insertIntoIngredientInRecipe,
rows from (jsonb_to_recordset(steps) as ("text" text,"@type" text))
with ordinality steps_with_ordinality)
;

create function pgwaffles.displayIngredients(out yield integer,
out quantity decimal(5,3),
out unit text,
out name text)
returns setof record as
$body$
-- Display the ingredients
with bestRecipe(recipeId) as (
select id
from pgwaffles.recipe
order by rating
limit 1
)
select
ingredientInRecipe.yield,
ingredientInRecipe.quantity,
ingredientInRecipe.unit,
ingredient.name
from pgwaffles.ingredientInRecipe
inner join bestRecipe
on ingredientInRecipe.recipeId = bestRecipe.recipeId
inner join pgwaffles.ingredient
on ingredientInRecipe.ingredientId = ingredient.id;
$body$ language sql;

create function pgwaffles.displayRecipe()
returns setof text as
$body$
--Display the steps
with bestRecipe(recipeId) as (
select id
from pgwaffles.recipe
order by rating
limit 1
)
select description
from pgwaffles.step
inner join bestRecipe
on step.recipeId = bestRecipe.recipeId
order by ordinality;
$body$ language sql;

-- Clean up
drop table allRecipes;
drop table processedRecipe;
```

PostgreSQL 许愿链接

您的愿望将传达给PG kernel hacker、数据库厂商等, 帮助提高数据库产品质量和功能, 说不定下一个PG版本就有您提出的功能点. 针对非常好的提议,奖励限量版PG文化衫、纪念品、贴纸、PG热门书籍等,奖品丰富,快来许愿。开不开森.

9.9元购买3个月阿里云RDS PostgreSQL实例

PostgreSQL 解决方案集合

德哥 / digoal's github - 公益是一辈子的事.

digoal's wechat

文章转载自digoal,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。

评论