暂无图片
暂无图片
暂无图片
暂无图片
暂无图片

项目实战从0到1之hive(26)企业级数据仓库构建(八):搭建DWD 层-业务数据

逗先生大数据 2020-10-04
138


   

                 点击上方蓝字关注我们           


1.1 DWD 层(业务数据)



1.1.1 商品维度表(全量表)

1)建表语句

DROP TABLE IF EXISTS `dwd_dim_sku_info`;

CREATE EXTERNAL TABLE `dwd_dim_sku_info` (

`id` string COMMENT '商品 id',

`spu_id` string COMMENT 'spuid',

`price` double COMMENT '商品价格',

`sku_name` string COMMENT '商品名称',

`sku_desc` string COMMENT '商品描述',

`weight` double COMMENT '重量',

`tm_id` string COMMENT '品牌 id',

`tm_name` string COMMENT '品牌名称',

`category3_id` string COMMENT '三级分类 id',

`category2_id` string COMMENT '二级分类 id',

`category1_id` string COMMENT '一级分类 id',

`category3_name` string COMMENT '三级分类名称',

`category2_name` string COMMENT '二级分类名称',

`category1_name` string COMMENT '一级分类名称',

`spu_name` string COMMENT 'spu 名称',

`create_time` string COMMENT '创建时间'

)

COMMENT '商品维度表'

PARTITIONED BY (`dt` string)

stored as parquet

location '/warehouse/gmall/dwd/dwd_dim_sku_info/'

tblproperties ("parquet.compression"="lzo");

2)数据装载

insert overwrite table dwd_dim_sku_info partition(dt='2020-03-10')

select

sku.id,

sku.spu_id,

sku.price,

sku.sku_name,

sku.sku_desc,

sku.weight,

sku.tm_id,

ob.tm_name,

sku.category3_id,

c2.id category2_id,

c1.id category1_id,

c3.name category3_name,

c2.name category2_name,

c1.name category1_name,

spu.spu_name,

sku.create_time

from

(

select * from ods_sku_info where dt='2020-03-10'

)sku

join

(

select * from ods_base_trademark where dt='2020-03-10'

)ob on sku.tm_id=ob.tm_id

join

(

select * from ods_spu_info where dt='2020-03-10'

)spu on spu.id = sku.spu_id

join

(

select * from ods_base_category3 where dt='2020-03-10'

)c3 on sku.category3_id=c3.id

join

(

select * from ods_base_category2 where dt='2020-03-10'

)c2 on c3.category2_id=c2.id

join

(

select * from ods_base_category1 where dt='2020-03-10'

)c1 on c2.category1_id=c1.id;

1.1.2 优惠券信息表(全量)

把 ODS 层 ods_coupon_info 表数据导入到 DWD 层优惠卷信息表,在导入过程中可以做适当的清洗

1)建表语句

drop table if exists dwd_dim_coupon_info;

create external table dwd_dim_coupon_info(

`id` string COMMENT '购物券编号',

`coupon_name` string COMMENT '购物券名称',

`coupon_type` string COMMENT '购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券',

`condition_amount` string COMMENT '满额数',

`condition_num` string COMMENT '满件数',

`activity_id` string COMMENT '活动编号',

`benefit_amount` string COMMENT '减金额',

`benefit_discount` string COMMENT '折扣',

`create_time` string COMMENT '创建时间',

`range_type` string COMMENT '范围类型 1、商品 2、品类 3、品牌',

`spu_id` string COMMENT '商品 id',

`tm_id` string COMMENT '品牌 id',

`category3_id` string COMMENT '品类 id',

`limit_num` string COMMENT '最多领用次数',

`operate_time` string COMMENT '修改时间',

`expire_time` string COMMENT '过期时间'

) COMMENT '优惠券信息表'

PARTITIONED BY (`dt` string)

row format delimited fields terminated by '\t'

stored as parquet

location '/warehouse/gmall/dwd/dwd_dim_coupon_info/'

tblproperties ("parquet.compression"="lzo");

2)数据装载

insert overwrite table dwd_dim_coupon_info partition(dt='2020-03-10')

select

id,

coupon_name,

coupon_type,

condition_amount,

condition_num,

activity_id,

benefit_amount,

benefit_discount,

create_time,

range_type,

spu_id,

tm_id,

category3_id,

limit_num,

operate_time,

expire_time

from ods_coupon_info

where dt='2020-03-10';

3)查询加载结果

select * from dwd_dim_coupon_info where dt='2020-03-10';

1.1.3 活动维度表(全量)

1)建表语句

drop table if exists dwd_dim_activity_info;

create external table dwd_dim_activity_info(

`id` string COMMENT '编号',

`activity_name` string COMMENT '活动名称',

`activity_type` string COMMENT '活动类型',

`condition_amount` string COMMENT '满减金额',

`condition_num` string COMMENT '满减件数',

`benefit_amount` string COMMENT '优惠金额',

`benefit_discount` string COMMENT '优惠折扣',

`benefit_level` string COMMENT '优惠级别',

`start_time` string COMMENT '开始时间',

`end_time` string COMMENT '结束时间',

`create_time` string COMMENT '创建时间'

) COMMENT '活动信息表'

PARTITIONED BY (`dt` string)

row format delimited fields terminated by '\t'

stored as parquet

location '/warehouse/gmall/dwd/dwd_dim_activity_info/'

tblproperties ("parquet.compression"="lzo");

2)数据装载

insert overwrite table dwd_dim_activity_info partition(dt='2020-03-10')

select

info.id,

info.activity_name,

info.activity_type,

rule.condition_amount,

rule.condition_num,

rule.benefit_amount,

rule.benefit_discount,

rule.benefit_level,

info.start_time,

info.end_time,

info.create_time

from

(

select * from ods_activity_info where dt='2020-03-10'

)info

left join

(

select * from ods_activity_rule where dt='2020-03-10'

)rule on info.id = rule.activity_id;

3)查询加载结果

select * from dwd_dim_activity_info where dt='2020-03-10';

 1.1.4 地区维度表(特殊)

1)建表语句

DROP TABLE IF EXISTS `dwd_dim_base_province`;

CREATE EXTERNAL TABLE `dwd_dim_base_province` (

`id` string COMMENT 'id',

`province_name` string COMMENT '省市名称',

`area_code` string COMMENT '地区编码',

`iso_code` string COMMENT 'ISO 编码',

`region_id` string COMMENT '地区 id',

`region_name` string COMMENT '地区名称'

)

COMMENT '地区省市表'

stored as parquet

location '/warehouse/gmall/dwd/dwd_dim_base_province/'

tblproperties ("parquet.compression"="lzo");

2)数据装载

insert overwrite table dwd_dim_base_province

select

bp.id,

bp.name,

bp.area_code,

bp.iso_code,

bp.region_id,

br.region_name

from ods_base_province bp

join ods_base_region br

on bp.region_id=br.id;

1.1.5 时间维度表(特殊)(预留)

1)建表语句

DROP TABLE IF EXISTS `dwd_dim_date_info`;

CREATE EXTERNAL TABLE `dwd_dim_date_info`(

`date_id` string COMMENT '日',

`week_id` int COMMENT '周',

`week_day` int COMMENT '周的第几天',

`day` int COMMENT '每月的第几天',

`month` int COMMENT '第几月',

`quarter` int COMMENT '第几季度',

`year` int COMMENT '年',

`is_workday` int COMMENT '是否是周末',

`holiday_id` int COMMENT '是否是节假日'

)

row format delimited fields terminated by '\t'

stored as parquet

location '/warehouse/gmall/dwd/dwd_dim_date_info/'

tblproperties ("parquet.compression"="lzo");

2)把 date_info.txt 文件上传到 node01 的 opt/modules/db_log/路径

3)数据装载

load data local inpath '/opt/modules/db_log/date_info.txt' into table dwd_dim_date_info;

4)查询加载结果

select * from dwd_dim_date_info;

1.1.6 订单明细事实表(事务型快照事实表)

1)建表语句

drop table if exists dwd_fact_order_detail;

create external table dwd_fact_order_detail (

`id` string COMMENT '订单编号',

`order_id` string COMMENT '订单号',

`user_id` string COMMENT '用户 id',

`sku_id` string COMMENT 'sku 商品 id',

`sku_name` string COMMENT '商品名称',

`order_price` decimal(10,2) COMMENT '商品价格',

`sku_num` bigint COMMENT '商品数量',

`create_time` string COMMENT '创建时间',

`province_id` string COMMENT '省份 ID',

`total_amount` decimal(20,2) COMMENT '订单总金额'

)

PARTITIONED BY (`dt` string)

stored as parquet

location '/warehouse/gmall/dwd/dwd_fact_order_detail/'

tblproperties ("parquet.compression"="lzo");

2)数据装载

insert overwrite table dwd_fact_order_detail partition(dt='2020-03-10')

select

od.id,

od.order_id,

od.user_id,

od.sku_id,

od.sku_name,

od.order_price,

od.sku_num,

od.create_time,

oi.province_id,

od.order_price*od.sku_num

from

(

select * from ods_order_detail where dt='2020-03-10'

) od

join

(

select * from ods_order_info where dt='2020-03-10'

) oi

on od.order_id=oi.id;

3)查询加载结果

select * from dwd_fact_order_detail where dt='2020-03-10';

1.1.7 支付事实表(事务型快照事实表)

1)建表语句

drop table if exists dwd_fact_payment_info;

create external table dwd_fact_payment_info (

`id` string COMMENT '',

`out_trade_no` string COMMENT '对外业务编号',

`order_id` string COMMENT '订单编号',

`user_id` string COMMENT '用户编号',

`alipay_trade_no` string COMMENT '支付宝交易流水编号',

`payment_amount` decimal(16,2) COMMENT '支付金额',

`subject` string COMMENT '交易内容',

`payment_type` string COMMENT '支付类型',

`payment_time` string COMMENT '支付时间',

`province_id` string COMMENT '省份 ID'

)

PARTITIONED BY (`dt` string)

stored as parquet

location '/warehouse/gmall/dwd/dwd_fact_payment_info/'

tblproperties ("parquet.compression"="lzo");

2)数据装载

insert overwrite table dwd_fact_payment_info partition(dt='2020-03-10')

select

pi.id,

pi.out_trade_no,

pi.order_id,

pi.user_id,

pi.alipay_trade_no,

pi.total_amount,

pi.subject,

pi.payment_type,

pi.payment_time,

oi.province_id

from

(

select * from ods_payment_info where dt='2020-03-10'

)pi

join

(

select id, province_id from ods_order_info where dt='2020-03-10'

)oi

on pi.order_id = oi.id;

3)查询加载结果

select * from dwd_fact_payment_info where dt='2020-03-10';

1.1.8 退款事实表(事务型快照事实表)

把 ODS 层 ods_order_refund_info 表数据导入到 DWD 层退款事实表,在导入过程中可以做适当的清洗

1)建表语句

drop table if exists dwd_fact_order_refund_info;

create external table dwd_fact_order_refund_info(

`id` string COMMENT '编号',

`user_id` string COMMENT '用户 ID',

`order_id` string COMMENT '订单 ID',

`sku_id` string COMMENT '商品 ID',

`refund_type` string COMMENT '退款类型',

`refund_num` bigint COMMENT '退款件数',

`refund_amount` decimal(16,2) COMMENT '退款金额',

`refund_reason_type` string COMMENT '退款原因类型',

`create_time` string COMMENT '退款时间'

) COMMENT '退款事实表'

PARTITIONED BY (`dt` string)

row format delimited fields terminated by '\t'

location '/warehouse/gmall/dwd/dwd_fact_order_refund_info/';

2)数据装载

insert overwrite table dwd_fact_order_refund_info partition(dt='2020-03-10')

select

id,

user_id,

order_id,

sku_id,

refund_type,

refund_num,

refund_amount,

refund_reason_type,

create_time

from ods_order_refund_info

where dt='2020-03-10';

3)查询加载结果

select * from dwd_fact_order_refund_info where dt='2020-03-10';

1.1.9 评价事实表(事务型快照事实表)

把 ODS 层 ods_comment_info 表数据导入到 DWD 层评价事实表,在导入过程中可以做适当的清洗

1)建表语句

drop table if exists dwd_fact_comment_info;

create external table dwd_fact_comment_info(

`id` string COMMENT '编号',

`user_id` string COMMENT '用户 ID',

`sku_id` string COMMENT '商品 sku',

`spu_id` string COMMENT '商品 spu',

`order_id` string COMMENT '订单 ID',

`appraise` string COMMENT '评价',

`create_time` string COMMENT '评价时间'

) COMMENT '评价事实表'

PARTITIONED BY (`dt` string)

row format delimited fields terminated by '\t'

location '/warehouse/gmall/dwd/dwd_fact_comment_info/';

2)数据装载

insert overwrite table dwd_fact_comment_info partition(dt='2020-03-10')

select

id,

user_id,

sku_id,

spu_id,

order_id,

appraise,

create_time

from ods_comment_info

where dt='2020-03-10';

1.1.10 加购事实表(周期型快照事实表,每日快照)

由于购物车的数量是会发生变化,所以导增量不合适

每天做一次快照,导入的数据是全量,区别于事务型事实表是每天导入新增

周期型快照事实表劣势:存储的数据量会比较大

解决方案:周期型快照事实表存储的数据比较讲究时效性,时间太久了的意义不大,可以删除以前的数据

1)建表语句

drop table if exists dwd_fact_cart_info;

create external table dwd_fact_cart_info(

`id` string COMMENT '编号',

`user_id` string COMMENT '用户 id',

`sku_id` string COMMENT 'skuid',

`cart_price` string COMMENT '放入购物车时价格',

`sku_num` string COMMENT '数量',

`sku_name` string COMMENT 'sku 名称 (冗余)',

`create_time` string COMMENT '创建时间',

`operate_time` string COMMENT '修改时间',

`is_ordered` string COMMENT '是否已经下单。1 为已下单;0 为未下单',

`order_time` string COMMENT '下单时间'

) COMMENT '加购事实表'

PARTITIONED BY (`dt` string)

row format delimited fields terminated by '\t'

location '/warehouse/gmall/dwd/dwd_fact_cart_info/';

2)数据装载

insert overwrite table dwd_fact_cart_info partition(dt='2020-03-10')

select

id,

user_id,

sku_id,

cart_price,

sku_num,

sku_name,

create_time,

operate_time,

is_ordered,

order_time

from ods_cart_info

where dt='2020-03-10';

3)查询加载结果

select * from dwd_fact_cart_info where dt='2020-03-10';

1.1.11 收藏事实表(周期型快照事实表,每日快照)

收藏的标记,是否取消,会发生变化,做增量不合适

每天做一次快照,导入的数据是全量,区别于事务型事实表是每天导入新增

1)建表语句

drop table if exists dwd_fact_favor_info;

create external table dwd_fact_favor_info(

`id` string COMMENT '编号',

`user_id` string COMMENT '用户 id',

`sku_id` string COMMENT 'skuid',

`spu_id` string COMMENT 'spuid',

`is_cancel` string COMMENT '是否取消',

`create_time` string COMMENT '收藏时间',

`cancel_time` string COMMENT '取消时间'

) COMMENT '收藏事实表'

PARTITIONED BY (`dt` string)

row format delimited fields terminated by '\t'

location '/warehouse/gmall/dwd/dwd_fact_favor_info/';

2)数据装载

insert overwrite table dwd_fact_favor_info partition(dt='2020-03-10')

select

id,

user_id,

sku_id,

spu_id,

is_cancel,

create_time,

cancel_time

from ods_favor_info

where dt='2020-03-10';

3)查询加载结果

select * from dwd_fact_favor_info where dt='2020-03-10';

1.1.12 优惠券领用事实表(累积型快照事实表)

优惠卷的生命周期:领取优惠卷-》用优惠卷下单-》优惠卷参与支付

累积型快照事实表使用:统计优惠卷领取次数、优惠卷下单次数、优惠卷参与支付次数

1)建表语句

drop table if exists dwd_fact_coupon_use;

create external table dwd_fact_coupon_use(

`id` string COMMENT '编号',

`coupon_id` string COMMENT '优惠券 ID',

`user_id` string COMMENT 'userid',

`order_id` string COMMENT '订单 id',

`coupon_status` string COMMENT '优惠券状态',

`get_time` string COMMENT '领取时间',

`using_time` string COMMENT '使用时间(下单)',

`used_time` string COMMENT '使用时间(支付)'

) COMMENT '优惠券领用事实表'

PARTITIONED BY (`dt` string)

row format delimited fields terminated by '\t'

location '/warehouse/gmall/dwd/dwd_fact_coupon_use/';

注意:dt 是按照优惠卷领用时间 get_time 做为分区

2)数据装载

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table dwd_fact_coupon_use partition(dt)

select

if(new.id is null,old.id,new.id),

if(new.coupon_id is null,old.coupon_id,new.coupon_id),

if(new.user_id is null,old.user_id,new.user_id),

if(new.order_id is null,old.order_id,new.order_id),

if(new.coupon_status is null,old.coupon_status,new.coupon_status),

if(new.get_time is null,old.get_time,new.get_time),

if(new.using_time is null,old.using_time,new.using_time),

if(new.used_time is null,old.used_time,new.used_time),

date_format(if(new.get_time is null,old.get_time,new.get_time),'yyyy-MM-dd')

from

(

select

id,

coupon_id,

user_id,

order_id,

coupon_status,

get_time,

using_time,

used_time

from dwd_fact_coupon_use

where dt in

(

select

date_format(get_time,'yyyy-MM-dd')

from ods_coupon_use

where dt='2020-03-10'

)

)old

full outer join

(

select

id,

coupon_id,

user_id,

order_id,

coupon_status,

get_time,

using_time,

used_time

from ods_coupon_use

where dt='2020-03-10'

)new

on old.id=new.id;

1.1.13 订单事实表(累积型快照事实表)

1)concat 函数

concat 函数在连接字符串的时候,只要其中一个是 NULL,那么将返回 NULL

hive> select concat('a','b');

ab

hive> select concat('a','b',null);

NULL

2)concat_ws 函数

concat_ws 函数在连接字符串的时候,只要有一个字符串不是 NULL,就不会返回 NULL。concat_ws 函数需要指定分隔符

hive> select concat_ws('-','a','b');

a-b

hive> select concat_ws('-','a','b',null);

a-b

hive> select concat_ws('','a','b',null);

ab

(1)语法描述

STR_TO_MAP(VARCHAR text, VARCHAR listDelimiter, VARCHAR keyValueDelimiter)

(2)功能描述

使用 listDelimiter 将 text 分隔成 K-V 对,然后使用 keyValueDelimiter 分隔每个 K-V 对,

组装成 MAP 返回。默认 listDelimiter 为( ,),keyValueDelimiter 为(=)。

(3)案例

str_to_map(‘1001=2020-03-10,1002=2020-03-10’, ‘,’ , ‘=’)

输出{“1001”:“2020-03-10”,“1002”:“2020-03-10”}

4)建表语句

订单生命周期:创建时间=》支付时间=》取消时间=》完成时间=》退款时间=》退款完成时间

由于 ODS 层订单表只有创建时间和操作时间两个状态,不能表达所有时间含义,所以需要关联订单状态表。订单事实表里面增加了活动 id,所以需要关联活动订单表

drop table if exists dwd_fact_order_info;

create external table dwd_fact_order_info (

`id` string COMMENT '订单编号',

`order_status` string COMMENT '订单状态',

`user_id` string COMMENT '用户 id',

`out_trade_no` string COMMENT '支付流水号',

`create_time` string COMMENT '创建时间(未支付状态)',

`payment_time` string COMMENT '支付时间(已支付状态)',

`cancel_time` string COMMENT '取消时间(已取消状态)',

`finish_time` string COMMENT '完成时间(已完成状态)',

`refund_time` string COMMENT '退款时间(退款中状态)',

`refund_finish_time` string COMMENT '退款完成时间(退款完成状态)',

`province_id` string COMMENT '省份 ID',

`activity_id` string COMMENT '活动 ID',

`original_total_amount` string COMMENT '原价金额',

`benefit_reduce_amount` string COMMENT '优惠金额',

`feight_fee` string COMMENT '运费',

`final_total_amount` decimal(10,2) COMMENT '订单金额'

)

PARTITIONED BY (`dt` string)

stored as parquet

location '/warehouse/gmall/dwd/dwd_fact_order_info/'

tblproperties ("parquet.compression"="lzo");

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table dwd_fact_order_info partition(dt)

select

if(new.id is null,old.id,new.id),

if(new.order_status is null,old.order_status,new.order_status),

if(new.user_id is null,old.user_id,new.user_id),

if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no),

if(new.tms['1001'] is null,old.create_time,new.tms['1001']),--1001 对应未支付状态

if(new.tms['1002'] is null,old.payment_time,new.tms['1002']),

if(new.tms['1003'] is null,old.cancel_time,new.tms['1003']),

if(new.tms['1004'] is null,old.finish_time,new.tms['1004']),

if(new.tms['1005'] is null,old.refund_time,new.tms['1005']),

if(new.tms['1006'] is null,old.refund_finish_time,new.tms['1006']),

if(new.province_id is null,old.province_id,new.province_id),

if(new.activity_id is null,old.activity_id,new.activity_id),

if(new.original_total_amount is

null,old.original_total_amount,new.original_total_amount),

if(new.benefit_reduce_amount is

null,old.benefit_reduce_amount,new.benefit_reduce_amount),

if(new.feight_fee is null,old.feight_fee,new.feight_fee),

if(new.final_total_amount is null,old.final_total_amount,new.final_total_amount),

date_format(if(new.tms['1001'] is

null,old.create_time,new.tms['1001']),'yyyy-MM-dd')

from

(

select

id,

order_status,

user_id,

out_trade_no,

create_time,

payment_time,

cancel_time,

finish_time,

refund_time,

refund_finish_time,

province_id,

activity_id,

original_total_amount,

benefit_reduce_amount,

feight_fee,

final_total_amount

from dwd_fact_order_info

where dt

in

(

select

date_format(create_time,'yyyy-MM-dd')

from ods_order_info

where dt='2020-03-10'

)

)old

full outer join

(

select

info.id,

info.order_status,

info.user_id,

info.out_trade_no,

info.province_id,

act.activity_id,

log.tms,

info.original_total_amount,

info.benefit_reduce_amount,

info.feight_fee,

info.final_total_amount

from

(

select

order_id,

str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','=')

tms

from ods_order_status_log

where dt='2020-03-10'

group by order_id

)log

join

(

select * from ods_order_info where dt='2020-03-10'

)info

on log.order_id=info.id

left join

(

select * from ods_activity_order where dt='2020-03-10'

)act

on log.order_id=act.order_id

)new

on old.id=new.id;

1.1.14 用户维度表(拉链表)

用户表中的数据每日既有可能新增,也有可能修改,但修改频率并不高,属于缓慢变化

维度,此处采用拉链表存储用户维度数据

1)什么是拉链表

2)为什么要做拉链表

<ignore_js_op>

3)拉链表形成过程

4)拉链表制作过程图

5)拉链表制作过程

步骤 0:初始化拉链表(首次独立执行)

(1)建立拉链表

drop table if exists dwd_dim_user_info_his;

create external table dwd_dim_user_info_his(

`id` string COMMENT '用户 id',

`name` string COMMENT '姓名',

`birthday` string COMMENT '生日',

`gender` string COMMENT '性别',

`email` string COMMENT '邮箱',

`user_level` string COMMENT '用户等级',

`create_time` string COMMENT '创建时间',

`operate_time` string COMMENT '操作时间',

`start_date` string COMMENT '有效开始日期',

`end_date` string COMMENT '有效结束日期'

) COMMENT '订单拉链表'

stored as parquet

location '/warehouse/gmall/dwd/dwd_dim_user_info_his/'

tblproperties ("parquet.compression"="lzo");

(2)初始化拉链表

insert overwrite table dwd_dim_user_info_his

select

id,

name,

birthday,

gender,

email,

user_level,

create_time,

operate_time,

'2020-03-10',

'9999-99-99'

from ods_user_info oi

where oi.dt='2020-03-10';

步骤 1:制作当日变动数据(包括新增,修改)每日执行

(1)如何获得每日变动表

a.最好表内有创建时间和变动时间(Lucky!)

b.如果没有,可以利用第三方工具监控比如 canal,监控 MySQL 的实时变化进行记录(麻烦)

c.逐行对比前后两天的数据,检查 md5(concat(全部有可能变化的字段))是否相同(low)

d.要求业务数据库提供变动流水(人品,颜值)

(2)因为 ods_order_info 本身导入过来就是新增变动明细的表,所以不用处理

a)数据库中新增 2020-03-11 一天的数据

b)通过 Sqoop 把 2020-03-11 日所有数据导入mysqlTohdfs.sh all 2020-03-11

c)ods 层数据导入hdfs_to_ods_db.sh all 2020-03-11

步骤 2:先合并变动信息,再追加新增信息,插入到临时表中

1)建立临时表

drop table if exists dwd_dim_user_info_his_tmp;

create external table dwd_dim_user_info_his_tmp(

`id` string COMMENT '用户 id',

`name` string COMMENT '姓名',

`birthday` string COMMENT '生日',

`gender` string COMMENT '性别',

`email` string COMMENT '邮箱',

`user_level` string COMMENT '用户等级',

`create_time` string COMMENT '创建时间',

`operate_time` string COMMENT '操作时间',

`start_date` string COMMENT '有效开始日期',

`end_date` string COMMENT '有效结束日期'

) COMMENT '订单拉链临时表'

stored as parquet

location '/warehouse/gmall/dwd/dwd_dim_user_info_his_tmp/'

tblproperties ("parquet.compression"="lzo");

2)导入脚本

insert overwrite table dwd_dim_user_info_his_tmp

select * from

(

select

id,

name,

birthday,

gender,

email,

user_level,

create_time,

operate_time,

'2020-03-11' start_date,

'9999-99-99' end_date

from ods_user_info where dt='2020-03-11'

union all

select

uh.id,

uh.name,

uh.birthday,

uh.gender,

uh.email,

uh.user_level,

uh.create_time,

uh.operate_time,

uh.start_date,

if(ui.id is not null and uh.end_date='9999-99-99', date_add(ui.dt,-1),

uh.end_date) end_date

from dwd_dim_user_info_his uh left join

(

select

*

from ods_user_info

where dt='2020-03-11'

) ui on uh.id=ui.id

)his

order by his.id, start_date;

1.1.15 DWD 层数据导入脚本

1)vim ods_to_dwd_db.sh

#!/bin/bash

APP=gmall

hive=/opt/modules/hive/bin/hive

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天

if [ -n "$2" ] ;then

do_date=$2

else

do_date=`date -d "-1 day" +%F`

fi

sql1="

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table ${APP}.dwd_dim_sku_info partition(dt='$do_date')

select

sku.id,

sku.spu_id,

sku.price,

sku.sku_name,

sku.sku_desc,

sku.weight,

sku.tm_id,

ob.tm_name,

sku.category3_id,

c2.id category2_id,

c1.id category1_id,

c3.name category3_name,

c2.name category2_name,

c1.name category1_name,

spu.spu_name,

sku.create_time

from

(

select * from ${APP}.ods_sku_info where dt='$do_date'

)sku

join

(

select * from ${APP}.ods_base_trademark where dt='$do_date'

)ob on sku.tm_id=ob.tm_id

join

(

select * from ${APP}.ods_spu_info where dt='$do_date'

)spu on spu.id = sku.spu_id

join

(

select * from ${APP}.ods_base_category3 where dt='$do_date'

)c3 on sku.category3_id=c3.id

join

(

select * from ${APP}.ods_base_category2 where dt='$do_date'

)c2 on c3.category2_id=c2.id

join

(

select * from ${APP}.ods_base_category1 where dt='$do_date'

)c1 on c2.category1_id=c1.id;

insert overwrite table ${APP}.dwd_dim_coupon_info partition(dt='$do_date')

select

id,

coupon_name,

coupon_type,

condition_amount,

condition_num,

activity_id,

benefit_amount,

benefit_discount,

create_time,

range_type,

spu_id,

tm_id,

category3_id,

limit_num,

operate_time,

expire_time

from ${APP}.ods_coupon_info

where dt='$do_date';

insert overwrite table ${APP}.dwd_dim_activity_info partition(dt='$do_date')

select

info.id,

info.activity_name,

info.activity_type,

rule.condition_amount,

rule.condition_num,

rule.benefit_amount,

rule.benefit_discount,

rule.benefit_level,

info.start_time,

info.end_time,

info.create_time

from

(

select * from ${APP}.ods_activity_info where dt='$do_date'

)info

left join

(

select * from ${APP}.ods_activity_rule where dt='$do_date'

)rule on info.id = rule.activity_id;

insert overwrite table ${APP}.dwd_fact_order_detail partition(dt='$do_date')

select

od.id,

od.order_id,

od.user_id,

od.sku_id,

od.sku_name,

od.order_price,

od.sku_num,

od.create_time,

oi.province_id,

od.order_price*od.sku_num

from

(

select * from ${APP}.ods_order_detail where dt='$do_date'

) od

join

(

select * from ${APP}.ods_order_info where dt='$do_date'

) oi

on od.order_id=oi.id;

insert overwrite table ${APP}.dwd_fact_payment_info partition(dt='$do_date')

select

pi.id,

pi.out_trade_no,

pi.order_id,

pi.user_id,

pi.alipay_trade_no,

pi.total_amount,

pi.subject,

pi.payment_type,

pi.payment_time,

oi.province_id

from

(

select * from ${APP}.ods_payment_info where dt='$do_date'

)pi

join

(

select id, province_id from ${APP}.ods_order_info where dt='$do_date'

)oi

on pi.order_id = oi.id;

insert overwrite table ${APP}.dwd_fact_order_refund_info partition(dt='$do_date')

select

id,

user_id,

order_id,

sku_id,

refund_type,

refund_num,

refund_amount,

refund_reason_type,

create_time

from ${APP}.ods_order_refund_info

where dt='$do_date';

insert overwrite table ${APP}.dwd_fact_comment_info partition(dt='$do_date')

select

id,

user_id,

sku_id,

spu_id,

order_id,

appraise,

create_time

from ${APP}.ods_comment_info

where dt='$do_date';

insert overwrite table ${APP}.dwd_fact_cart_info partition(dt='$do_date')

select

id,

user_id,

sku_id,

cart_price,

sku_num,

sku_name,

create_time,

operate_time,

is_ordered,

order_time

from ${APP}.ods_cart_info

where dt='$do_date';

insert overwrite table ${APP}.dwd_fact_favor_info partition(dt='$do_date')

select

id,

user_id,

sku_id,

spu_id,

is_cancel,

create_time,

cancel_time

from ${APP}.ods_favor_info

where dt='$do_date';

insert overwrite table ${APP}.dwd_fact_coupon_use partition(dt)

select

if(new.id is null,old.id,new.id),

if(new.coupon_id is null,old.coupon_id,new.coupon_id),

if(new.user_id is null,old.user_id,new.user_id),

if(new.order_id is null,old.order_id,new.order_id),

if(new.coupon_status is null,old.coupon_status,new.coupon_status),

if(new.get_time is null,old.get_time,new.get_time),

if(new.using_time is null,old.using_time,new.using_time),

if(new.used_time is null,old.used_time,new.used_time),

date_format(if(new.get_time is null,old.get_time,new.get_time),'yyyy-MM-dd')

from

(

select

id,

coupon_id,

user_id,

order_id,

coupon_status,

get_time,

using_time,

used_time

from ${APP}.dwd_fact_coupon_use

where dt in

(

select

date_format(get_time,'yyyy-MM-dd')

from ${APP}.ods_coupon_use

where dt='$do_date'

)

)old

full outer join

(

select

id,

coupon_id,

user_id,

order_id,

coupon_status,

get_time,

using_time,

used_time

from ${APP}.ods_coupon_use

where dt='$do_date'

)new

on old.id=new.id;

insert overwrite table ${APP}.dwd_fact_order_info partition(dt)

select

if(new.id is null,old.id,new.id),

if(new.order_status is null,old.order_status,new.order_status),

if(new.user_id is null,old.user_id,new.user_id),

if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no),

if(new.tms['1001'] is null,old.create_time,new.tms['1001']),--1001 对应未支付状态

if(new.tms['1002'] is null,old.payment_time,new.tms['1002']),

if(new.tms['1003'] is null,old.cancel_time,new.tms['1003']),

if(new.tms['1004'] is null,old.finish_time,new.tms['1004']),

if(new.tms['1005'] is null,old.refund_time,new.tms['1005']),

if(new.tms['1006'] is null,old.refund_finish_time,new.tms['1006']),

if(new.province_id is null,old.province_id,new.province_id),

if(new.activity_id is null,old.activity_id,new.activity_id),

if(new.original_total_amount is

null,old.original_total_amount,new.original_total_amount),

if(new.benefit_reduce_amount is

null,old.benefit_reduce_amount,new.benefit_reduce_amount),

if(new.feight_fee is null,old.feight_fee,new.feight_fee),

if(new.final_total_amount is

null,old.final_total_amount,new.final_total_amount),

date_format(if(new.tms['1001'] is

null,old.create_time,new.tms['1001']),'yyyy-MM-dd')

from

(

select

id,

order_status,

user_id,

out_trade_no,

create_time,

payment_time,

cancel_time,

finish_time,

refund_time,

refund_finish_time,

province_id,

activity_id,

original_total_amount,

benefit_reduce_amount,

feight_fee,

final_total_amount

from ${APP}.dwd_fact_order_info

where dt

in

(

select

date_format(create_time,'yyyy-MM-dd')

from ${APP}.ods_order_info

where dt='$do_date'

)

)old

full outer join

(

select

info.id,

info.order_status,

info.user_id,

info.out_trade_no,

info.province_id,

act.activity_id,

log.tms,

info.original_total_amount,

info.benefit_reduce_amount,

info.feight_fee,

info.final_total_amount

from

(

select

order_id,

str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','

=') tms

from ${APP}.ods_order_status_log

where dt='$do_date'

group by order_id

)log

join

(

select * from ${APP}.ods_order_info where dt='$do_date'

)info

on log.order_id=info.id

left join

(

select * from ${APP}.ods_activity_order where dt='$do_date'

)act

on log.order_id=act.order_id

)new

on old.id=new.id;

insert overwrite table ${APP}.dwd_dim_user_info_his_tmp

select * from

(

select

id,

name,

birthday,

gender,

email,

user_level,

create_time,

operate_time,

'$do_date' start_date,

'9999-99-99' end_date

from ${APP}.ods_user_info where dt='$do_date'

union all

select

uh.id,

uh.name,

uh.birthday,

uh.gender,

uh.email,

uh.user_level,

uh.create_time,

uh.operate_time,

uh.start_date,

if(ui.id is not null and uh.end_date='9999-99-99', date_add(ui.dt,-1),

uh.end_date) end_date

from ${APP}.dwd_dim_user_info_his uh left join

(

select

*

from ${APP}.ods_user_info

where dt='$do_date'

) ui on uh.id=ui.id

)his

order by his.id, start_date;

insert overwrite table ${APP}.dwd_dim_user_info_his select * from

${APP}.dwd_dim_user_info_his_tmp;

"

sql2="

insert overwrite table ${APP}.dwd_dim_base_province

select

bp.id,

bp.name,

bp.area_code,

bp.iso_code,

bp.region_id,

br.region_name

from ${APP}.ods_base_province bp

join ${APP}.ods_base_region br

on bp.region_id=br.id;

"

case $1 in

"first"){

$hive -e "$sql1"

$hive -e "$sql2"

};;

"all"){

$hive -e "$sql1"

};;

esac

2)增加脚本执行权限

chmod 770 ods_to_dwd_db.sh

3)执行脚本导入数据

ods_to_dwd_db.sh all 2020-03-11

4)查看导入数据

select * from dwd_fact_order_info where dt='2020-03-11';

select * from dwd_fact_order_detail where dt='2020-03-11';

select * from dwd_fact_comment_info where dt='2020-03-11';

select * from dwd_fact_order_refund_info where dt='2020-03-11';



文章转载自逗先生大数据,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。

评论