Hive常用函数总结

数据与共享 2023-06-25

184

临时表：temp_dual

1.关系运算

1.判断相等 =；小于 <；大于 >；大于等于 >=；小于等于 <=;


2.不等 <>  
# 举例：select * from temp_dual where id<>1;


3.为空  is null；不为空 is not null


4.模糊查询 like 
# 举例：以字母d开头 select * from temp_dual where name  like 'd%';


4.Java的like：rlike，使用Java正则表达式匹配。
# 举例：以f开头，以r结尾 select * from temp_dual where 'footbar' rlike '^f.*r$';


5.regexp:效果和 rlike 相同
# 举例：判断是否全为数字 hive>select 1 from temp_dual where '324234'  regexp '^\\d+$';

2.数学运算

1.加法(+)、减法(-)、乘法(*)、除法(/)
# 举例:hive>select 1+2 from temp_dual limit 1;
# 结果：3


2.取余 %
# 举例：select (10%3.33333333333333) from temp_dual  limit 1;
# 注意：double类型大约只能精确到16位有效数字，结果可能存在浮点误差，必要时用round指定精度


3.位与 &
# hive>select 2&3 from temp_dual;
# 过程：0010 & 0011
# 结果：0010   即：2


4.位或 |
# hive>select 2|3 from temp_dual;
# 过程：0010 | 0011
# 结果：0011   即：3


5.位异或 ^
# hive>select 2^7 from temp_dual;
# 过程：0010 ^ 0111
# 结果：0101   即：5


6.位取反 ~
# hive>select ~7 from temp_dual;
# 过程：0...0111（7的二进制，高位补0）
# 按位取反：1...1000（最高位为1，是负数的补码）
# 减一：1...0111
# 再取反：0...1000  得到绝对值8，即：-8
# 偷懒算法，取相反数为：-7 再减1；即-7-1=-8。

3.逻辑运算

与     或    非      and   or   not

4.数值计算

1.round 取整 四舍五入
# round(double a ,int b)。
# 可以指定精度为b。


2.floor 向下取整， 即：地板。


2.ceil 向上取整，即：天花板。


3.ceiling向上取整。


4.取随机数rand()。
# 举例: rand(1)  范围0-1


5.自然指数函数
# exp(double a) 返回自然常数e的a次方。


6.log10
# 返回以10为底的对数
# hive>select log10(100) from temp_dual;
# 结果：2.0
# 以2为底的对数：log2 
# 以a为底b的对数：log(double a,double b)


7.a的b次方
# pow(double a,double b)


8.a的平方根
# sqrt(double a)


9.bin(bigint a) 返回string类型。a的二进制代码
# hive>select  bin(4)  from temp_dual ;
# 结果：100


10.hex(bigint a) 转成16进制
# hive>select hex(10) from temp_dual;
# 结果：A
# unhex反转十六进制：返回十六进制代码对应的字符串。


11.进制转换conv
# conv(bigint num ,int a ,int b)：将num由a进制转换为b进制
# 将10由10进制转换为16进制：hive>select conv(10,10,16) from temp_dual;
# 结果：A


12.取绝对值abs
# hive>select abs(-12) from temp_dual ;
# 结果：12


13.正取余数pmod。返回值：int ，double
# hive>select pmod(4,3) from temp_dual;
# 结果：1


14.正弦函数sin；反正弦函数asin；


15.余弦函数cos；反余弦函数acos;


16.返回其本身positive 返回值：int  double
# hive>select positive(-10) from temp_dual ;
# 结果：-10；


17.求相反数negative  
# hive>select negative(-5) from temp_dual ;
# 结果：5。

5.日期函数

1.unix时间戳即：从1970-01-01 00:00:00 UTC到指定时间的秒数


2.unix_timestamp：获取当前unix时间戳。
# hive>select unix_timestamp() from temp_dual limit 1;
# 结果：1518491333


3.将具体日期转换成时间戳:
# hive>select unix_timestamp('20180213 11:23:45','yyyyMMdd HH:mm:ss') from temp_dual limit 1;
# 注意：不指定格式时，日期字符串必须是 yyyy-MM-dd HH:mm:ss 格式，否则转换失败。


4.from_unixtime：将时间戳转换为日期格式。
# hive>select from_unixtime(1518491333,'yyyyMMdd') from temp_dual limit 1;
# 结果：20180213
# 注意：格式中 MM 表示月份，mm 表示分钟，不能混用。


5.to_date :转换为日期格式
# hive>select to_date('2018-02-13 11:23:45')  from temp_dual limit 1;
# 结果：2018-02-13


6.year ：具体年份
# hive>select year('2018-02-13 11:23:45') from temp_dual limit 1;
# 结果：2018
# 注意：年月日格式为（2018-09-09）。


6.其他 '2018-02-13 11:23:45'
# month：具体月份
# 结果：2
# day：具体某天
# 结果：13
# hour：具体某时
# 结果：11
# minute：具体某分
# 结果：23
# second：具体秒
# 结果：45


7.weekofyear：返回指定日期的周数
# hive>select weekofyear('2018-02-13 11:23:45') from temp_dual limit 1;
# 结果：7


8.datediff：返回两日期之间相差的天数
# 格式：datediff(string endDate,string beginDate)，日期格式为 yyyy-MM-dd
# hive>select datediff('2018-02-13','2018-01-02') from temp_dual limit 1;
# 结果：42


9.date_add：日期增加
# hive>select date_add('2018-02-08',10) from  temp_dual limit 1;
# 结果：2018-02-18


9.date_sub：日期减少
# hive>select date_sub('2018-02-08',10) from  temp_dual limit 1;
# 结果：2018-01-29

6.条件函数

1.判断if，类似于三元运算
# 格式：if(boolean b,result1,result2) 当b为true时返回result1;否则返回result2
# hive>select if(2=3,4,5) from temp_dual limit 1;
# 结果：5


2.非空值的查找COALESCE
# 返回参数中第一个非空的值，如果都为空返回null
# hive>select COALESCE(null,'23','34','null') from temp_dual limit 1;
# 结果：23


3.逻辑判断 case when 
# hive>select case when 1=1 then 'true' else 'false' end  from temp_dual limit 1;
# hive>select case when id=1 then 'one' when id=2 then 'two' end from temp_dual limit 1;

7.字符串函数

1.length：返回字符串的长度
2.reverse：字符串的反转，和java中StringBuffer、StringBuilder的reverse方法类似。
3.concat：字符串拼接


4.concat_ws：带分隔符拼接
# hive>select concat_ws(',','ww','ss','xx') from temp_dual  limit 1;
# 结果：ww,ss,xx


5.substr/substring：截取
# 格式：substr(string str,int start_num)：从指定位置开始截取到末尾
# 格式：substr(string str,int start_num,int str_length)：从指定位置开始，截取str_length长的字符串
# hive> select substring('aabbcc',3) from temp_dual;
# 结果：bbcc
# hive> select substr('aabbcd',-1) from temp_dual; （和ORACLE相同）
# 结果：d
# hive> select substr('aabbcc',3,2) from temp_dual;
# 结果：bb
# hive> select substring('aabbcc',-2,2) from temp_dual;
# 结果：cc


5.upper/ucase：将字符串转换为大写
6.lower/lcase：将字符串转换为小写


7.trim：去掉字符串开头和末尾的空格
# hive>select trim('   fds   fsd   ') from temp_dual limit 1;
# 结果：fds   fsd
# ltrim：去掉字符串左边的空格
# rtrim：去掉字符串右边的空格


8.regexp_replace
# 格式：regexp_replace(str1,str2,str3)
# hive>select regexp_replace('aabbcc','cc','00') from temp_dual limit 1;
# 结果：aabb00


9.regexp_extract(str, pattern, index)：用正则表达式pattern匹配字符串str，返回第index个捕获组的内容（index为0时返回整个匹配）
# hive> select regexp_extract('aa0bb', 'aa(.*?)(bb)', 1) from lxw_dual;
# 结果：0


10.parse_url
# 说明：返回URL中指定的部分。partToExtract的有效值为：HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, and USERINFO.
# hive> select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST') from temp_dual;
# 结果：facebook.com
# hive> select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY','k1') from temp_dual;
# 结果：v1


11.get_json_object
# 解析json的字符串json_string,返回path指定的内容。如果输入的json字符串无效，那么返回NULL。
# hive> select  get_json_object('{"store":
        {"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
         "bicycle":{"price":19.95,"color":"red"} },
         "email":"amy@only_for_json_udf_test.net",
         "owner":"amy"
       } ','$.owner') from temp_dual;
#  结果：amy


12.空格字符串函数space
# hive>select length(space(10)) from temp_dual limit 1;
# 结果：10


13.将字符串重复n次后输出repeat
# hive>select repeat('abc',5) from temp_dual;
# 结果：abcabcabcabcabc


14.返回字符串str第一个字符的ascii码 ascii
# hive> select ascii('abcdefg') from temp_dual;
# 结果：97


15.左补齐lpad
# hive> select lpad('aaa',10,'b') from temp_dual;
# 结果：bbbbbbbaaa


16.右补齐rpad
# hive> select rpad('aaa',10,'b') from temp_dual;
# 结果：aaabbbbbbb


17.split
# hive>select split('aatbbtcctdd','t') from temp_dual limit 1;
# 结果：["aa","bb","cc","dd"]
# 总结：形如Java中的字符串切割规则。按照t将字符串进行分割（分隔符按正则表达式处理）。


18.find_in_set
# 返回str在strlist第一次出现的位置，strlist是用逗号分割的字符串。如果没有找到该str字符串，则返回0;
# hive> select find_in_set('bb','aa,bb,cc') from temp_dual;
# 结果：2
# hive> select find_in_set('at','aa,bb,cc') from temp_dual;
# 结果：0

8.集合统计函数

1.count：统计条数


2.sum：求和


3.avg：求平均数


4.min：最小数


5.max：最大数


6.var_pop 统计结果集中col非空集合的总体方差（忽略null）


7.var_samp 统计结果集中col非空集合的样本方差（忽略null）


8.stddev_pop 该函数计算总体标准差，并返回总体方差的平方根，其返回值与VAR_POP函数的平方根相同


9.stddev_samp 该函数计算样本标准差


10.percentile 求准确的第pth个百分位数，p必须介于0和1之间，但是col字段目前只支持整数，不支持浮点数类型


11.percentile_approx(col, p [, B]) 求近似的第pth个百分位数，p必须介于0和1之间，返回类型为double，但是col字段支持浮点类型。可选参数B以内存消耗为代价控制近似精度，B越大，结果的准确度越高。默认为10,000。当col字段中的distinct值的个数小于B时，结果为准确的百分位数


12.histogram_numeric(col, b) 使用b个非均匀间隔的区间（bin）计算col的直方图信息。

9.复合类型构建

1.map类型的构建：map
# hive> Create table temp_test as select map('100','tom','200','mary') as mp from temp_dual  limit 1 ;
# hive> describe temp_test;
# 输出：mp  map<string,string>
# hive> select mp from temp_test;
# 输出：{"100":"tom","200":"mary"}


2.Struct类型构建: struct
# hive> create table temp_test as select struct('tom','mary','tim') as mp from temp_dual;
# hive> describe temp_test;
# 输出：mp struct<col1:string,col2:string,col3:string>
# hive> select mp from temp_test;
# 输出：{"col1":"tom","col2":"mary","col3":"tim"}


3.array类型构建: array
# hive> create table temp_test as select array("tom","mary","tim") as mp from temp_dual;
# hive> describe temp_test;
# 输出：mp array<string>
# hive> select mp from temp_test;
# 输出：["tom","mary","tim"]

10.复杂类型使用

1.array类型访问: A[n]
# 返回数组A中的第n个变量值。数组的起始下标为0
# hive> create table temp_test as select array("tom","mary","tim") as t from temp_dual;
# hive> select t[0],t[1],t[2] from temp_test;
# 输出：tom   mary  tim


2.map类型访问: M[key]
# hive> select mp['200'],mp['100'] from temp_test;
# mary    tom


3.struct类型访问: S.x
# hive> select mp.col1,mp.col3 from temp_test;
# 输出：tom tim

11.复杂类型长度统计函数

1.Map类型长度函数: size(Map<K,V>)
# hive> select size(map('100','tom','101','mary')) from lxw_dual;
# 输出：2


2.array类型长度函数: size(Array<T>)  
# hive> select size(array('100','101','102','103')) from lxw_dual;
# 4


3.cast：类型转换
# hive> select cast(1 as bigint) from lxw_dual;

*欢迎关注*

文章转载自数据与共享，如果涉嫌侵权，请发送邮件至：contact@modb.pro进行举报，并提供相关证据，一经查实，墨天轮将立刻删除相关内容。

Hive常用函数总结

评论