测试数据集:

create external table if not exists order_detail(
user_id string,
device_id string,
user_type string,
price double,
sales int
)
row format delimited fields terminated by '\t'
lines terminated by '\n' 
stored as textfile;

hdfs dfs -put /home/liguodong/data/data.text /temp/lgd

load data inpath '/temp/lgd/data.text'  overwrite into table order_detail;

select * from order_detail;
+----------+-------------+------------+--------+--------+--+
| user_id | device_id | user_type | price | sales |
+----------+-------------+------------+--------+--------+--+
| zhangsa | dfsadsa323 | new | 67.1   | 2      |
| lisi | 543gfd      | old | 43.32  | 1      |
| wanger | 65ghf       | new | 88.88  | 3      |
| liiu | fdsfagwe | new | 66.0   | 1      |
| qibaqiu | fds | new | 54.32  | 1      |
| wangshi | f332 | old | 77.77  | 2      |
| liwei | hfd | old | 88.44  | 3      |
| wutong | 543gdfsd    | new | 56.55  | 6      |
| lilisi | dsfgg | new | 88.88  | 5      |
| qishili | fds | new | 66.66  | 5      |
+----------+-------------+------------+--------+--------+--+

关系运算

## > < =
##注意: String 的比较要注意(常用的时间比较可以先 to_date 之后再比较) select long_time>short_time, long_time<short_time,long_time=short_time, to_date(long_time)=to_date(short_time) from ( select '2017-01-11 00:00:00' as long_time, '2017-01-11' as short_time from order_detail limit 1
)bb;

result:
true    false   false   true

## 空值判断
select 1 from order_detail where NULL is NULL limit 1; ## 非空判断
select 1 from order_detail where 1 is not NULL limit 1; ## LIKE
语法: A LIKE B
描述: 字符串A符合表达式B的正则语法,则为TRUE;否则为FALSE. B中字符”_”表示任意单个字符,而字符”%”表示任意数量的字符。 select user_id from order_detail where user_id like 'li%'; +----------+--+
| user_id | +----------+--+
| lisi     |
| liiu     |
| liwei    |
| lilisi | +----------+--+

select user_id from order_detail where user_id like 'li__'; +----------+--+
| user_id | +----------+--+
| lisi     |
| liiu | +----------+--+


## RLIKE
语法: A RLIKE B
描述: 字符串A符合JAVA正则表达式 B 的正则语法,则为 TRUE;否则为 FALSE。


select user_id from order_detail where user_id rlike '^l.*i$'; +----------+--+
| user_id | +----------+--+
| lisi     |
| liwei    |
| lilisi | +----------+--+


select device_id from order_detail where device_id rlike '^\\w+$'; +-------------+--+
| device_id | +-------------+--+
| dfsadsa323  |
| 543gfd      |
| 65ghf       |
| fdsfagwe    |
| fds         |
| f332        |
| hfd         |
| 543gdfsd    |
| dsfgg       |
| fds | +-------------+--+

select device_id from order_detail where device_id rlike '^[a-zA-Z]+$'; +------------+--+
| device_id | +------------+--+
| fdsfagwe   |
| fds        |
| hfd        |
| dsfgg      |
| fds | +------------+--+

select device_id from order_detail where device_id rlike '^[a-zA-Z]{4,}$'; +------------+--+
| device_id | +------------+--+
| fdsfagwe   |
| dsfgg | +------------+--+


## REGEXP
语法: A REGEXP B
描述: 功能与 RLIKE 相同
select device_id from order_detail where device_id REGEXP '^[a-zA-Z]{4,}$'; +------------+--+
| device_id | +------------+--+
| fdsfagwe   |
| dsfgg | +------------+--+

数学运算

## + - * / 
注意:
hive中最高精度的数据类型是 double,只精确到小数点后16位,在做除法运算的时候要特别注意

select user_id,100-price,sales-1,price*sales,price/sales, ceil(28.0/6.999999999999999999999),ceil(28.0/6.99999999999999), price%sales from order_detail;

+----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+
| user_id | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 | +----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+
| zhangsa  | 32.900000000000006  | 1    | 134.2               | 33.55               | 4    | 5    | 1.0999999999999943  |
| lisi     | 56.68               | 0    | 43.32               | 43.32               | 4    | 5    | 0.3200000000000003  |
| wanger   | 11.120000000000005  | 2    | 266.64              | 29.626666666666665  | 4    | 5    | 1.8799999999999955  |
| liiu     | 34.0                | 0    | 66.0                | 66.0                | 4    | 5    | 0.0                 |
| qibaqiu  | 45.68               | 0    | 54.32               | 54.32               | 4    | 5    | 0.3200000000000003  |
| wangshi  | 22.230000000000004  | 1    | 155.54              | 38.885              | 4    | 5    | 1.769999999999996   |
| liwei    | 11.560000000000002  | 2    | 265.32              | 29.48               | 4    | 5    | 1.4399999999999977  |
| wutong   | 43.45               | 5    | 339.29999999999995  | 9.424999999999999   | 4    | 5    | 2.549999999999997   |
| lilisi   | 11.120000000000005  | 4    | 444.4               | 17.776              | 4    | 5    | 3.8799999999999955  |
| qishili | 33.34 | 4 | 333.29999999999995 | 13.331999999999999 | 4 | 5 | 1.6599999999999966 | +----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+

## %
注意:
精度在 hive 中是个很大的问题,类似这样的操作最好通过 round 指定精度
select 8.4 % 4,round(8.4 % 4 , 2) from order_detail limit 1; +----------------------+------+--+
| _c0 | _c1 | +----------------------+------+--+
| 0.40000000000000036 | 0.4 | +----------------------+------+--+


## 位与& 位或| 位异或^ 位取反~

select 4&6, 8&4, 4|8,6|8,4^8,6^4,~6,~3 from order_detail limit 1; 

逻辑运算

## 逻辑与AND 逻辑或OR 逻辑非NOT
注意:优先级一次为NOT AND OR

## 以下两条SQL互斥
select user_id from order_detail where not ((user_id='wanger' or user_id like 'li%') and user_type='old'); +----------+--+
| user_id | +----------+--+
| zhangsa  |
| wanger   |
| liiu     |
| qibaqiu  |
| wangshi  |
| wutong   |
| lilisi   |
| qishili | +----------+--+

select user_id from order_detail where ((user_id='wanger' or user_id like 'li%') and user_type='old'); +----------+--+
| user_id | +----------+--+
| lisi     |
| liwei | +----------+--+

数值计算函数

## 取整: round
语法: round(double a)
说明: 遵循四舍五入

## 指定精度取整: round
语法: round(double a, int d)

## 向下取整: floor
说明: 返回等于或者小于该 double 变量的最大的整数

## 向上取整: ceil
说明: 返回等于或者大于该 double 变量的最小的整数

## 向上取整: ceiling
说明: 与ceil功能相同

select user_id,price,round(price),round(price,0),round(price,1),floor(price),ceil(price),ceiling(price) from order_detail;

+----------+--------+-------+-------+-------+------+------+------+--+
| user_id | price | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 | +----------+--------+-------+-------+-------+------+------+------+--+
| zhangsa  | 67.1   | 67.0  | 67.0  | 67.1  | 67   | 68   | 68   |
| lisi     | 43.32  | 43.0  | 43.0  | 43.3  | 43   | 44   | 44   |
| wanger   | 88.88  | 89.0  | 89.0  | 88.9  | 88   | 89   | 89   |
| liiu     | 66.0   | 66.0  | 66.0  | 66.0  | 66   | 66   | 66   |
| qibaqiu  | 54.32  | 54.0  | 54.0  | 54.3  | 54   | 55   | 55   |
| wangshi  | 77.77  | 78.0  | 78.0  | 77.8  | 77   | 78   | 78   |
| liwei    | 88.44  | 88.0  | 88.0  | 88.4  | 88   | 89   | 89   |
| wutong   | 56.55  | 57.0  | 57.0  | 56.6  | 56   | 57   | 57   |
| lilisi   | 88.88  | 89.0  | 89.0  | 88.9  | 88   | 89   | 89   |
| qishili | 66.66 | 67.0 | 67.0 | 66.7 | 66 | 67 | 67 | +----------+--------+-------+-------+-------+------+------+------+--+


## 取随机数: rand
说明: 返回一个 0 到 1 范围内的随机数。如果指定种子 seed(整数),则会得到一个稳定的随机数序列。

## 自然指数: exp  自然对数: ln

select user_id,sales,price,rand(),rand(sales),exp(sales),ln(price) from order_detail;

+----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+
| user_id | sales | price | _c3 | _c4 | _c5 | _c6 | +----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+
| zhangsa  | 2      | 67.1   | 0.708066911383928    | 0.7311469360199058   | 7.38905609893065    | 4.206184043977636   |
| lisi     | 1      | 43.32  | 0.509552420396184    | 0.9014476240300544   | 2.718281828459045   | 3.76861442213279    |
| wanger   | 3      | 88.88  | 0.2462642074705902   | 0.49682259343089075  | 20.085536923187668  | 4.487287145331375   |
| liiu     | 1      | 66.0   | 0.720800913818751    | 0.9858769332362016   | 2.718281828459045   | 4.189654742026425   |
| qibaqiu  | 1      | 54.32  | 0.7358273250797408   | 0.8571240443456863   | 2.718281828459045   | 3.9948924832504407  |
| wangshi  | 2      | 77.77  | 0.11092554405907218  | 0.9874208338984266   | 7.38905609893065    | 4.353755752706852   |
| liwei    | 3      | 88.44  | 0.5162574691353392   | 0.2281579303734177   | 20.085536923187668  | 4.482324355989245   |
| wutong   | 6      | 56.55  | 0.2753658209591686   | 0.07479382813444624  | 403.4287934927351   | 4.03512520256213    |
| lilisi   | 5      | 88.88  | 0.25777632824045826  | 0.7431577182910525   | 148.4131591025766   | 4.487287145331375   |
| qishili | 5 | 66.66 | 0.06419187859857822 | 0.9495832704567262 | 148.4131591025766 | 4.199605072879594 | +----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+


## 以10为底对数: log10  以2为底对数: log2 

## 对数: log
语法: log(double base, double a)
说明: 返回以 base 为底的 a 的对数

select log10(100),log2(8),log(4,256) from order_detail limit 1; +------+------+------+--+
| _c0 | _c1 | _c2 | +------+------+------+--+
| 2.0 | 3.0 | 4.0 | +------+------+------+--+

## 幂运算: pow power  开平方: sqrt

select pow(2,4), power(2,4),sqrt(16) from order_detail limit 1; +-------+-------+------+--+
| _c0 | _c1 | _c2 | +-------+-------+------+--+
| 16.0 | 16.0 | 4.0 | +-------+-------+------+--+

## 二进制: bin  十六进制: hex  反转十六进制: unhex  

## 进制转换: conv
语法: conv(BIGINT num, int from_base, int to_base)
说明: 将数值 num 从 from_base 进制转化到 to_base 进制

select bin(7),hex('19'),hex('abc'),unhex('616263'),unhex('41'),conv(17,10,16),conv(17,10,2) from order_detail limit 1; +------+-------+---------+------+------+------+--------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | +------+-------+---------+------+------+------+--------+--+
| 111 | 3139 | 616263 | abc | A | 11 | 10001 | +------+-------+---------+------+------+------+--------+--+

## 绝对值:abs  正取余:pmod  正弦:sin  反正弦:asin  余弦:cos  反余弦:acos  返回A的值:positive  返回A的相反数:negative

select abs(-13),abs(10.10),pmod(9,4),pmod(-9,4),
sin(0.8),asin(0.7173560908995228),cos(0.9), acos(0.6216099682706644),
positive(-10),negative(-10)
from order_detail limit 1; +------+-------+------+------+---------------------+------+---------------------+------+------+------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 | _c8 | _c9 | +------+-------+------+------+---------------------+------+---------------------+------+------+------+--+
| 13 | 10.1 | 1 | 3 | 0.7173560908995228 | 0.8 | 0.6216099682706644 | 0.9 | -10 | 10 | +------+-------+------+------+---------------------+------+---------------------+------+------+------+--+

日期函数

## UNIX时间戳转日期: from_unixtime ## 日期转UNIX时间戳,指定格式日期转UNIX 时间戳,获取当前UNIX时间戳: unix_timestamp 说明: 转换格式为"yyyy-MM-dd HH:mm:ss"的日期到 UNIX 时间戳。如果转化失败,则返回 0。 select 
 from_unixtime(1323308943),
 from_unixtime(1323308943,'yyyyMMdd'),
 unix_timestamp(),
 unix_timestamp('2017-12-07 16:01:03'),
 unix_timestamp('20171207 16-01-03','yyyyMMdd HH-mm-ss')
from 
order_detail limit 1; +----------------------+-----------+-------------+-------------+-------------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | +----------------------+-----------+-------------+-------------+-------------+--+
| 2011-12-08 09:49:03 | 20111208 | 1489029488 | 1512633663 | 1512633663 | +----------------------+-----------+-------------+-------------+-------------+--+

## 日期时间转日期:to_date 日期转年:year 日期转月:month 日期转天:day 日期转小时:hour 日期转分钟:minute 日期转秒:second select
to_date('2016-12-08 10:03:01'), year('2016-12-08 10:03:01'), month('2016-12-08'), day('2016-12-08 10:03:01'), hour('2016-12-08 10:03:01'), minute('2016-12-08 10:03:01'), second('2016-12-08 10:03:01') from order_detail limit 1;
+-------------+-------+------+------+------+------+------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | +-------------+-------+------+------+------+------+------+--+
| 2016-12-08 | 2016 | 12 | 8 | 10 | 3 | 1 | +-------------+-------+------+------+------+------+------+--+


## 日期转周:weekofyear  日期比较:datediff
select 
weekofyear('2016-12-08 10:03:01'),
datediff('2016-12-08','2016-11-27') 
from order_detail limit 1; +------+------+--+
| _c0 | _c1 | +------+------+--+
| 49 | 11 | +------+------+--+


## 日期增加: date_add 日期减少: date_sub
select date_add('2016-12-08',10),date_add('2016-12-08',-10),
date_sub('2016-12-08',-10),date_sub('2016-12-08',10) from order_detail limit 1; +-------------+-------------+-------------+-------------+--+
| _c0 | _c1 | _c2 | _c3 | +-------------+-------------+-------------+-------------+--+
| 2016-12-18 | 2016-11-28 | 2016-12-18 | 2016-11-28 | +-------------+-------------+-------------+-------------+--+

select 
date_add('20161208',10), from_unixtime(unix_timestamp(date_add('2016-12-08',10)),'yyyyMMdd'),
from_unixtime(unix_timestamp(date_add('2016-12-08',10),'yyyy-MM-dd'),'yyyyMMdd') from order_detail limit 1;

+-------+-------+-----------+--+
| _c0 | _c1 | _c2 | +-------+-------+-----------+--+
| NULL | NULL | 20161218 | +-------+-------+-----------+--+

条件函数

## IF CASE COALESCE
说明: COALESCE返回参数中的第一个非空值;如果所有值都为 NULL,那么返回 NULL

select user_id,device_id,user_type,sales,
if(user_type='new',user_id,'***'), 
COALESCE(null,user_id,device_id,user_type),
COALESCE(null,null,device_id,user_type),
case user_type when 'new' then 'new_user' when 'old' then 'old_user' 
else 'others' end,
case when user_type='new' and sales>=5 then 'gold_user' when user_type='old' and sales<3 then 'bronze_user' 
else 'silver_user' end
from order_detail;


+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+
| user_id | device_id | user_type | sales |   _c4    |   _c5    |     _c6     |    _c7    |     _c8      |
+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+
| zhangsa | dfsadsa323 | new | 2      | zhangsa | zhangsa | dfsadsa323 | new_user | silver_user |
| lisi | 543gfd      | old | 1      | ***      | lisi | 543gfd      | old_user | bronze_user |
| wanger | 65ghf       | new | 3      | wanger | wanger | 65ghf       | new_user | silver_user |
| liiu | fdsfagwe | new | 1      | liiu | liiu | fdsfagwe | new_user | silver_user |
| qibaqiu | fds | new | 1      | qibaqiu | qibaqiu | fds | new_user | silver_user |
| wangshi | f332 | old | 2      | ***      | wangshi | f332 | old_user | bronze_user |
| liwei | hfd | old | 3      | ***      | liwei | hfd | old_user | silver_user |
| wutong | 543gdfsd    | new | 6      | wutong | wutong | 543gdfsd    | new_user | gold_user |
| lilisi | dsfgg | new | 5      | lilisi | lilisi | dsfgg | new_user | gold_user |
| qishili | fds | new | 5      | qishili | qishili | fds | new_user | gold_user |
+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+

字符串函数


## 字符串长度:length 字符串反转:reverse 字符串连接:concat 带分隔符字符串连接:concat_ws select 
user_id,device_id,user_type,length(user_id),reverse(user_id), concat(user_id,device_id,user_type),concat_ws('_',user_id,device_id,user_type) from order_detail;


+----------+-------------+------------+------+----------+-----------------------+-------------------------+--+
| user_id | device_id | user_type | _c3 | _c4 | _c5 | _c6 | +----------+-------------+------------+------+----------+-----------------------+-------------------------+--+
| zhangsa  | dfsadsa323  | new        | 7    | asgnahz  | zhangsadfsadsa323new  | zhangsa_dfsadsa323_new  |
| lisi     | 543gfd      | old        | 4    | isil     | lisi543gfdold         | lisi_543gfd_old         |
| wanger   | 65ghf       | new        | 6    | regnaw   | wanger65ghfnew        | wanger_65ghf_new        |
| liiu     | fdsfagwe    | new        | 4    | uiil     | liiufdsfagwenew       | liiu_fdsfagwe_new       |
| qibaqiu  | fds         | new        | 7    | uiqabiq  | qibaqiufdsnew         | qibaqiu_fds_new         |
| wangshi  | f332        | old        | 7    | ihsgnaw  | wangshif332old        | wangshi_f332_old        |
| liwei    | hfd         | old        | 5    | iewil    | liweihfdold           | liwei_hfd_old           |
| wutong   | 543gdfsd    | new        | 6    | gnotuw   | wutong543gdfsdnew     | wutong_543gdfsd_new     |
| lilisi   | dsfgg       | new        | 6    | isilil   | lilisidsfggnew        | lilisi_dsfgg_new        |
| qishili  | fds         | new        | 7    | ilihsiq  | qishilifdsnew         | qishili_fds_new         |


## 字符串截取函数: substr,substring
语法: substr(string A, int start),substring(string A, int start)
说明:返回字符串 A 从 start 位置到结尾的字符串

语法: substr(string A, int start, int len),substring(string A, int start, int len)
说明:返回字符串A从start位置开始,长度为len的字符串

select 
user_id,substr(user_id,3),substr(user_id,-2), substring(user_id,1,2),substr(user_id,-2,2) from order_detail;

+----------+--------+------+------+------+--+
| user_id | _c1 | _c2 | _c3 | _c4 | +----------+--------+------+------+------+--+
| zhangsa  | angsa  | sa   | zh   | sa   |
| lisi     | si     | si   | li   | si   |
| wanger   | nger   | er   | wa   | er   |
| liiu     | iu     | iu   | li   | iu   |
| qibaqiu  | baqiu  | iu   | qi   | iu   |
| wangshi  | ngshi  | hi   | wa   | hi   |
| liwei    | wei    | ei   | li   | ei   |
| wutong   | tong   | ng   | wu   | ng   |
| lilisi   | lisi   | si   | li   | si   |
| qishili | shili | li | qi | li | +----------+--------+------+------+------+--+


## 字符串转大写:upper,ucase  字符串转小写:lower,lcase

select user_id,upper(user_id),ucase(user_id), lower(upper(user_id)),lcase(ucase(user_id)),lower(ucase(user_id))
from order_detail; +----------+----------+----------+----------+----------+----------+--+
| user_id | _c1 | _c2 | _c3 | _c4 | _c5 | +----------+----------+----------+----------+----------+----------+--+
| zhangsa  | ZHANGSA  | ZHANGSA  | zhangsa  | zhangsa  | zhangsa  |
| lisi     | LISI     | LISI     | lisi     | lisi     | lisi     |
| wanger   | WANGER   | WANGER   | wanger   | wanger   | wanger   |
| liiu     | LIIU     | LIIU     | liiu     | liiu     | liiu     |
| qibaqiu  | QIBAQIU  | QIBAQIU  | qibaqiu  | qibaqiu  | qibaqiu  |
| wangshi  | WANGSHI  | WANGSHI  | wangshi  | wangshi  | wangshi  |
| liwei    | LIWEI    | LIWEI    | liwei    | liwei    | liwei    |
| wutong   | WUTONG   | WUTONG   | wutong   | wutong   | wutong   |
| lilisi   | LILISI   | LILISI   | lilisi   | lilisi   | lilisi   |
| qishili | QISHILI | QISHILI | qishili | qishili | qishili | +----------+----------+----------+----------+----------+----------+--+

## 去两边的空格:trim  左边去空格:ltrim  右边去空格:rtrim

select trim(' abc '),ltrim(' abc'),rtrim('abc ') from order_detail limit 1; +------+------+------+--+
| _c0 | _c1 | _c2 | +------+------+------+--+
| abc | abc | abc | +------+------+------+--+


## 正则表达式替换: regexp_replace 说明:将字符串 A 中的符合 java 正则表达式 B 的部分替换为 C。注意,在有些情况下要使用转义字符, 类似 oracle 中的 regexp_replace 函数。

## 正则表达式解析: regexp_extract 将字符串 subject 按照 pattern 正则表达式的规则拆分,返回 index 指定的字符。 注意,在有些情况下要使用转义字符,如等号要用双竖线转义,这是java正则表达式的规则。 
select user_id,regexp_replace(user_id, 'li|ng', '**'), regexp_extract(user_id,'li(.*?)(si)',1), regexp_extract(user_id,'li(.*?)(si)',2), regexp_extract(user_id,'li(.*?)(si)',0) from order_detail;
+----------+----------+------+------+---------+--+
| user_id | _c1 | _c2 | _c3 | _c4 | +----------+----------+------+------+---------+--+
| zhangsa  | zha**sa | | | | | lisi | **si | | si | lisi | | wanger | wa**er | | | | | liiu | **iu | | | | | qibaqiu | qibaqiu | | | | | wangshi | wa**shi | | | | | liwei | **wei | | | | | wutong | wuto**   |      |      |         |
| lilisi   | ****si | li | si | lilisi | | qishili | qishi**  |      |      |         |
+----------+----------+------+------+---------+--+


select 
regexp_extract('http://facebook.com/path1/p.php?k1=v1543643&k2=v3245#Ref1', '.*?k1\\=([^&]+)', 1), regexp_extract('http://facebook.com/path1/p.php?k1=v1543643&k2=v3245#Ref1', '.*?k2\\=([^#]+)', 1)
from order_detail limit 1; +-----------+--------+--+
| _c0 | _c1 | +-----------+--------+--+
| v1543643 | v3245 | +-----------+--------+--+


## URL解析:parse_url 语法: parse_url(string urlString, string partToExtract [, string keyToExtract])
说明:返回 URL 中指定的部分。
partToExtract 的有效值为: HOST, PATH, QUERY, REF,PROTOCOL, AUTHORITY, FILE, and USERINFO.
举例:

select 
parse_url('https://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'),
parse_url('https://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY','k2'),
parse_url('https://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL'),
parse_url('https://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE')
from order_detail limit 1; +---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 | +---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+
| facebook.com | /path1/p.php | k1=v1&k2=v2 | v2 | Ref1 | http | facebook.com | /path1/p.php?k1=v1&k2=v2 | +---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+


## json解析:
get_json_object
语法: get_json_object(string json_string, string path) 说明:解析 json 的字符串 json_string,返回 path 指定的内容。如果输入的 json 字符串无效,那么返回 NULL。

select 
get_json_object(
'{"store": {"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], "bicycle":{"price":19.95,"color":"red"} }, "email":"amy@only_for_json_udf_test.net", "owner":"amy" }',
'$.owner'),
get_json_object(
'{"store": {"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], "bicycle":{"price":19.95,"color":"red"} }, "email":"amy@only_for_json_udf_test.net", "owner":"amy" }',
'$.store.fruit[0].type')
from order_detail limit 1; +------+--------+--+
| _c0 | _c1 | +------+--------+--+
| amy | apple | +------+--------+--+


## json_tuple 语法: json_tuple(string jsonStr,string k1,string k2, ...)
参数为一组键k1,k2……和JSON字符串,返回值的元组。该方法比 get_json_object 高效,因为可以在一次调用中输入多个键.

select a.user_id, b.* from order_detail a 
lateral view 
json_tuple('{"store": {"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], "bicycle":{"price":19.95,"color":"red"} }, "email":"amy@only_for_json_udf_test.net", "owner":"amy" }', 'email', 'owner') b as email, owner limit 1; +----------+---------------------------------+--------+--+
| user_id | email | owner | +----------+---------------------------------+--------+--+
| zhangsa | amy@only_for_json_udf_test.net | amy | +----------+---------------------------------+--------+--+

## parse_url_tuple

SELECT b.*
from 
(
select 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' as urlstr
from
order_detail limit 1 )a LATERAL VIEW parse_url_tuple(a.urlstr, 'HOST', 'PATH', 'QUERY', 'QUERY:k1') b as host, path, query, query_k1 LIMIT 1;

+---------------+---------------+--------------+-----------+--+
| host | path | query | query_k1 | +---------------+---------------+--------------+-----------+--+
| facebook.com | /path1/p.php | k1=v1&k2=v2 | v1 | +---------------+---------------+--------------+-----------+--+


空格字符串:space   重复字符串:repeat 首字符ascii:ascii 

select space(10), length(space(10)), repeat('abc',5), ascii('abcde')
from order_detail limit 1; +-------------+------+------------------+------+--+
| _c0 | _c1 | _c2 | _c3 | +-------------+------+------------------+------+--+
| | 10 | abcabcabcabcabc | 97 | +-------------+------+------------------+------+--+


左补足函数:lpad  右补足函数:rpad 
语法: lpad(string str, int len, string pad)
说明:lpad将 str 进行用 pad 进行左补足到 len 位, rpad将 str 进行用 pad 进行右补足到 len 位
注意:与 GP,ORACLE 不同; pad不能默认

select lpad('abc',10,'td'),rpad('abc',10,'td') 
from order_detail limit 1; +-------------+-------------+--+
| _c0 | _c1 | +-------------+-------------+--+
| tdtdtdtabc | abctdtdtdt | +-------------+-------------+--+

分割字符串函数: split

集合查找函数: find_in_set
语法: find_in_set(string str, string strList)
说明: 返回 str 在 strlist 第一次出现的位置, strlist 是用逗号分割的字符串。如果没有找该 str 字符,则返回 0


select 
split('abtcdtef','t'),
find_in_set('ab','ef,ab,de'),
find_in_set('at','ef,ab,de') 
from order_detail limit 1; +-------------------+------+------+--+ | _c0        | _c1 | _c2  |
+-------------------+------+------+--+
| ["ab","cd","ef"]  | 2    | 0    |
+-------------------+------+------+--+

## string转map:str_to_map 
语法:str_to_map(text[, delimiter1, delimiter2])
说明:使用两个分隔符将文本拆分为键值对。 Delimiter1将文本分成K-V对,Delimiter2分割每个K-V对。
对于delimiter1默认分隔符是',',对于delimiter2默认分隔符是':'。
select str_to_map('aaa:11&bbb:22', '&', ':')
from order_detail limit 1; +--------------------------+--+
| _c0 | +--------------------------+--+
| {"bbb":"22","aaa":"11"} | +--------------------------+--+

select str_to_map('aaa:11&bbb:22', '&', ':')['aaa']
from order_detail limit 1; +------+--+
| _c0 | +------+--+
| 11 | +------+--+

select str_to_map('aaa:11,bbb:22')
from person limit 1;

{"bbb":"22","aaa":"11"}

集合统计函数

## 个数统计:count  总和统计:sum
语法: count(*), count(expr), count(DISTINCT expr[, expr_.]) 说明: count(*)统计检索出的行的个数,包括 NULL 值的行; 
count(expr)返回指定字段的非空值的个数; 
count(DISTINCT expr[, expr_.])返回指定字段的不同的非空值的个数 语法: sum(col), sum(DISTINCT col)
说明: 
sum(col)统计结果集中 col 的相加的结果; sum(DISTINCT col)统计结果中 col 不同值

select 
count(*),count(user_type),count(distinct user_type), sum(sales),sum(distinct sales) from order_detail; +------+------+------+------+------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | +------+------+------+------+------+--+
| 10 | 10 | 2 | 29 | 17 | +------+------+------+------+------+--+

## 平均值统计:avg  最小值统计:min  最大值统计:max

select 
avg(sales),avg(distinct sales),min(sales),max(distinct sales)
from order_detail; +------+------+------+------+--+
| _c0 | _c1 | _c2 | _c3 | +------+------+------+------+--+
| 2.9 | 3.4 | 1 | 6 | +------+------+------+------+--+


## 标准差:stddev_samp, stddev, stddev_pop
stddev_pop <==> stddev ## 方差:var_samp, var_pop

当我们需要真实的标准差/方差的时候最好是使用: stddev stddev_pop var_pop
而只是需要得到少量数据的标准差/方差的近似值可以选用: stddev_samp var_samp

select 
var_pop(sales),var_samp(sales),
stddev_pop(sales),stddev(sales),stddev_samp(sales), 
pow(stddev_pop(sales),2),pow(stddev_samp(sales),2)
from order_detail; +-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | +-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+
| 3.09 | 3.433333333333333 | 1.7578395831246945 | 1.7578395831246945 | 1.8529256146249728 | 3.0899999999999994 | 3.4333333333333336 | +-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+


## 百分位数: percentile 近似百分位数: percentile_approx 直方图: histogram_numeric

语法: percentile_approx(DOUBLE col, p [, B]) 返回值: double 说明: 求近似的第 pth 个百分位数, p 必须介于 0 和 1 之间,返回类型为 double,但是col 字段支持浮点类型。 参数 B 控制内存消耗的近似精度, B越大,结果的准确度越高。默认为 10,000。 当 col 字段中的 distinct 值的个数小于 B 时,结果为准确的百分位数 
select 
percentile(sales,0.2),
percentile_approx(sales,0.5,10000), histogram_numeric(sales,5)
from order_detail; +------+------+----------------------------------------------------------------------------------------------+--+
| _c0 | _c1 | _c2 | +------+------+----------------------------------------------------------------------------------------------+--+
| 1.0  | 2.0  | [{"x":1.0,"y":3.0},{"x":2.0,"y":2.0},{"x":3.0,"y":2.0},{"x":5.0,"y":2.0},{"x":6.0,"y":1.0}]  |
+------+------+----------------------------------------------------------------------------------------------+--+

select 
inline(histogram_numeric(sales,10)) from order_detail;

+------+------+--+
| x | y | +------+------+--+
| 1.0  | 3.0  |
| 2.0  | 2.0  |
| 3.0  | 2.0  |
| 5.0  | 2.0  |
| 6.0 | 1.0 | +------+------+--+

### 后面可以输入多个百分位数,返回类型也为 array<double>,其中为对应的百分位数。
select 
percentile(sales,array(0.2,0.4,0.6)),
percentile_approx(sales,array(0.2,0.4,0.6),10000) from order_detail;

+----------------+----------------+--+
| _c0 | _c1 | +----------------+----------------+--+
| [1.0,2.0,3.0]  | [1.0,1.5,2.5]  |
+----------------+----------------+--+

复杂类型访问操作及统计函数

测试数据集:
tony    1338    hello,woddd     1,2     a1,a2,a3        k1:1.0,k2:2.0,k3:3.0    s1,s2,s3,4
mark    5453    kke,ladyg       2,3     a4,a5,a6        k4:4.0,k5:5.0,k2:6.0    s4,s5,s6,6
ivyfd   4323    aa,thq,dsx      3,6     a7,a8,a9        k7:7.0,k8:8.0,k2:9.0    s7,s8,s9,9
drop table employees;
create external table if not exists employees(
name string,
salary string,
happy_word string, happy_num array<int>,
subordinates array<string>,
deductions map<string,float>,
address struct<street:string,city:string,state:string,zip:int>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
stored as textfile;

hdfs dfs -put /home/liguodong/data/muldata.txt /temp/lgd

load data inpath '/temp/lgd/muldata.txt' overwrite into table employees;

select * from employees;

Getting log thread is interrupted, since query is done! +--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+
| name | salary | happy_word | happy_num | subordinates | deductions | address | +--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+
| tony   | 1338    | hello,woddd  | [1,2]      | ["a1","a2","a3"]  | {"k1":1.0,"k2":2.0,"k3":3.0}  | {"street":"s1","city":"s2","state":"s3","zip":4}  |
| mark   | 5453    | kke,ladyg    | [2,3]      | ["a4","a5","a6"]  | {"k4":4.0,"k5":5.0,"k2":6.0}  | {"street":"s4","city":"s5","state":"s6","zip":6}  |
| ivyfd  | 4323    | aa,thq,dsx   | [3,6]      | ["a7","a8","a9"]  | {"k7":7.0,"k8":8.0,"k2":9.0}  | {"street":"s7","city":"s8","state":"s9","zip":9}  |
+--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+

## 访问数组 Map 结构体

select 
name,salary,
subordinates[1],deductions['k2'],deductions['k3'],address.city 
from employees; +--------+---------+------+------+-------+-------+--+
| name | salary | _c2 | _c3 | _c4 | city | +--------+---------+------+------+-------+-------+--+
| tony   | 1338    | a2   | 2.0  | 3.0   | s2    |
| mark   | 5453    | a5   | 6.0  | NULL  | s5    |
| ivyfd | 4323 | a8 | 9.0 | NULL | s8 | +--------+---------+------+------+-------+-------+--+

## Map类型长度  Array类型长度

select size(deductions),size(subordinates) from employees limit 1;

+------+------+--+
| _c0 | _c1 | +------+------+--+
| 3 | 3 | +------+------+--+

## 类型转换: cast

select cast(salary as int),cast(deductions['k2'] as bigint) from employees;

+---------+------+--+
| salary | _c1 | +---------+------+--+
| 1338    | 2    |
| 5453    | 6    |
| 4323 | 9 | +---------+------+--+

### LATERAL VIEW 行转列
SELECT 
name, ad_subordinate FROM employees LATERAL VIEW explode(subordinates) addTable AS ad_subordinate;
+--------+-----------------+--+
| name | ad_subordinate | +--------+-----------------+--+
| tony   | a1              |
| tony   | a2              |
| tony   | a3              |
| mark   | a4              |
| mark   | a5              |
| mark   | a6              |
| ivyfd  | a7              |
| ivyfd  | a8              |
| ivyfd | a9 | +--------+-----------------+--+

SELECT 
name, count(1)
FROM employees 
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate group by name; +--------+------+--+ | name | _c1  |
+--------+------+--+
| ivyfd  | 3    |
| mark   | 3    |
| tony | 3 | +--------+------+--+

SELECT ad_subordinate, ad_num 
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate LATERAL VIEW explode(happy_num) addTable2 AS ad_num; +-----------------+---------+--+ | ad_subordinate  | ad_num | +-----------------+---------+--+ | a1 | 1 | | a1 | 2 | | a2 | 1 | | a2 | 2 | | a3 | 1 | | a3 | 2 | | a4 | 2 | | a4 | 3 | | a5 | 2 | | a5 | 3 | | a6 | 2 | | a6 | 3 | | a7 | 3 | | a7 | 6 | | a8 | 3 | | a8 | 6 | | a9 | 3 | | a9 | 6 | +-----------------+---------+--+ ### 多个LATERAL VIEW
SELECT 
name, count(1) 
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate LATERAL VIEW explode(happy_num) addTable2 AS ad_num group by name; +--------+------+--+
| name | _c1 | +--------+------+--+
| ivyfd  | 6    |
| mark   | 6    |
| tony | 6 | +--------+------+--+

### 不满足条件产生空行
SELECT AA.name, BB.* FROM employees AA
LATERAL VIEW 
explode(array()) BB AS a limit 10; +-------+----+--+
| name | a | +-------+----+--+
+-------+----+--+

### OUTER 避免永远不产生结果,无满足条件的行,在该列会产生NULL值。
SELECT AA.name, BB.* FROM employees AA
LATERAL VIEW 
OUTER explode(array()) BB AS a limit 10; +--------+-------+--+
| name | a | +--------+-------+--+
| tony   | NULL  |
| mark   | NULL  |
| ivyfd | NULL | +--------+-------+--+

### 字符串切分成多列
SELECT 
name, word
FROM employees
LATERAL VIEW explode(split(happy_word,',')) addTable AS word; +--------+--------+--+
| name | word | +--------+--------+--+
| tony   | hello  |
| tony   | woddd  |
| mark   | kke    |
| mark   | ladyg  |
| ivyfd  | aa     |
| ivyfd  | thq    |
| ivyfd | dsx | +--------+--------+--+

原文链接:https://blog.csdn.net/scgaliguodong123_/article/details/60881166

本站声明:网站内容来源于网络,如有侵权,请联系我们,我们将及时处理。

本博客所有文章如无特别注明均为原创。
复制或转载请以超链接形式注明转自起风了,原文地址《Hive常用函数大全(一)(关系/数学/逻辑/数值/日期/条件/字符串/集合统计/复杂类型)
   

还没有人抢沙发呢~