hive的基本操作

106 阅读 0 评论 70 点赞

我是靠谱客的博主动听抽屉，最近开发中收集的这篇文章主要介绍hive的基本操作，觉得挺不错的，现在分享给大家，希望可以做个参考。

概述

创建数据库

#普通创建
create database dbname;
#没用dbname进行创建有着不创建
create database if not exists dbname;
create database if not exists dbname comment '备注信息';

查看库的信息

#查看数据库的信息
desc database dbname;
#查看数据库的信息详细
desc database extended dbname;
#显示创建语法
show create database dbname;

切换数据库
use dbname；

数据库删除

#删除空的数据库
drop if database dbname;
#删除数据库(强删)
drop database dbname cascade;

表的创建

#普通创建
create table stu(id int,name string,sex string);
#备注信息
create table stu(id int,name string comment '备注信息',sex string);
#指定分隔符
创表后默认的列分隔符为^A,行分隔符为n
create table if not exists stu(id int,name string,sex string)
row fromat delimited
fields terminated by 't'             #指定列分割符
lines terminated by 'n';             #指定行分割符

表的查看

#查看表
show tables;
#查看表结构
desc stu;
#查看详细表结构
desc extended stu;
#查看建表语句
show create table stu;

表的修改语法

#更改表名
alter table oldtablename rename to newtablename;
#更改列名
alter table tablename chang column oldColumnName newColumnName;
#增加字段
alter table tablename add columns(sex string,~);
#删除字段
alter table tablename replace columns(sex string,~);

删除表

drop table if exists tableName;

上传数据

#使用hdfs dfs上传到指定的路径下
hdfs dfs -put file.txt /user/hive/warehouse/work.db
#使用load命令
load data local inpath '路径' overwrite into table tablename;
#基于查询插入&插入多表
insert into tablename select * from tablename1
----------------------------------------------
from tablename
insert into tablename1 select * where 条件
insert into tablename2 select * where 条件
----------------------------------------------
#克隆命令
create table tablename as select --from--where;
#克隆表结构
create table tablename like tablename1;

数据导出

#hive数据导出到目录下
insert overwrite local directory '本地目录' select--from--where
insert overwrite directory 'hdfs目录' select--from--where
#hive数据导出到目录下指定分隔符
insert overwrite local directory '本地目录' row delimited fieds terminated by '分割符'
select--from--where
[root@moyue01 ~]# hive -e e 'hql' >> '文件路径'

内部表

#一般用于测试
${hive.metastore.warehouse.dir}数据一般在指定目录下
hive删除表数据消失

外部表

#extended关键字
create extended table tablename(id int,~)location '可以指定数据位置'
在hive中删除只会删除MySQL元数据hdfs上的原始数据不会删除

内部表外部表转换

#内部表转外部表
alter table tablename set tblproperties('EXTERNAL'='TRUE');
#外部表转内部表
alter table tablename set tblproperties('EXTERNAL'='FALSE');

数据导入

#上传到hdfs上
hdfs dfs -put file.txt /user/hive/warehouse/work.db
#hive命令行使用
load data [local#加上local表示在linux本地获取，不加默认在hdfs上获取] inpath '地址' into table 表名;
#基于其他表的查询结果
insert into tablename select * from tablename1
#克隆
create table tablename as select ~ from~
create table tablename like tablename1 location'地址'
#只克隆表结构
create table tablename like tablename1

导出

#上传到linux本地
insert overwrite local directory '本地目录' select ~ from tablename1;
#上传到hdfs
insert overwrite directory '本地目录' select ~ from tablename1;

导出指定分割符

#linux本地
insert overwrite local directory '目录' row format delimited fields terminated by '分割符'
#hdfs
insert overwrite directory '目录' row format delimited fields terminated by '分割符'

exists用法

#exists的用法
select * from tablename a where exists(select 1 from tablename tablename b where a.xx=b.xx)

left semi join 和exists差不多左链接
select * from tablename a left semi join tablename b on a.xx = b.xx;

hive的日志位置
#/tmp/root/
日志名为hive.log

在linux命令行上运行hql

hive -e 'sql语句';#数据库.表名
hive --database 数据库 -e 'sql语句';#指定数据库

hive类型

#整数int bigint smallint tinyint
#浮点double float decimal
#布尔boolean
#字符串string varchar(length) char(length)
#日期date timestamp

Array类型

create table if not exists tablename(
uname string,
score array<int>
)
row format delimited fields terminated by 't' collection items terminated by ',';
load data local inpath '路径' into table 表名;
#score[下标]获取对应数据
#size(score)获取个数

explode应用

#展开函数explode应用
desc scores;
OK
name                    string                                      
score                   array<int>
select name,t1.*from scores lateral view explode(score) t1 as scores;
-------------------------------------------------------------------------------------------
。。虚拟表别名.from 原表名 lateral view explode (要展开的表名) 虚拟表别名 as 展开字段的列别名
-------------------------------------------------------------------------------------------

拼接字符串

concat(元素A,元素B,元素C)
concat(元素A,'分割符',元素B)
#一次性指定分割符
concat_ws(分割符,元素A,元素B,元素C)
group_concat(链接字段 order by 排序字段 desc separator 分隔符)

搜集函数的应用

collect_list        #有序可重复
collect_set         #无序不可重复
create table score_1 as select name,t1.*from scores lateral view explode(score) t1 as scores;
-------------------------------------------------------------------------------------------
desc score_1;
name                    string                                      
scores                  int
select name,collect_list(scores) from score_1 group by name;
lisi    [67,75,83,94]
wangwu  [23,12]
zhangsan        [78,89,92,96]
-------------------------------------------------------------------------------------------
select name,collect_set(scores) from score_1 group by name;
lisi    [67,75,83,94]
wangwu  [23,12]
zhangsan        [78,89,92,96]

map类型

create table tablename(
colName map<T,T>
)
select * from scores_map;
OK
zhangsan        {"chinese":90,"math":87,"english":63,"nature":76}
lisi    {"chinese":60,"math":30,"english":78,"nature":0}
wangwu  {"chinese":89,"math":25}
-------------------------------------------------------------------------------------------
create table scores_map(name string,scores map<string,int>)
row format delimited fields terminated by 't'collection items terminated by ',' map keys terminated by ':' lines terminated by 'n';
-------------------------------------------------------------------------------------------
查询每个人的数学成绩
select name , scores["math"] from scores_map;
OK
zhangsan        87
lisi    30
wangwu  25
展开函数
select explode(scores) from scores_map;
OK
chinese 90
math    87
english 63
nature  76
chinese 60
math    30
english 78
nature  0
chinese 89
math    25
-------------------------------------------------------------------------------------------
select name,subject,score  from scores_map lateral view explode(scores) t1 as subject,score;
zhangsan        chinese 90
zhangsan        math    87
zhangsan        english 63
zhangsan        nature  76
lisi    chinese 60
lisi    math    30
lisi    english 78
lisi    nature  0
wangwu  chinese 89
wangwu  math    25

struct类型

create table tablename(
filedsName struct<subName1:Type,subName:Type,.....>
)
#创建表
create table struct_1(name string,scores struct<math:int,chinese:int,english:int,nature:int>)
#指定分隔符
row format delimited fields terminated by 't' collection items terminated by',';
#插入数据
load data local inpath'./data/struct1.txt'into table struct_1;
#表预览
select * from struct_1;
OK
zhangsan        {"math":90,"chinese":87,"english":63,"nature":76}
lisi    {"math":60,"chinese":30,"english":78,"nature":0}
wangwu  {"math":89,"chinese":25,"english":81,"nature":9}
#简单查询
select name,scores.math,scores.english from struct_1;
OK
zhangsan        90      63
lisi    60      78
wangwu  89      81
通过key查找values

内置函数

内置函数
#查看函数
show functions         #查看hive中的函数
desc functions 函数    #查看函数用法
-------------------------------------------------------------------------------------------
#时间函数
1.current_date()
2.current_timestamp()
3.unix_timestamp()
#日期转时间戳函数
hive (default)> select unix_timestamp("1970-1-1 00:00:00","yyyy-MM-dd HH:mm:ss");
OK
-28800
东八区凌晨返回的是本初子午线，这时候正是1969-12-31下午4点整
#时间戳日期函数
-------------------------------------------------------------------------------------------
hive (default)> select from_unixtime(28800,"yyyy-MM-dd HH:mm:ss");
OK
1970-01-01 16:00:00
本初子午线8点为东八区16点
-------------------------------------------------------------------------------------------
#datediff()
datediff()    #计算时间差函数
select datediff(current_date(),'2020-9-6');
#current_date()当天时间    datediff相减    当天时间-日期得到天数
#months_between()
select months_between(current_date(),'2020-9-6');
#返回相差的月数 会有小数
-------------------------------------------------------------------------------------------
#日期时间分量函数
year()    #select year('2012-11-3 09:12:03') 查看年
month()   #select month('2012-11-3 09:12:03') 查看月
day()     #select day('2012-11-3 09:12:03') 查看天
hour()    #select day('2012-11-3 09:12:03') 查看小时
minute()  #select day('2012-11-3 09:12:03') 查看分钟
second()  #select day('2012-11-3 09:12:03') 查看秒
#日期定位函数
last_day()#返回指定日期最后一天的日期
hive (default)> select last_day(current_date());
OK
2021-12-31
next_day()#返回指定日期下一周的星期一
hive (default)> select next_day(current_date(),'mon');
OK
2021-12-20
-------------------------------------------------------------------------------------------
日期加减函数
date_add()
select date_add(current_date(),-1)
#获取后天的日期
date_sub()
select date_sub(current_date(),1)
#获取前n天的日期
add_months()
select add_months(current_date(),1)
#获取后n月的日期
-------------------------------------------------------------------------------------------
字符串转日期
to_date(express)
select to_date("2014-8-5");#默认格式yyyy-MM-dd
日期转字符串
date_format(current_date(),'yyyy-MM-dd');
日期截断函数
select trunc(current_date(),'YYYY');
阶段到年，表示日期分量回到最小值

字符串函数

lower/lcase(转小写)
select lower('ABC');
--------------------
upper/ucase(转大写)
select upper('abc');
--------------------
length(查看字符串长度,字符串)
select length('abc');
--------------------
concat(字符拼接)
select concat("A","B");
--------------------
concat_Ws(指定分隔符)
select concat_ws('-','a','b','c');
--------------------
substr(求字符串)
select substr('abcde',3);
从第几个开始读取
--------------------
split(str,regex)切分字符串，返回数组
select split("a-b-c-d-e-f","-");
--------------------
select substr("helloworld",4,2);
从第4个开始截取2个

常用的数学函数

max()    #最大
min()    #最小
avg()    #平均
sum()    #和
count()  #行数
rount()  #四舍五入
ceil()   #返回是大于参数的最小整数
floor()  #返回是小于参数的最大整数
rand()   #返回的是[0,1)之间的任意小数

其他函数

nvl(value,default value)
#value如果为空返回default 否则返回value
if(p1,p2,p3)
#p1为true返回p2,p1为false返回p3
isnull()判断是否为空
isnotnull()判断是否不为空
coalesce(value1,value2)返回第一个不为空的

高级函数-窗口函数over

select a.*,b.num  from t_order a join (select count(*) num from t_order) b;
t_order和count(*)匹配
select *,count(1) over() from t_order;
over()以整张表为一个分区

distribute by + sort by 组合
distribute by 指定字段分区    sort by 指定字段升降序
over(distribute by 字段 sort by 字段 [desc|asc])

partition by + order by 组合
partition by 分组字段 order by 升降字段 [desc|asc]

window子句

current row:当前行
preceding  :向前
following  :向后
unbounded preceding :从起点
unbounded following :从终点
----------------------------------------------------------------------------------------
sum(cost) over(partition by name order by orderdate rows between unbounded preceding and current row)as 列别名
--当前行和第一行所有数据做聚合
LAG()：取之前行的值
LEAD()：取之后的行的值
FIRST_VALUE()：取第一行的值
LAST_VALUE()：取第一行的值
lag(score,1)over(partition by class order by score desc )

排名函数

#必须配合over函数
row_number()
#不重复不间断
分数   名次
100     1
99      2
99      3
98      4
dense_rank()
#重复间断
分数   名次
100     1
99      2
99      2
98      4
rank()
#重复不间断
分数   名次
100     1
99      2
99      2
98      3

其他函数

ntile函数

select * , ntile(3) over(sort by 升降字段 desc) from 表名;
将行数分为三份

lag()和lead()函数

select lag(字段名,n行[,value]) from 表名
取前n行如果没有则取value
select lead(字段名,n行[,value]) from 表名
取后n行如果没有则取value

first_value和last_value

a    最后一行
b
c
d
e
f
g    第一行
first_value(colName)取分组排序后，截至到当前行，第一个值
last_value(colName)取分组内排序后，截至到当前行，最后一个值
> select *,first_value(cost) over(partition by name order by orderdate) firstV,       
> first_value(cost) over(partition by name order by orderdate  desc) lastV from order;
lisi    2019-10-13      20      30      20
lisi    2019-10-12      30      30      20
mart    2018-04-13      94      62      94
mart    2018-04-11      75      62      94
mart    2018-04-09      68      62      94
mart    2018-04-08      62      62      94
neil    2018-06-12      80      12      80
neil    2018-05-10      12      12      80
saml    2018-04-06      42      10      42
saml    2018-02-03      23      10      42
saml    2018-01-08      55      10      42
saml    2018-01-05      46      10      42
saml    2018-01-01      10      10      42
tony    2018-01-07      50      15      50
tony    2018-01-04      29      15      50
tony    2018-01-02      15      15      50
wangwu  2019-10-14      20      10      20
wangwu  2019-10-13      10      10      20
zsan    2019-10-13      30      10      30
zsan    2019-10-12      20      10      30
zsan    2019-10-11      10      10      30

自定义函数

1.UDF
一进一出
2.UDAF
多进一出
3.UDTF
一进多出

分区表

创建分区表
create table if not exists emp_part1(
empno into comment'编号',
ename string,
job string,
mgr int,
hiredate date,
sal double,
comm double,
deptno int
)
partitioned by (year string) #分区字段
row format delimited
fields terminated by ',';
load data local inpath '' overwrite into table emp_part1 partition(year="2021")
#------------------------------------------------------------------------------------------
load data local inpath '' overwrite into table emp_part1 partition(year="2021")
添加分区数据

二级分区表
create table if not exists emp_part1(
empno int,ename string,job string,sex string
)
partitioned by (year string,month string)
row format delimited
fields terminated by 't';
#和一级分区区别为添加子分区
加载数据
load data local inpath"./map.txt"into table emp_part1 partition(year="",month="");

三级分区表
create table emp_part2(
empno int,ename string,job string,sex string
)
partitioned by (year="",month="",day="")
row format delimited
fields terminated by 't';
load data local inpath'./map.txt'into table emp_part2
partition(year="",month="",city="");

CRUD

#查看分区信息
show partitions tablename;
#增加分区数
alter table tablename add partition(colname type,.......);
alter table tablename add partition(colname type,.......)partition(colname type,......);
alter table tablename add aprtition(colname type,.......)location '路径';
alter table tablename add aprtition(colname type,.......)location '路径' 
partition(colname type,.....) location '路径';
例如：alter table emp_part1 add partition(year="2021",month="08");
--------------------------------------------------------------------------------------
alter table emp_part1 add partition(year="2021",month="03")
partition(year="2020",month="10");
--------------------------------------------------------------------------------------
alter table emp_part1 add partition(year="2020",month="05")location 'path';
hdfs上的路径
--------------------------------------------------------------------------------------
#修改映射
alter table tablename partition(colName=value,.....)set location 'new path';
#分区删除
alter table tablename drop partition(colName=value,.....);
删除多个分区
alter table tablename drop partition(colName=value,.....),partition(colName=value,....);

分区创建

insert 动态加载数据
load    静态加载
insert into emp_part partition(year) select * from emp_part;
----------------------------------------------------------------------------------------
FAILED: SemanticException [Error 10096]: Dynamic partition strict mode requires at least one static partition column. To turn this off set hive.exec.dynamic.partition.mode=nonstrict
----------------------------------------------------------------------------------------
需要有一个静态否则会报错,否则要修改严格模式
set hive.exec.dynamic.partition.mode=nonstrict
开启严格模式
set hive.exec.dynamic.partition.mode=strict
----------------------------------------------------------------------------------------
insert into userinfo_part partition(year="2019",month,day) 动态分区
select id,name,month,day from userinfo;                    静态分区

分桶

先创建表在创建分桶表
create table student_bucket(sno int,name string,sex string,age int,academy string)
clustered by(sno) sorted by(age desc)into 4 buckets
row format delimited fields terminated by ',';
#--------------------------------------------------------------------------------------
clustered by(分桶字段) sorted by(升降字段 desc) into 分桶数 buckets

加载数据
不能使用load要动态加载insert
insert into     追加
insert overwrite覆盖
1.insert into table student_bucket select * from student disteribute by (sno) sort by (age desc);
2.insert overwrite table student_bucket select * from student distribute by (sno) sort by(age desc);
-------------------------------------------------------------------------------------
insert overwrite table 窗口表   select * from 原表 distribute by(分窗字段) sort by(排序字段 desc)
-------------------------------------------------------------------------------------

查询

select * from student_bucket;
select * from student_bucket tablesample(bucket 1 out of 1);
查询4桶的第一桶
select * from student_bucket tablesample(bucket 1 out of 4 on sno);
查询4桶中的第一桶和第三桶
select * from student_bucket tablesample(bucket 1 out of 4 on sno)
union 
select * from student_bucket tablesample(bucket 3 out of 4 on sno);
-------------------------------------------------------------------
select * from student_bucket tablesample(bucket 1 out of 2 on sno);
查询4桶中的第二桶和第四桶
select * from student_bucket tablesample(bucket 2 out of 2 on sno);
查询5桶中的第三桶
select * from student_bucket tablesample(bucket 3 out of 5 on sno);

查询三行数据
    select * from student_bucket limit 3;
    select * from student_bucket tablesample(3 rows);
查询百分比的数据
    select * from student_bucket tablesample(13 percent);大小的百分比所占的那一行。
    
查询固定大小的数据
    select * from student_bucket tablesample(68b); 单位（K,KB,MB,GB...）
    固定大小所占的那一行。
随机抽三行数据
    select * from student_bucket order by rand() limit 3;