1. Common ways to create tables in Hive
CREATE TABLE IF NOT EXISTS DEFAULT.log_20180605(
ip string comment 'remote ip address',
user string,
req_url string comment 'user request url'
)
comment 'web access logs'
row format delimited fields terminated by ' '
stored as textfile;
load data local inpath '/opt/datas/log.txt' into table default.log_20180605;
create table if not exists default.log_20180605_sa
as select ip,req_url from default.log_20180605;
2. Basic database operations in Hive
create database db_hive_01;
create database if not exists db_hive_02;
create database if not exists db_hive_03 location '/user/hadoop/hive/warehouse/db_hive_03.db';
show databases;
show databases like 'db_hive*';
use db_hive;
desc database db_hive_03;
desc database extended db_hive_03;
drop database db_hive_03;
drop database db_hive_03 cascade;
drop database if exists db_hive_03;
3. Hive data types
Numeric Types
TINYINT (1-byte signed integer, from -128 to 127)
SMALLINT (2-byte signed integer, from -32,768 to 32,767)
INT/INTEGER (4-byte signed integer, from -2,147,483,648 to 2,147,483,647)
BIGINT (8-byte signed integer, from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807)
FLOAT (4-byte single precision floating point number)
DOUBLE (8-byte double precision floating point number)
DOUBLE PRECISION (alias for DOUBLE, only available starting with Hive 2.2.0)
DECIMAL
Introduced in Hive 0.11.0 with a precision of 38 digits
Hive 0.13.0 introduced user-definable precision and scale
NUMERIC (same as DECIMAL, starting with Hive 3.0.0)
Date/Time Types
TIMESTAMP (Note: Only available starting with Hive 0.8.0)
DATE (Note: Only available starting with Hive 0.12.0)
INTERVAL (Note: Only available starting with Hive 1.2.0)
String Types
STRING
VARCHAR (Note: Only available starting with Hive 0.12.0)
CHAR (Note: Only available starting with Hive 0.13.0)
Misc Types
BOOLEAN
BINARY (Note: Only available starting with Hive 0.8.0)
Complex Types
arrays: ARRAY<data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)
maps: MAP<primitive_type, data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)
structs: STRUCT<col_name : data_type [COMMENT col_comment], ...>
union: UNIONTYPE<data_type, data_type, ...> (Note: Only available starting with Hive 0.7.0.)
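A small sketch exercising the complex types above (the table name t_complex, its columns, and the delimiters are illustrative assumptions, not from the original notes):
create table if not exists default.t_complex(
name string,
hobbies array<string>,
scores map<string,int>,
addr struct<city:string,zip:string>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':';
select name, hobbies[0], scores['math'], addr.city from default.t_complex;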
4. Table operations in Hive
Employee table:
create table if not exists default.emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
)
row format delimited fields terminated by '\t';
Department table:
create table if not exists default.dept(
deptno int,
dname string,
loc string
)
row format delimited fields terminated by '\t';
load data local inpath '/opt/datas/emp.txt' overwrite into table default.emp;
load data local inpath '/opt/datas/dept.txt' overwrite into table default.dept;
create table if not exists default.dept_cats
as
select * from dept;
truncate table dept_cats;
create table if not exists default.dept_like
like
default.dept;
alter table dept_like rename to dept_like_rename;
drop table if exists dept_like_rename;
5. Table types in Hive
5.1. Managed tables
5.2. External tables
create EXTERNAL table IF NOT EXISTS default.emp_ext(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
deptno int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
5.3. Differences between managed (internal) and external tables
Internal tables:
Also known as MANAGED_TABLE;
Stored under /user/hive/warehouse by default; a different directory can be given with LOCATION;
Dropping the table deletes both the table data and the metadata;
External tables:
Known as EXTERNAL_TABLE;
The storage directory can be chosen at creation time with LOCATION;
Dropping the table deletes only the metadata, never the table data;
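To confirm which kind a given table is, desc formatted prints a Table Type field; a quick check against the two tables above:
desc formatted default.emp;     -- Table Type: MANAGED_TABLE
desc formatted default.emp_ext; -- Table Type: EXTERNAL_TABLE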
create EXTERNAL table IF NOT EXISTS default.emp_ext2(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
deptno int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/user/hadoop/hive/warehouse/emp_ext2';
dfs -put /opt/datas/emp.txt /user/hadoop/hive/warehouse/emp_ext2;
drop table default.emp_ext2;
The data is still there at this point;
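A minimal check from the Hive CLI that the file survived the drop (same path as created above):
dfs -ls /user/hadoop/hive/warehouse/emp_ext2;
-- emp.txt is still listed; only the metastore entry was removed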
6. Partitioned tables
6.1. A partitioned table corresponds to a separate directory on the HDFS file system; that directory holds all of the partition's data files. A partition in Hive is simply a subdirectory, splitting a large data set into smaller ones along business lines.
When a query names the needed partitions through expressions in the where clause, only those directories are scanned, which greatly improves query efficiency.
create EXTERNAL table IF NOT EXISTS default.emp_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
deptno int
)
partitioned by (month string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
load data local inpath '/opt/datas/emp.txt' into table default.emp_partition partition (month='201805');
select * from default.emp_partition where month='201805';
select count(distinct empno) from default.emp_partition where month='201805'
union
select count(distinct empno) from default.emp_partition where month='201804'
union
select count(distinct empno) from default.emp_partition where month='201803';
Run it directly, or save it as xx.sql and execute it with bin/hive -f xx.sql.
Two-level partitions:
At table creation: partitioned by (month string, day string); a full sketch follows below.
select * from emp_partition where month='201805' and day='12';
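A minimal sketch of a two-level partitioned table (the table name emp_partition2 is an illustrative assumption; the columns mirror emp_partition above):
create table if not exists default.emp_partition2(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
deptno int
)
partitioned by (month string, day string)
row format delimited fields terminated by '\t';
-- each load fills one leaf directory, e.g. .../month=201805/day=12
load data local inpath '/opt/datas/emp.txt' into table default.emp_partition2 partition (month='201805', day='12');
select * from default.emp_partition2 where month='201805' and day='12';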
6.2. Caveats
1. Without partitions
create table if not exists default.dept_nopart(
deptno int,
dname string,
loc string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_nopart;
select * from dept_nopart;
Data is returned.
2. With partitions
create table if not exists default.dept_part(
deptno int,
dname string,
loc string
)
partitioned by(day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
dfs -mkdir -p /user/hive/warehouse/dept_part/day=20180512;
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20180512;
select * from dept_part;
No data is returned.
----------------------------------------------------------------------------------------------------------------
use metastore;  -- run against the MySQL metastore database, not in Hive
select * from PARTITIONS; -- one row per registered partition; the new directory is not listed yet
Fix 1:
dfs -mkdir -p /user/hive/warehouse/dept_part/day=20180512;
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20180512;
msck repair table dept_part;
select * from PARTITIONS;
select * from dept_part;
Now data is returned.
Fix 2:
dfs -mkdir -p /user/hive/warehouse/dept_part/day=20180513;
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20180513;
alter table dept_part add partition(day='20180513');
select * from dept_part;
Now data is returned.
7. Six ways to load data into Hive
LOAD DATA [LOCAL] INPATH 'filepath'
[OVERWRITE] INTO TABLE tablename
[PARTITION (partcol1=val1, partcol2=val2 ...)]
* where the source file lives
* local: LOCAL
* HDFS: omit LOCAL
* whether to overwrite the table's existing data
* overwrite: OVERWRITE
* append: the default
* loading into a partitioned table is a special case:
PARTITION (partcol1=val1, partcol2=val2 ...)
1> Load a local file into a Hive table
load data local inpath '/opt/datas/emp.txt' into table default.emp;
2> Load an HDFS file into Hive
load data inpath '/user/hadoop/hive/datas/emp.txt' into table default.emp;
3> Load data, overwriting the table's existing data
load data inpath '/user/hadoop/hive/datas/emp.txt' overwrite into table default.emp;
4> Create a table, then load it with insert ... select
create table default.emp_ci like emp;
insert into table default.emp_ci select * from default.emp;
5> Load with insert ... values (requires Hive 0.14+; the literals must match the table schema)
insert into table default.emp_ci values(9999, 'TEST', 'CLERK', 7902, '2018-05-12', 800.0, null, 20); -- illustrative row
6> Point the table at existing data with LOCATION at creation time
create table default.emp_ce(
id int,
name string
)
location '/user/hadoop/hive/datas'; -- LOCATION must be a directory holding the data files, not a single file
8. Ways to export Hive table data
1. First method
insert overwrite local directory '/opt/datas/hive_exp_emp'
select * from default.emp;
insert overwrite local directory '/opt/datas/hive_exp_emp2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '\n'
select * from default.emp;
2. Second method
bin/hive -e "select * from default.emp;" > /opt/datas/exp_res.txt
3. Third method
insert overwrite directory '/user/hive/warehouse/hive_exp_emp'
select * from default.emp;
=============
sqoop (bulk transfer between Hadoop and relational databases):
hdfs/hive -> rdbms
rdbms -> hdfs/hive/hbase
9. Common queries in Hive
Full-table queries and queries on specified columns
=, >=, <=, between ... and, limit
in, not in, is null, is not null
max, min, count, sum, avg
group by, having
join
GROUP BY
Groups rows; examples against the emp table:
* average salary per department
SELECT T.DEPTNO,AVG(T.SAL) AVG_SAL FROM EMP T GROUP BY T.DEPTNO;
* highest salary for each job within each department
SELECT T.DEPTNO, T.JOB, MAX(T.SAL) MAX_SAL FROM EMP T GROUP BY T.DEPTNO,T.JOB;
HAVING
* where filters individual records
* having filters grouped results
Departments whose average salary is greater than 2000:
SELECT DEPTNO, AVG(SAL) FROM EMP GROUP BY DEPTNO;
SELECT DEPTNO, AVG(SAL) AVG_SAL FROM EMP GROUP BY DEPTNO HAVING AVG_SAL>2000 ;
JOIN
Joins two tables, m and n:
a record from m and a matching record from n combine into one output record.
Equi-join
join ... on
select e.empno,e.ename,d.deptno,d.dname from emp e join dept d on e.deptno=d.deptno;
Left join
left join
select e.empno,e.ename,d.deptno,d.dname from emp e left join dept d on e.deptno=d.deptno;
Right join
right join
select e.empno,e.ename,d.deptno,d.dname from emp e right join dept d on e.deptno=d.deptno;
Full join
full join
select e.empno,e.ename,d.deptno,d.dname from emp e full join dept d on e.deptno=d.deptno;
10. EXPORT and IMPORT
Export
Writes a Hive table's data (and metadata) out of the warehouse:
EXPORT TABLE TABLENAME TO 'export_target_path';
export_target_path is a path on HDFS
export table default.emp to '/user/hadoop/hive/export/emp_exp';
Import
Loads previously exported data into a Hive table:
create table db_hive.emp like default.emp;
import table db_hive.emp from '/user/hadoop/hive/export/emp_exp';
11. order by, sort by, distribute by, and cluster by in Hive
* order by
Globally sorts the entire result set; only a single reducer is used.
SELECT * FROM EMP ORDER BY EMPNO DESC;
* sort by
Sorts within each reducer; the overall result set is not globally sorted.
set mapreduce.job.reduces = 3;
select * from emp sort by empno asc;
insert overwrite local directory '/opt/datas/sortby-res' select * from emp sort by empno asc;
* distribute by
Similar to the partitioner in MapReduce: distributes rows across reducers; used together with sort by.
insert overwrite local directory '/opt/datas/distby-res' select * from emp distribute by deptno sort by empno asc;
Note:
distribute by must appear before sort by.
* cluster by
When the distribute by and sort by columns are the same, cluster by is a shorthand; see the equivalence sketch below.
insert overwrite local directory '/opt/datas/clusterby-res' select * from emp cluster by empno;
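A sketch of the equivalence (cluster by empno behaves like distribute by plus an ascending sort by on the same column):
select * from emp distribute by empno sort by empno;
-- same result as:
select * from emp cluster by empno;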
12. Using Hive's built-in functions and programming custom UDFs
Hive ships with built-in functions such as max/min, but their number is limited;
custom UDFs are a convenient way to extend them.
UDFs are user-defined functions that extend HiveQL; three kinds:
UDF(User-Defined-Function)
one row in, one row out
UDAF (User-Defined Aggregation Function)
aggregate functions: many rows in, one row out;
e.g. count/max/min
UDTF (User-Defined Table-Generating Functions)
one row in, many rows out;
e.g. lateral view explode()
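A one-line illustration of the one-in-many-out behavior (explode over a split string; the values are illustrative):
select explode(split('a,b,c', ',')); -- returns three rows: a, b, c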
Inspecting built-in functions:
show functions;
desc function split;
desc function extended split;
https://cwiki.apache.org/confluence/display/Hive/HivePlugins
Programming steps:
1. Extend org.apache.hadoop.hive.ql.exec.UDF
2. Implement an evaluate method; evaluate supports overloading
Notes:
1. A UDF must have a return type; it may return null, but the return type cannot be void;
2. Use Hadoop writable types such as Text/LongWritable in UDFs; plain Java types are not recommended;
1>First, you need to create a new class that extends UDF, with one or more methods named evaluate.
2>Usage
add jar /opt/datas/hiveudf.jar;
create temporary function my_lower as "com.senior.hive.udf.LowerUDF";
select ename,my_lower(ename) lowername from emp limit 5;
--CREATE FUNCTION myfunc AS 'myclass' USING JAR 'hdfs:///path/to/jar';
dfs -mkdir -p /user/hadoop/hive/jars/;
dfs -put /opt/datas/hiveudf.jar /user/hadoop/hive/jars;
CREATE function self_lower AS 'com.senior.hive.udf.LowerUDF' USING JAR 'hdfs://hadoop-senior:8020/user/hadoop/hive/jars/hiveudf.jar';
package com.senior.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
* 1. Implement one or more methods named
* "evaluate" which will be called by Hive.
* 2."evaluate" should never be a void method.
* However it can return "null" if needed.
* @author Guardian
*
*/
public class LowerUDF extends UDF {
    public Text evaluate(Text str) {
        // validate: a null input yields a null output
        if (str == null) {
            return null;
        }
        // lowercase the input text
        return new Text(str.toString().toLowerCase());
    }

    public static void main(String[] args) {
        System.out.println(new LowerUDF().evaluate(new Text("HIVE")));
    }
}