1 创建分桶表: 把一个大文件拆分成多个小文件来处理
分桶也是一种用于优化查询而设计的表类型。创建分桶表时,指定桶的个数和分桶的依据字段,Hive就可以自动将数据分桶存储。查询时只需要遍历一个桶里的数据,或者遍历部分桶,这样就提高了查询效率。
分桶的实质是将数据按分桶字段的哈希值分成不同的文件。向分桶表写入数据时,Hive启动的Reduce任务个数与桶的个数相同,每个Reduce任务输出一个桶文件。
分桶表的作用: 1.提高查询效率 2.提高采样效率 3.提高join的效率
分桶原理:对分桶字段的值计算哈希,再对桶的个数取模,根据结果把每条记录写入对应的桶文件;当分桶字段的值分布均匀时,各个桶的数据量大致相等。
-- Syntax template for creating a bucketed table. The Chinese identifiers are
-- placeholders: the bucketed table's name, column names and data types, the
-- bucketing column, and the number of buckets.
create table 分桶表的表名 ( 字段名1 数据类型,字段名 2 数据类型2 , .... )
clustered by ( 分桶字段 ) into 分桶的个数 buckets
row format delimited
fields terminated by '\001' -- delimiter between fields
collection items terminated by '\002' -- delimiter between the items of a collection
map keys terminated by '\003' -- delimiter between a map key and its value
lines terminated by '\n' -- delimiter between rows
注:分桶字段必须是表中已定义的字段
-- Minimal form: hash rows on id into 4 bucket files, each bucket sorted by
-- id ascending. No row format is given, so Hive's default delimiters apply.
create table students_bucket (
    id   string,
    name string,
    age  int
)
clustered by ( id ) sorted by ( id asc ) into 4 buckets;

-- The full example below reuses the name students_bucket. Drop the minimal
-- table first; otherwise the second CREATE fails because the table exists.
drop table if exists students_bucket;

-- Full form: per-subject scores plus sex, stored as comma-delimited text,
-- bucketed and sorted on id exactly like the minimal form.
create table students_bucket (
    id     string,
    name   string,
    java   float,
    c      float,
    oracle float,
    hadoop float,
    sex    string
)
clustered by ( id ) sorted by ( id asc ) into 4 buckets
row format delimited
fields terminated by ',';  -- field delimiter
-- Populate the bucketed table with INSERT ... SELECT so that Hive runs a job
-- that distributes the rows into bucket files by the bucketing column.
-- The source column mysql is written to the target column oracle.
-- Note: on Hive releases before 2.0, bucketing is only enforced on insert when
-- hive.enforce.bucketing=true.
insert into students_bucket
select
    id,
    name,
    java,
    c,
    mysql as oracle,
    hadoop,
    sex
from students;

-- Counter-example, intentionally left as a comment and NOT executed:
-- a bucketed table must be filled through INSERT, not LOAD DATA. Before
-- Hive 3.0, LOAD DATA is a plain file copy/move, so the file would not be
-- split into buckets, and OVERWRITE would replace the data inserted above.
--   load data local inpath '/home/zx/data/students2.csv' overwrite into table students_bucket;
2 自动分桶:
-- 1. Source (un-bucketed) table that receives the raw comma-delimited data.
--    (The original heading started with a single '-', which is not a valid
--    comment marker and broke parsing of the script.)
create table students1 (
    id     string,
    name   string,
    score1 float,
    score2 float,
    score3 float,
    score  float
)
row format delimited
fields terminated by ',';
-- 2. Load the CSV file from the local file system into the source table.
--    Without OVERWRITE the file is added to whatever students1 already holds.
load data local
    inpath '/home/sheng/data/students.csv'
    into table students1;
-- 3. Bucketed copy of students1: rows are hashed on id into 4 buckets and
--    each bucket is kept sorted by score ascending.
--    The statement is now terminated with ';' so that it no longer runs into
--    the SET statement that follows it.
create table students_bucket1 (
    id     string,
    name   string,
    score1 float,
    score2 float,
    score3 float,
    score  float
)
clustered by ( id ) sorted by ( score asc ) into 4 buckets
row format delimited
fields terminated by ',';
-- 4. Make inserts honour the target table's CLUSTERED BY / SORTED BY
--    definition. Both switches matter on Hive releases before 2.0; from 2.0
--    on, bucketing and sorting are always enforced.
set hive.enforce.bucketing=true;
set hive.enforce.sorting=true;
-- 5. Copy the raw rows into the bucketed table; Hive buckets them on write.
--    Columns are listed explicitly (same order as students1) so that a later
--    schema change cannot silently shift values into the wrong column.
insert overwrite table students_bucket1
select
    id,
    name,
    score1,
    score2,
    score3,
    score
from students1;
分桶采样
-- Bucket sampling: return the rows that fall into bucket 1 of 4 when hashed
-- on id (the same column and bucket count the table is clustered by).
select *
from students_bucket1
    tablesample ( bucket 1 out of 4 on id );
3 外部分桶表
-- External bucketed table: the data files live under the given location
-- instead of Hive's managed warehouse directory. Rows are hashed on id into
-- 4 buckets, each sorted by id ascending, and stored as comma-delimited text.
create external table students_external_bucket (
    id     string,
    name   string,
    java   float,
    c      float,
    mysql  float,
    hadoop float,
    sex    string
)
clustered by ( id ) sorted by ( id asc ) into 4 buckets
row format delimited
    fields terminated by ','
location '/user/zx/bucket';
-- Append every row of students to the external bucketed table.
-- NOTE(review): select * relies on students having the same column order as
-- the target table (id, name, java, c, mysql, hadoop, sex) - confirm.
insert into students_external_bucket
select *
from students;
4 分区分桶表
-- Partitioned and bucketed table: one partition per sex value, and inside
-- each partition the rows are hashed on id into 4 buckets sorted by id
-- ascending. sex is a partition column, so it is not in the column list.
-- The statement is now terminated with ';' so that it no longer runs into
-- the INSERT that follows it.
create table students_p_b (
    id     string,
    name   string,
    java   float,
    c      float,
    mysql  float,
    hadoop float
)
partitioned by (
    sex string
)
clustered by ( id ) sorted by ( id asc ) into 4 buckets
row format delimited
fields terminated by ',';
-- Every partition value (sex) is produced by the query, i.e. this is a fully
-- dynamic partition insert. Hive's default strict mode rejects an insert
-- without at least one static partition value, so switch dynamic
-- partitioning to nonstrict mode first.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- The dynamic partition column must come last in the select list.
-- sex = '男' (male) is written to partition 'man'; every other value,
-- including NULL, is written to partition 'woman'.
insert into students_p_b partition ( sex )
select
    id,
    name,
    java,
    c,
    mysql,
    hadoop,
    if ( sex = '男', 'man', 'woman' ) as sex
from students;