一、hive各种不同存储数据格式介绍
1.不同存储格式
(1)TEXTFILE:行存储(磁盘开销大)
(2)RCFILE:数据是按行进行分块,每块按照列存储(压缩快)
(3)ORC:rcfile的改良版(第一选择)
(4)PARQUET:列式存储,良好压缩性能(第二选择)
(5)AVRO:用于读写Avro格式的数据(Hive 0.14.0 起支持 STORED AS AVRO)
(6)INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname 自定义格式
2.使用方法
STORED AS file_format
二、实例
1.创建数据库
-- Create the demo database only when it is missing so the notes can be replayed.
CREATE DATABASE IF NOT EXISTS datafile;
-- Make it the current database; otherwise the tables created afterwards end up
-- in whatever database the session is using (normally `default`).
USE datafile;
2.原文本数据
-- Staging table for the raw tab-separated log file.
-- Every field is declared as STRING; no storage format is specified here.
CREATE TABLE file_source (
    id STRING,
    url STRING,
    referer STRING,
    keyword STRING,
    type STRING,
    guid STRING,
    pageId STRING,
    moduleId STRING,
    linkId STRING,
    attachedInfo STRING,
    sessionId STRING,
    trackerU STRING,
    trackerType STRING,
    ip STRING,
    trackerSrc STRING,
    cookie STRING,
    orderCode STRING,
    trackTime STRING,
    endUserId STRING,
    firstLink STRING,
    sessionViewNo STRING,
    productId STRING,
    curMerchantId STRING,
    provinceId STRING,
    cityId STRING,
    fee STRING,
    edmActivity STRING,
    edmEmail STRING,
    edmJobId STRING,
    ieVersion STRING,
    platform STRING,
    internalKeyword STRING,
    resultSum STRING,
    currentPage STRING,
    linkPosition STRING,
    buttonPosition STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "\t";
-- Load the file from the local filesystem into the staging table.
LOAD DATA LOCAL INPATH '/opt/datas/2015082818'
    INTO TABLE file_source;
3.各种不同存储格式比较
(1)textfile
-- Same 36 STRING columns as file_source, stored explicitly as tab-delimited TEXTFILE.
CREATE TABLE file_text (
    id STRING,
    url STRING,
    referer STRING,
    keyword STRING,
    type STRING,
    guid STRING,
    pageId STRING,
    moduleId STRING,
    linkId STRING,
    attachedInfo STRING,
    sessionId STRING,
    trackerU STRING,
    trackerType STRING,
    ip STRING,
    trackerSrc STRING,
    cookie STRING,
    orderCode STRING,
    trackTime STRING,
    endUserId STRING,
    firstLink STRING,
    sessionViewNo STRING,
    productId STRING,
    curMerchantId STRING,
    provinceId STRING,
    cityId STRING,
    fee STRING,
    edmActivity STRING,
    edmEmail STRING,
    edmJobId STRING,
    ieVersion STRING,
    platform STRING,
    internalKeyword STRING,
    resultSum STRING,
    currentPage STRING,
    linkPosition STRING,
    buttonPosition STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "\t"
STORED AS TEXTFILE;
-- Copy every row from the staging table; SELECT * relies on both tables
-- declaring the same columns in the same order.
INSERT INTO TABLE file_text
SELECT *
FROM file_source;
结果:
在HDFS上查看TEXTFILE格式表的数据文件:27.48 MB,相比之前load直接上传的原始文件(37.6 MB),体积有所减小。
(2)parquet
-- Same 36 STRING columns as file_source, stored as Parquet.
-- No ROW FORMAT DELIMITED clause: Parquet is a binary columnar format, so a
-- text field delimiter has no effect on how the rows are stored.
CREATE TABLE file_parquet (
    id STRING,
    url STRING,
    referer STRING,
    keyword STRING,
    type STRING,
    guid STRING,
    pageId STRING,
    moduleId STRING,
    linkId STRING,
    attachedInfo STRING,
    sessionId STRING,
    trackerU STRING,
    trackerType STRING,
    ip STRING,
    trackerSrc STRING,
    cookie STRING,
    orderCode STRING,
    trackTime STRING,
    endUserId STRING,
    firstLink STRING,
    sessionViewNo STRING,
    productId STRING,
    curMerchantId STRING,
    provinceId STRING,
    cityId STRING,
    fee STRING,
    edmActivity STRING,
    edmEmail STRING,
    edmJobId STRING,
    ieVersion STRING,
    platform STRING,
    internalKeyword STRING,
    resultSum STRING,
    currentPage STRING,
    linkPosition STRING,
    buttonPosition STRING
)
STORED AS PARQUET;
-- Copy every row from the staging table; SELECT * relies on both tables
-- declaring the same columns in the same order.
INSERT INTO TABLE file_parquet
SELECT *
FROM file_source;
结果:
在HDFS上查看parquet格式表的数据文件:16.14 MB,相比之前load直接上传的原始文件(37.6 MB),体积明显减小。
(3)orc
-- Same 36 STRING columns as file_source, stored as ORC.
-- No ROW FORMAT DELIMITED clause: ORC is a binary columnar format, so a
-- text field delimiter has no effect on how the rows are stored.
CREATE TABLE file_orc (
    id STRING,
    url STRING,
    referer STRING,
    keyword STRING,
    type STRING,
    guid STRING,
    pageId STRING,
    moduleId STRING,
    linkId STRING,
    attachedInfo STRING,
    sessionId STRING,
    trackerU STRING,
    trackerType STRING,
    ip STRING,
    trackerSrc STRING,
    cookie STRING,
    orderCode STRING,
    trackTime STRING,
    endUserId STRING,
    firstLink STRING,
    sessionViewNo STRING,
    productId STRING,
    curMerchantId STRING,
    provinceId STRING,
    cityId STRING,
    fee STRING,
    edmActivity STRING,
    edmEmail STRING,
    edmJobId STRING,
    ieVersion STRING,
    platform STRING,
    internalKeyword STRING,
    resultSum STRING,
    currentPage STRING,
    linkPosition STRING,
    buttonPosition STRING
)
STORED AS ORC;
-- Copy every row from the staging table; SELECT * relies on both tables
-- declaring the same columns in the same order.
INSERT INTO TABLE file_orc
SELECT *
FROM file_source;
4.总结
file_source 37.6 MB
file_text 27.48 MB
file_parquet 16.14 MB
file_orc 4.4 MB
所以,尽量使用ORC:在本次对比中它占用的存储空间最小,同时ORC表的SQL查询速度也快。