自定义Hive文件和记录格式(十)
其他
2018-11-26 08:51:07
阅读次数: 0
create table 语句中默认的是stored as textfile
练习了stored as sequencefile,省空间,提升I/O性能
PIG中输入输出分隔符默认是制表符\t,而到了hive中,默认变成了八进制的\001, 也就是ASCII:
Ctrl-A(Oct 001,Dec 1,Hex 01,ASCII_Char SOH (start of heading)),官方的解释说是尽量不和文中的字符重复,因此选用了 Ctrl-A
SerDe是序列化/反序列化的简写形式
-- Table whose data files are written and read as SequenceFile.
CREATE TABLE test (name STRING) STORED AS SEQUENCEFILE;
-- Same schema with no STORED AS clause, so the default file format applies
-- (textfile, per the note at the top of this article).
CREATE TABLE test1 (name STRING);
stored as 影响输入和输出的格式
-- Load both tables from the same source table so that they differ only in
-- storage format.
-- NOTE(review): SELECT * is kept because prov's column list is not visible
-- here; presumably it is a single string column matching test/test1 -- confirm.
INSERT OVERWRITE TABLE test
SELECT *
FROM prov;

INSERT OVERWRITE TABLE test1
SELECT *
FROM prov;
test与test1的存储格式区别:
test1、
{STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'}
test、
{STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.SequenceFileInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'}
解析XML
-- Collect every id attribute in the XML literal.
-- waibubiao1 only serves as a row source; LIMIT 1 prints the result once.
-- Expected result: ["foo","bar"]
SELECT xpath('<a><b id="foo" >bl</b><b id="bar" >b2</b></a>', '//@id')
FROM waibubiao1
LIMIT 1;
-- Return the text() of the children of <a> whose class attribute is "bb".
-- LIMIT 1 prints the result once.
-- Expected result: ["b123"]
SELECT xpath('<a><b id="foo" class="bb">b123</b><b id="bar" >b2</b></a>', 'a/*[@class="bb"]/text()')
FROM waibubiao1
LIMIT 1;
-- Same expression without LIMIT: the constant result is emitted once per row
-- of waibubiao1. The author's table holds six records, hence the six
-- identical output lines that follow.
-- 'a/*[@class="bb"]/text()' selects the text() of the children of <a> whose
-- class attribute is "bb".
SELECT xpath('<a><b id="foo" class="bb">b123</b><b id="bar" >b2</b></a>', 'a/*[@class="bb"]/text()')
FROM waibubiao1;
["b123"]
["b123"]
["b123"]
["b123"]
["b123"]
["b123"]
转载自blog.csdn.net/weixin_42325841/article/details/81501405