Built-in operators
Built-in functions
A quick way to test the various built-in functions:
1. Create a dual table
create table dual(id string);
2. load data local inpath '/home/hadoop/dual.dat' into table dual;  (loads a file containing one line with a single space into the dual table)
3. select substr('angelababy',2,3) from dual;
select substr('angelababy',0,3) from dual;  ==> returns exactly the same result as the next statement
select substr('angelababy',1,3) from dual;  ==> returns exactly the same result as the previous statement (in Hive, a start position of 0 is treated the same as 1)
4. select concat('a','b') from dual;
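A few more built-ins can be exercised the same way; the results noted inline follow standard Hive behavior:

select length('angelababy') from dual;           -- 10
select upper('angelababy') from dual;            -- ANGELABABY
select split('a,b,c', ',')[1] from dual;         -- b
select concat_ws('-', 'a', 'b', 'c') from dual;  -- a-b-c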
User-defined functions and Transform
When the built-in functions Hive provides cannot meet your business needs, consider a user-defined function (UDF: user-defined function).
1. Create a Java project; unpack apache-hive-1.2.2-bin.tar.gz, find the lib folder under the installation path, and add its jar files to the project
2. Create the package cn.itcast.bigdata.udf
3. Create a custom class ToLowerCase that converts an input string to lowercase
package cn.itcast.bigdata.udf;

import java.util.HashMap;

import org.apache.hadoop.hive.ql.exec.UDF;

public class ToLowerCase extends UDF {

    public static HashMap<String, String> provinceMap = new HashMap<String, String>();
    static {
        provinceMap.put("136", "beijing");
        provinceMap.put("137", "shanghai");
        provinceMap.put("138", "shenzhen");
    }

    // must be public; Hive finds evaluate() by reflection
    public String evaluate(String field) {
        String result = field.toLowerCase();
        return result;
    }

    // must be public; a second overload of evaluate() with a different signature causes no conflict
    public String evaluate(int phonenbr) {
        String pnb = String.valueOf(phonenbr);
        return provinceMap.get(pnb.substring(0, 3)) == null ? "huoxing" : provinceMap.get(pnb.substring(0, 3));
    }
}
4. Package as a jar, upload it to the server running Hive, and register a Hive function pointing at the jar (a usage sketch follows after these steps)
hive> add JAR /home/hadoop/udf.jar;
hive> create temporary function tolow as 'cn.itcast.bigdata.udf.ToLowerCase';
5. Check the table: select * from t_p;
6. Insert data: insert into t_p values(13,'ANGELA');
7. select * from t_p; now shows the uppercase row
8. select id, tolow(name) from t_p; the name column comes back in lowercase, because the function is applied to name when the select runs
hive> create temporary function getprovince as 'cn.itcast.bigdata.udf.ToLowerCase';
select phonenbr, getprovince(phonenbr), flow from t_flow;
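Once both temporary functions are registered, either name exposes both evaluate overloads, since both point at the same class. A minimal sketch against the dual table, assuming the registrations above succeeded:

select tolow('ANGELA') from dual;        -- string overload: angela
select tolow(1361234) from dual;         -- int overload: prefix 136 maps to beijing
select getprovince(1390000) from dual;   -- prefix 139 is not in provinceMap, so the default huoxing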
Categories of Hive user-defined functions
UDF: operates on a single data row and produces a single data row as output (math functions, string functions)
UDAF (user-defined aggregate function): accepts multiple input rows and produces one output row (count, max)
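The difference shows up directly in queries. A minimal sketch, assuming a table t_test(name string, age int) like the one used in the example below:

-- UDF: one output row per input row
select upper(name), length(name) from t_test;
-- UDAF: many input rows collapse into one output row
select count(*), max(age) from t_test;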
UDF development examples
· Simple UDF example
1. Develop a Java class that extends UDF and overloads the evaluate method
package cn.itcast.bigdata.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public final class Lower extends UDF {
    public Text evaluate(final Text s) {
        if (s == null) { return null; }
        return new Text(s.toString().toLowerCase());
    }
}
2. Package as a jar and upload it to the server
3. Add the jar to Hive's classpath:
hive> add JAR /home/hadoop/udf.jar;
4. Create a temporary function associated with the Java class you developed
hive> create temporary function strip as 'cn.itcast.bigdata.udf.Lower';
5. The custom function strip can now be used in HQL
select strip(name), age from t_test;
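To confirm the temporary function was registered, Hive's standard commands can be used:

show functions;            -- the temporary function appears in the list alongside the built-ins
describe function strip;   -- shows what Hive knows about the function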
· JSON parsing UDF development
Data preparation
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"} {"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"} {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"} {"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"} {"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"} {"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"} {"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"} {"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"} {"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"} {"movie":"919","rate":"4","timeStamp":"978301368","uid":"1"} {"movie":"595","rate":"5","timeStamp":"978824268","uid":"1"} {"movie":"938","rate":"4","timeStamp":"978301752","uid":"1"} {"movie":"2398","rate":"4","timeStamp":"978302281","uid":"1"} {"movie":"2918","rate":"4","timeStamp":"978302124","uid":"1"} {"movie":"1035","rate":"5","timeStamp":"978301753","uid":"1"} {"movie":"2791","rate":"4","timeStamp":"978302188","uid":"1"} {"movie":"2687","rate":"3","timeStamp":"978824268","uid":"1"} {"movie":"2018","rate":"4","timeStamp":"978301777","uid":"1"} {"movie":"3105","rate":"5","timeStamp":"978301713","uid":"1"} {"movie":"2797","rate":"4","timeStamp":"978302039","uid":"1"}
1. create table t_json(line string) row format delimited;
2. load data local inpath '/home/hadoop/rating.json' into table t_json;
3. select * from t_json limit 10;
4. Custom class JsonParser
package cn.itcast.bigdata.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import parquet.org.codehaus.jackson.map.ObjectMapper; // the lib jars were already added to the project

public class JsonParser extends UDF {

    public String evaluate(String jsonLine) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            MovieRateBean bean = objectMapper.readValue(jsonLine, MovieRateBean.class);
            return bean.toString();
        } catch (Exception e) {
            // fall through: malformed lines come back as an empty string
        }
        return "";
    }
}
MovieRateBean
package cn.itcast.bigdata.udf;

// {"movie":"1721","rate":"3","timeStamp":"965440048","uid":"5114"}
// the property names must match the JSON keys exactly
public class MovieRateBean {

    private String movie;
    private String rate;
    private String timeStamp;
    private String uid;

    // getters and setters omitted here for brevity

    @Override
    public String toString() { // return the parsed record as a tab-separated string
        return movie + "\t" + rate + "\t" + timeStamp + "\t" + uid;
    }
}
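For completeness, the elided accessors are the standard JavaBean ones; Jackson's readValue populates the bean through the setters, so a minimal sketch of what goes inside MovieRateBean:

    public String getMovie() { return movie; }
    public void setMovie(String movie) { this.movie = movie; }
    public String getRate() { return rate; }
    public void setRate(String rate) { this.rate = rate; }
    public String getTimeStamp() { return timeStamp; }
    public void setTimeStamp(String timeStamp) { this.timeStamp = timeStamp; }
    public String getUid() { return uid; }
    public void setUid(String uid) { this.uid = uid; }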
5. Package, upload to the server, and add the jar to Hive's classpath
6. hive> create temporary function parsejson as 'cn.itcast.bigdata.udf.JsonParser';
7. Check the data: select parsejson(line) from t_json limit 10; (see the sketch after step 8)
8. Alternative approach
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

drop table if exists t_rating;
create table t_rating(movieid string, rate int, timestring string, uid string)
row format delimited fields terminated by '\t';

-- insert overwrite fills the newly created t_rating; split() returns an array, indexed to pull out each field
insert overwrite table t_rating
select split(parsejson(line),'\t')[0] as movieid,
       split(parsejson(line),'\t')[1] as rate,
       split(parsejson(line),'\t')[2] as timestring,
       split(parsejson(line),'\t')[3] as uid
from rat_json limit 10;
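For reference, the first sample record run through parsejson yields the tab-separated line that the split() calls in step 8 index into:

select parsejson(line) from t_json limit 1;
-- 1193    5    978300760    1    (tab-separated)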
Transform implementation
Hive's TRANSFORM keyword lets you call a script of your own from within SQL.
It suits cases where you need functionality Hive lacks but do not want to write a UDF.
Usage example 1: the SQL below uses weekday_mapper.py to process the data.
CREATE TABLE u_data_new (
  movieid INT,
  rating INT,
  weekday INT,
  userid INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

add FILE weekday_mapper.py;  -- for a Java program this would be add JAR instead

INSERT OVERWRITE TABLE u_data_new
SELECT
  TRANSFORM (movieid, rate, timestring, uid)  -- these 4 columns are handed to the Python script
  USING 'python weekday_mapper.py'
  AS (movieid, rating, weekday, userid)       -- the script turns the time string into a weekday; AS names the 4 outputs
FROM t_rating;
The content of weekday_mapper.py is as follows:
#!/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    # join the fields back together with tab separators
    print '\t'.join([movieid, rating, str(weekday), userid])
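The script can be smoke-tested outside Hive by piping one tab-separated line into it (the exact weekday depends on the machine's local timezone, since fromtimestamp uses local time):

echo -e '1193\t5\t978300760\t1' | python weekday_mapper.py
# prints the same line with the unixtime replaced by a weekday number 1-7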
Steps summarized:

Transform case study:

1. Load rating.json into a raw Hive table rat_json:

create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

2. Parse the JSON data into four fields and insert them into a new table t_rating (the transform step below needs all four columns, so timeStamp and uid are extracted as well):

insert overwrite table t_rating
select get_json_object(line,'$.movie') as movieid,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timeStamp') as timestring,
       get_json_object(line,'$.uid') as uid
from rat_json;

3. Use transform + Python to convert unixtime to weekday.

First edit a Python script:

vi weekday_mapper.py

#!/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])

Save the file, then add it to Hive's classpath:

hive> add FILE /home/hadoop/weekday_mapper.py;

hive> create TABLE u_data_new as              -- the new table is built from the query results
SELECT
  TRANSFORM (movieid, rate, timestring, uid)  -- matches t_rating's columns; passed to the Python script as input
  USING 'python weekday_mapper.py'
  AS (movieid, rate, weekday, uid)            -- the named outputs become the columns of u_data_new
FROM t_rating;

select distinct(weekday) from u_data_new limit 10;