
对时间字段(trackTime)进行数据清洗,例如从 2015-08-28 18:10:00 中提取出日期(当月第几天,即 28)和小时(即 18)。
(3)使用Select SQL进行数据分析。

-- Create the working database and switch to it, so that every table below is
-- created under track_log. The sqoop export in these notes reads from
-- /user/hive/warehouse/track_log.db/result, which only exists if the tables
-- are created inside this database rather than in `default`.
create database if not exists track_log;
use track_log;


-- Raw tracking-log table, loaded as-is from tab-separated text files.
-- Every field is kept as string; trackTime looks like '2015-08-28 18:10:00'
-- and is split into day and hour in the later cleansing step.
create table if not exists yhd_source(
id              string,
url             string,
referer         string,
keyword         string,
type            string,
guid            string,
pageId          string,
moduleId        string,
linkId          string,
attachedInfo    string,
sessionId       string,
trackerU        string,
trackerType     string,
ip              string,
trackerSrc      string,
cookie          string,
orderCode       string,
trackTime       string,
endUserId       string,
firstLink       string,
sessionViewNo   string,
productId       string,
curMerchantId   string,
provinceId      string,
cityId          string,
fee             string,
edmActivity     string,
edmEmail        string,
edmJobId        string,
ieVersion       string,
platform        string,
internalKeyword string,
resultSum       string,
currentPage     string,
linkPosition    string,
buttonPosition  string
)
row format delimited fields terminated by '\t'
stored as textfile;

注:shift+alt 可以进行下拉列式编辑

-- Load the two log files from the local filesystem into the raw table.
-- Judging by the file names they hold hours 18 and 19 of 2015-08-28.
load data local inpath '/opt/datas/2015082818'
into table yhd_source;
load data local inpath '/opt/datas/2015082819'
into table yhd_source;


-- Partitioned table filled with static-partition inserts.
-- Data columns are id/url/guid; date and hour are partition columns.
create table if not exists yhd_part1(
id string,
url string,
guid string
)
partitioned by (date string,hour string)
row format delimited fields terminated by '\t';


-- Static-partition load: one insert per hour, partition values given as literals.
-- Reads from yhd_qingxi, which is created and filled further down in this file,
-- so that part has to be run first. yhd_qingxi.date holds the day of month ('28').
insert into table yhd_part1 partition (date='20150828', hour='18')
select id, url, guid
from yhd_qingxi
where date='28' and hour='18';

insert into table yhd_part1 partition (date='20150828', hour='19')
select id, url, guid
from yhd_qingxi
where date='28' and hour='19';


-- Spot-check a single partition; date and hour are the partition columns.
select id, date, hour
from yhd_part1
where date='20150828' and hour='18';


-- Cleansed ("qingxi") table: the columns needed for analysis plus the
-- day and hour extracted from trackTime. Column order matters: date and hour
-- come last so the table can feed a dynamic-partition insert directly.
create table if not exists yhd_qingxi(
id string,
url string,
guid string,
date string,
hour string
)
row format delimited fields terminated by '\t';


-- Cleansing step: for a trackTime such as '2015-08-28 18:10:00',
-- characters 9-10 are the day of month ('28') and 12-13 the hour ('18').
insert into table yhd_qingxi
select
    id,
    url,
    guid,
    substring(trackTime,9,2) as date,
    substring(trackTime,12,2) as hour
from yhd_source;


		<description>Whether or not to allow dynamic partitions in DML/DDL.</description>


		<description>In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions.</description>

-- Use non-strict mode: strict mode requires at least one static partition
-- value, which a fully dynamic insert (partition (date,hour)) does not have.
-- Dynamic partitioning itself is switched on explicitly in case the cluster
-- configuration has it disabled.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

-- Partitioned table used for the dynamic-partition insert example.
-- Same layout as yhd_part1: id/url/guid data columns, date/hour partitions.
create table if not exists yhd_part2(
id string,
url string,
guid string
)
partitioned by (date string,hour string)
row format delimited fields terminated by '\t';

-- Dynamic-partition insert: no partition values are given, Hive takes them from
-- the trailing select columns. The explicit list below is equivalent to
-- `select *` here because yhd_qingxi is declared as (id, url, guid, date, hour).
-- NOTE(review): yhd_qingxi.date holds only the day of month ('28', from
-- substring(trackTime,9,2)), so this creates partitions date=28/..., not the
-- date=20150828 used by the static-partition inserts; confirm this is intended.
insert into table yhd_part2 partition (date, hour)
select id, url, guid, date, hour
from yhd_qingxi;
注:也可以不写select * ,但是要写全字段
(10)动态分区是按位置匹配而不是按字段名匹配的:select 结果中最后的几列依次对应 partition (date,hour) 中的分区字段,因此 date、hour 必须按这个顺序放在查询列的最后(yhd_qingxi 的字段顺序正好满足这一点)。

-- Static-partition load of hour 19 into yhd_part2 (partition values are literals).
insert into table yhd_part2 partition (date='20150828', hour='19')
select id, url, guid
from yhd_qingxi
where date='28' and hour='19';


-- PV (page views) per date/hour: number of rows with a non-null url.
select
    date,
    hour,
    count(url) as PV
from yhd_part2
group by date, hour;


|   date    | hour  |   pv   |
| 20150828  | 18    | 64972  |
| 20150828  | 19    | 61162  |


-- UV (unique visitors) per date/hour: number of distinct guid values.
select
    date,
    hour,
    count(distinct guid) as UV
from yhd_part1
group by date, hour;


|   date    | hour  |   uv   |
| 20150828  | 18    | 23938  |
| 20150828  | 19    | 22330  |


-- Materialise PV and UV per date/hour into the result table (CTAS).
-- This table's warehouse directory is what the sqoop export reads.
create table if not exists result as
select
    date,
    hour,
    count(url) as PV,
    count(distinct guid) as UV
from yhd_part1
group by date, hour;


| result.date  | result.hour  | result.pv  | result.uv  |
| 20150828     | 18           | 64972      | 23938      |
| 20150828     | 19           | 61162      | 22330      |


-- MySQL side: target table for the sqoop export of the Hive result table.
-- One row per (date, hour); columns mirror result.date/hour/pv/uv.
create table if not exists save(
date varchar(30) not null,
hour varchar(30) not null,
pv varchar(30) not null,
uv varchar(30) not null,
primary key(date,hour)
);


# Export the files under the Hive result table's warehouse directory into the
# MySQL table "save". '\001' (Ctrl-A) is the field delimiter the input files
# are parsed with; a single mapper is used.
bin/sqoop export \
    --connect jdbc:mysql://bigdata-senior01.ibeifeng.com:3306/sqoop \
    --username root \
    --password 123456 \
    --table save \
    --export-dir /user/hive/warehouse/track_log.db/result \
    --input-fields-terminated-by '\001' \
    --num-mappers 1


| date     | hour | pv    | uv    |
| 20150828 | 18   | 64972 | 23938 |
| 20150828 | 19   | 61162 | 22330 |
