数据源是:
baidu CN A E [17/Jul/2018:17:07:50 +0800] 2 223.104.18.110 - 112.29.213.35:80 0 v2.go2yd.com GET http://v1.go2yd.com/user_upload/1531633977627104fdecdc68fe7a2c4b96b2226fd3f4c.mp4_bd.mp4 HTTP/1.1 - bytes 13869056-13885439/25136186 TCP_HIT/206 112.29.213.35 video/mp4 17168 16384 -:0 0 0 - - - 11451601 - "JSP3/2.0.14" "-" "-" "-" http - 2 v1.go2yd.com 0.002 25136186 16384 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1531818470104-11451601-112.29.213.66#2705261172 644514568
各字段之间的分隔符是制表符 "\t"(上面示例中显示为空格)。
现在需要将其过滤为 “cdn+region+level+time+ip+domain+url+traffic” 的格式(字段之间同样以制表符分隔),具体数据为:
baidu CN E 20180717170750 223.104.18.110 v2.go2yd.com http://v1.go2yd.com/user_upload/1531633977627104fdecdc68fe7a2c4b96b2226fd3f4c.mp4_bd.mp4 16384
实现方法
package utils;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;
/**
 * Reduces a raw, tab-separated CDN access-log line to the eight fields
 * {@code cdn, region, level, time, ip, domain, url, traffic}.
 */
public class LogUtils {

    /** Separator used by the raw log line and by the produced record. */
    private static final String SEPARATOR = "\t";

    // Zero-based positions of the wanted fields inside the raw line.
    private static final int CDN_INDEX = 0;
    private static final int REGION_INDEX = 1;
    private static final int LEVEL_INDEX = 3;
    private static final int TIME_INDEX = 4;
    private static final int IP_INDEX = 6;
    private static final int DOMAIN_INDEX = 10;
    private static final int URL_INDEX = 12;
    private static final int TRAFFIC_INDEX = 20;

    /** A line must reach at least the traffic column to be usable. */
    private static final int MIN_FIELD_COUNT = TRAFFIC_INDEX + 1;

    /** Layout of the timestamp inside the brackets, e.g. {@code 17/Jul/2018:17:07:50}. */
    private static final String SOURCE_TIME_PATTERN = "dd/MMM/yyyy:HH:mm:ss";

    /** Layout of the timestamp in the produced record, e.g. {@code 20180717170750}. */
    private static final String TARGET_TIME_PATTERN = "yyyyMMddHHmmss";

    /**
     * Extracts the wanted fields from one raw log line.
     *
     * <p>The timestamp is only re-formatted: the wall-clock value written in
     * the log is kept and its UTC offset (e.g. {@code +0800}) is dropped, not
     * applied.
     *
     * @param log one raw log line whose fields are separated by tabs
     * @return the tab-separated record
     *         {@code cdn, region, level, time, ip, domain, url, traffic}, or
     *         {@code null} when {@code log} is {@code null}, has fewer than
     *         21 fields, or carries a timestamp that cannot be parsed
     */
    public String parse(String log) {
        if (log == null) {
            return null;
        }

        // Limit -1 keeps trailing empty fields so column positions never shift.
        String[] splits = log.split(SEPARATOR, -1);
        if (splits.length < MIN_FIELD_COUNT) {
            // Truncated or foreign line: report "unusable" instead of throwing.
            return null;
        }

        String time = reformatTime(splits[TIME_INDEX]);
        if (time == null) {
            return null;
        }

        StringBuilder builder = new StringBuilder();
        builder.append(splits[CDN_INDEX]).append(SEPARATOR)
                .append(splits[REGION_INDEX]).append(SEPARATOR)
                .append(splits[LEVEL_INDEX]).append(SEPARATOR)
                .append(time).append(SEPARATOR)
                .append(splits[IP_INDEX]).append(SEPARATOR)
                .append(splits[DOMAIN_INDEX]).append(SEPARATOR)
                .append(splits[URL_INDEX]).append(SEPARATOR)
                .append(splits[TRAFFIC_INDEX]);
        return builder.toString();
    }

    /**
     * Converts {@code [17/Jul/2018:17:07:50 +0800]} into {@code 20180717170750}.
     *
     * @param bracketed the raw time field, normally wrapped in square brackets
     *                  and followed by a UTC offset
     * @return the re-formatted timestamp, or {@code null} if it cannot be parsed
     */
    private static String reformatTime(String bracketed) {
        String raw = bracketed.trim();
        if (raw.startsWith("[")) {
            raw = raw.substring(1);
        }
        // The date-time part ends at the blank before the offset, or at the
        // closing bracket when no offset is present.
        int end = raw.indexOf(' ');
        if (end < 0) {
            end = raw.indexOf(']');
        }
        if (end >= 0) {
            raw = raw.substring(0, end);
        }

        // SimpleDateFormat is not thread-safe, so instances are created per call.
        // Both sides use a fixed locale: the default locale may select a
        // non-Gregorian calendar (th-TH would print the year 2561).
        DateFormat sourceFormat = new SimpleDateFormat(SOURCE_TIME_PATTERN, Locale.ENGLISH);
        DateFormat targetFormat = new SimpleDateFormat(TARGET_TIME_PATTERN, Locale.ENGLISH);

        // Both sides use UTC so the value round-trips unchanged; with the JVM
        // default zone a wall-clock time inside a DST gap would be shifted.
        java.util.TimeZone utc = java.util.TimeZone.getTimeZone("UTC");
        sourceFormat.setTimeZone(utc);
        targetFormat.setTimeZone(utc);

        try {
            return targetFormat.format(sourceFormat.parse(raw));
        } catch (ParseException ignored) {
            // Malformed timestamps are expected in raw logs; null tells the
            // caller to skip the line.
            return null;
        }
    }
}
测试代码:
package com.jeff.hadoop01.utils;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import utils.LogUtils;
/**
 * Unit test for {@link LogUtils#parse(String)}.
 */
public class TestLogUtils {

    /** Raw sample line; its fields are separated by tabs. */
    private static final String SAMPLE_LOG =
            "baidu\tCN\tA\tE\t[17/Jul/2018:17:07:50 +0800]\t2\t223.104.18.110\t-\t112.29.213.35:80\t0\tv2.go2yd.com\tGET\thttp://v1.go2yd.com/user_upload/1531633977627104fdecdc68fe7a2c4b96b2226fd3f4c.mp4_bd.mp4\tHTTP/1.1\t-\tbytes 13869056-13885439/25136186\tTCP_HIT/206\t112.29.213.35\tvideo/mp4\t17168\t16384\t-:0\t0\t0\t-\t-\t-\t11451601\t-\t\"JSP3/2.0.14\"\t\"-\"\t\"-\"\t\"-\"\thttp\t-\t2\tv1.go2yd.com\t0.002\t25136186\t16384\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t1531818470104-11451601-112.29.213.66#2705261172\t644514568";

    /** Record expected for {@link #SAMPLE_LOG}: cdn, region, level, time, ip, domain, url, traffic. */
    private static final String EXPECTED_RECORD =
            "baidu\tCN\tE\t20180717170750\t223.104.18.110\tv2.go2yd.com\thttp://v1.go2yd.com/user_upload/1531633977627104fdecdc68fe7a2c4b96b2226fd3f4c.mp4_bd.mp4\t16384";

    private LogUtils utils;

    /** Creates a fresh parser before every test. */
    @Before
    public void setUp() {
        utils = new LogUtils();
    }

    /** Drops the parser reference after every test. */
    @After
    public void tearDown() {
        utils = null;
    }

    /** Checks that the sample line is reduced to exactly the expected record. */
    @Test
    public void testLogParse() {
        String actual = utils.parse(SAMPLE_LOG);
        // Still printed for manual inspection; the assertion is what makes the test fail.
        System.out.println(actual);
        Assert.assertEquals(EXPECTED_RECORD, actual);
    }
}