一、项目要求
- 本文讨论的日志处理方法中的日志,仅指Web日志。其实并没有精确的定义,可能包括但不限于各种前端Web服务器——apache、lighttpd、nginx、tomcat等产生的用户访问日志,以及各种Web应用程序自己输出的日志。
二、需求分析: KPI指标设计
PV(PageView): 页面访问量统计
IP: 页面独立IP的访问量统计
Time: 用户每小时PV的统计
Source: 用户来源域名的统计
Browser: 用户的访问设备统计
下面我着重分析浏览器统计
三、分析过程
1、 日志的一条nginx记录内容
222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939
"http://www.angularjs.cn/A00n"
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
2、对上面的日志记录进行分析
remote_addr : 记录客户端的ip地址, 222.68.172.190
remote_user : 记录客户端用户名称, -
time_local: 记录访问时间与时区, [18/Sep/2013:06:49:57 +0000]
request: 记录请求的url与http协议, "GET /images/my.jpg HTTP/1.1"
status: 记录请求状态,成功是200, 200
body_bytes_sent: 记录发送给客户端文件主体内容大小, 19939
http_referer: 用来记录从哪个页面链接访问过来的, "http://www.angularjs.cn/A00n"
http_user_agent: 记录客户浏览器的相关信息, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
3、java语言分析上面一条日志记录(使用空格切分)
测试结果: 6、算法模型: 并行算法 Browser: 用户的访问设备统计 8、输出文件log_kpi/browerSimpleV内容 AOL Explorer 1 8 R制作图片 data<-read.table(file="borwer.txt",header=FALSE,sep=",") names(data)<-c("borwer","num") qplot(borwer,num,data=data,geom="bar") 解决问题 1、排除爬虫和程序点击,对抗作弊 解决办法:页面做个检测鼠标是否动。 2、浏览量 怎么排除图片 3、浏览量排除假点击? 4、哪一个搜索引擎访问的? 5、点击哪一个关键字访问的? 6、从哪一个地方访问的? 7、使用哪一个浏览器访问的? String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";String[] elementList = line.split(" ");for(int i=0;i
4、实体Kpi类的代码: 0 : 222.68.172.190
1 : -
2 : -
3 : [18/Sep/2013:06:49:57
4 : +0000]
5 : "GET
6 : /images/my.jpg
7 : HTTP/1.1"
8 : 200
9 : 19939
10 : "http://www.angularjs.cn/A00n"
11 : "Mozilla/5.0
12 : (Windows
13 : NT
14 : 6.1)
15 : AppleWebKit/537.36
16 : (KHTML,
17 : like
18 : Gecko)
19 : Chrome/29.0.1547.66
20 : Safari/537.36"
// Article heading "5. The KPI utility class" preceded this class in the original post.

/**
 * Plain data holder for the fields of one web-server access-log record.
 *
 * <p>Every field is stored as the raw {@code String} handed to its setter; this class
 * performs no parsing or validation of its own.
 */
public class Kpi {

    /** Client IP address. */
    private String remote_addr;

    /** Client user name; the log writes "-" when it is absent. */
    private String remote_user;

    /** Access time and time zone. */
    private String time_local;

    /** Requested URL and HTTP protocol. */
    private String request;

    /** Response status; 200 means success. */
    private String status;

    /** Size of the response body sent to the client. */
    private String body_bytes_sent;

    /** Page the request was linked from. */
    private String http_referer;

    /** Information about the client's browser. */
    private String http_user_agent;

    /** Request method, e.g. GET or POST. */
    private String method;

    /** HTTP version. */
    private String http_version;

    public String getMethod() {
        return method;
    }

    public void setMethod(String method) {
        this.method = method;
    }

    public String getHttp_version() {
        return http_version;
    }

    public void setHttp_version(String http_version) {
        this.http_version = http_version;
    }

    public String getRemote_addr() {
        return remote_addr;
    }

    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }

    public String getRemote_user() {
        return remote_user;
    }

    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }

    public String getTime_local() {
        return time_local;
    }

    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }

    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }

    public String getHttp_referer() {
        return http_referer;
    }

    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }

    public String getHttp_user_agent() {
        return http_user_agent;
    }

    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    /**
     * Renders all ten fields as {@code Kpi [name=value, ...]}; unset fields print as
     * {@code null}.
     */
    @Override
    public String toString() {
        return "Kpi [remote_addr=" + remote_addr
                + ", remote_user=" + remote_user
                + ", time_local=" + time_local
                + ", request=" + request
                + ", status=" + status
                + ", body_bytes_sent=" + body_bytes_sent
                + ", http_referer=" + http_referer
                + ", http_user_agent=" + http_user_agent
                + ", method=" + method
                + ", http_version=" + http_version + "]";
    }
}
package org.aaa.kpi;public class KpiUtil {/**** line记录转化成kpi对象* &#64;param line 日志的一条记录* &#64;author tianbx* */public static Kpi transformLineKpi(String line){String[] elementList &#61; line.split(" ");Kpi kpi &#61; new Kpi();kpi.setRemote_addr(elementList[0]);kpi.setRemote_user(elementList[1]);kpi.setTime_local(elementList[3].substring(1));kpi.setMethod(elementList[5].substring(1));kpi.setRequest(elementList[6]);kpi.setHttp_version(elementList[7]);kpi.setStatus(elementList[8]);kpi.setBody_bytes_sent(elementList[9]);kpi.setHttp_referer(elementList[10]);kpi.setHttp_user_agent(elementList[11] &#43; " " &#43; elementList[12]);return kpi;}
}
– Map: {key:$http_user_agent,value:1}
– Reduce: {key:$http_user_agent,value:求和(sum)}
7、map-reduce分析代码 import java.io.IOException;
import java.util.Iterator;import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.hmahout.kpi.entity.Kpi;
import org.hmahout.kpi.util.KpiUtil;import cz.mallat.uasparser.UASparser;
import cz.mallat.uasparser.UserAgentInfo;public class KpiBrowserSimpleV {public static class KpiBrowserSimpleMapper extends MapReduceBase implements Mapper {UASparser parser &#61; null;&#64;Overridepublic void map(Object key, Text value,OutputCollector
Android Webkit 123
Chrome 4867
CoolNovo 23
Firefox 1700
Google App Engine 5
IE 1521
Jakarta Commons-HttpClient 3
Maxthon 27
Mobile Safari 273
Mozilla 130
Openwave Mobile Browser 2
Opera 2
Pale Moon 1
Python-urllib 4
Safari 246
Sogou Explorer 157
unknown 4685