教程目录
0x00 教程内容- 大数据日志分析系统简介
- UserAgentParser的使用
- 实战准备
- 项目实战
- 结果展示
1. 需求
a. 简单统计网站的访问日志中每个浏览器的访问次数
2. 背景及架构
a. 请参考文章:大数据日志分析系统背景及架构
0x02 UserAgentParser
1. UserAgentParser的介绍
a. 可以用来解析http user-agent信息的小工具(别人写好的小项目)
2. user-agent信息
a. 信息样式Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36
b. 查看user-agent(进入网站按F12进入检查界面,刷新一下)
1. 下载UserAgentParser小工具
a. 下载地址(可以用git或者直接下载压缩包然后解压):
https://github.com/LeeKemp/UserAgentParser
2. 安装对应的jar包到本地Maven仓库
a. 用Maven打包小工具成jar包(进入主目录,如:E:\workspace\UserAgentParser-master):mvn clean package -DskipTests
b. 安装jar包到本地Maven仓库:mvn clean install -DskipTests
1. 构建项目
a. 可参考此文章的"0x01 新建maven工程"一节:Java API实现HDFS的相关操作
2. 引入依赖
a. 引入依赖(如果没有先将此jar包安装到本地Maven仓库,是无法引入的)
<!-- 添加UserAgent的依赖 -->
<dependency>
<groupId>com.kumkee</groupId>
<artifactId>UserAgentParser</artifactId>
<version>0.0.1</version>
</dependency>
b. 完整的依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Project coordinates for the tutorial's MapReduce job. -->
<modelVersion>4.0.0</modelVersion>
<groupId>com.shaonaiyi.hadoop</groupId>
<artifactId>hadoop-learning</artifactId>
<version>1.0</version>
<name>hadoop-learning</name>
<!-- FIXME change it to the project's website -->
<url>http://www.example.com</url>
<properties>
<!-- Single place to set the Hadoop version; referenced by the hadoop-client dependency below. -->
<hadoop-version>2.7.5</hadoop-version>
</properties>
<dependencies>
<!-- Hadoop client dependency -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop-version}</version>
</dependency>
<!-- UserAgentParser dependency; it resolves only after the tool has been installed into the local Maven repository -->
<dependency>
<groupId>com.kumkee</groupId>
<artifactId>UserAgentParser</artifactId>
<version>0.0.1</version>
</dependency>
<!-- Unit-test dependency (test scope only) -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Assembly plugin configured with the jar-with-dependencies descriptor, used to build the jar that bundles external dependencies. -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<!-- Left empty: the tutorial passes the main class explicitly on the hadoop jar command line. -->
<mainClass></mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</plugin>
</plugins>
</build>
</project>
3. 编写测试代码
a. 在java的测试目录创建一个测试包(com.shaonaiyi.hadoop.project):
b. 新建UserAgentTest测试类:
package com.shaonaiyi.hadoop.project;
import com.kumkee.userAgent.UserAgent;
import com.kumkee.userAgent.UserAgentParser;
/**
 * Manual check of the UserAgentParser library: parses one sample user-agent
 * string and prints every field the parsed result exposes.
 *
 * <p>Created 2019/03/27 14:45.
 *
 * @author 邵奈一
 */
public class UserAgentTest {

    public static void main(String[] args) {
        String agentSource = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36";
        UserAgent parsed = new UserAgentParser().parse(agentSource);

        // Read every field first, in a fixed order, then print them.
        String browserName = parsed.getBrowser();
        String engineName = parsed.getEngine();
        String engineRelease = parsed.getEngineVersion();
        String osName = parsed.getOs();
        String platformName = parsed.getPlatform();
        boolean mobile = parsed.isMobile();
        String browserRelease = parsed.getVersion();

        show("浏览器:", browserName);
        show("引擎:", engineName);
        show("引擎版本:", engineRelease);
        show("操作系统:", osName);
        show("平台:", platformName);
        show("是否为移动设备:", mobile);
        show("版本号:", browserRelease);
    }

    /** Prints {@code label} immediately followed by the string form of {@code value}. */
    private static void show(String label, Object value) {
        System.out.println(label + value);
    }
}
c. 执行测试代码,可看到结果:
4. 编写实战代码
a. 新建包
b. 新建ParseUserAgentApp类:
package com.shaonaiyi.hadoop.project;
import com.kumkee.userAgent.UserAgent;
import com.kumkee.userAgent.UserAgentParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * MapReduce job that counts how many times each browser appears in a web
 * access log.
 *
 * <p>Each log line is expected to carry the user-agent string after its 7th
 * double quote. The mapper emits {@code (browser, 1)} per line and the reducer
 * sums the counts per browser.
 *
 * <p>Usage: {@code hadoop jar <jar> com.shaonaiyi.hadoop.project.ParseUserAgentApp <input> <output>}
 *
 * <p>Created 2019/03/27 14:54.
 *
 * @author 邵奈一
 */
public class ParseUserAgentApp {

    /** 1-based ordinal of the double quote that opens the user-agent field in a log line. */
    private static final int USER_AGENT_QUOTE_ORDINAL = 7;

    /** Mapper: extracts the user-agent from each log line and emits {@code (browser, 1)}. */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        LongWritable one = new LongWritable(1);
        // Reused across map() calls to avoid allocating a new Text per record.
        private final Text browserKey = new Text();
        private UserAgentParser userAgentParser;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            userAgentParser = new UserAgentParser();
        }

        /**
         * Parses one log line and emits its browser name with a count of one.
         * Lines with fewer than seven double quotes are skipped instead of failing the task.
         *
         * @param key     input key supplied by the framework (unused)
         * @param value   one line of the access log
         * @param context sink for the {@code (browser, 1)} pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            int quotePos = getCharacterPosition(line, "\"", USER_AGENT_QUOTE_ORDINAL);
            if (quotePos < 0) {
                // Malformed or truncated line: there is no user-agent field to parse.
                return;
            }
            // Everything after the opening quote is handed to the parser, as before.
            String agentSource = line.substring(quotePos + 1);
            UserAgent agent = userAgentParser.parse(agentSource);
            browserKey.set(agent.getBrowser());
            context.write(browserKey, one);
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            userAgentParser = null;
        }
    }

    /** Reducer: sums the per-browser counts emitted by {@link MyMapper}. */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Accumulate in a long: the values are longs, and an int accumulator
            // would be silently narrowed by += and could overflow.
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    /**
     * Finds the position of the {@code index}-th non-overlapping occurrence of a
     * literal marker inside a string.
     *
     * @param value    the string to search
     * @param operator the literal marker to look for
     * @param index    1-based ordinal of the occurrence wanted
     * @return the start offset of that occurrence, or {@code -1} if {@code value}
     *         contains fewer than {@code index} occurrences or the arguments are
     *         null, empty, or non-positive
     */
    private static int getCharacterPosition(String value, String operator, int index) {
        if (value == null || operator == null || operator.isEmpty() || index <= 0) {
            return -1;
        }
        int from = 0;
        int pos = -1;
        for (int found = 0; found < index; found++) {
            pos = value.indexOf(operator, from);
            if (pos < 0) {
                return -1;
            }
            // Continue after the whole marker so occurrences never overlap.
            from = pos + operator.length();
        }
        return pos;
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be set up or run
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: ParseUserAgentApp <input path> <output path>");
            System.exit(2);
        }
        Configuration configuration = new Configuration();

        // Remove a pre-existing output directory first; the path's own file system
        // is used so that a fully qualified output URI also works.
        Path outputPath = new Path(args[1]);
        FileSystem fileSystem = outputPath.getFileSystem(configuration);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
            System.out.println("路径存在,但已被删除");
        }

        Job job = Job.getInstance(configuration, "ParseUserAgentApp");
        job.setJarByClass(ParseUserAgentApp.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, outputPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
ps:代码其实是此教程:MapReduce入门例子之单词计数 改写来的,请查看学习!
c. 打包(怎么打包都可以,此处教一种新的打包方式)mvn assembly:assembly
1. 上传项目到服务器
a. 打包好项目后,可以在target目录看到有两个包:
hadoop-learning-1.0.jar:没有引入外部依赖的jar包(在本地windows可以用而已)
hadoop-learning-1.0-jar-with-dependencies.jar:引入了外部依赖的jar包(含小工具jar包)
说明:我们刚开始时打包的UserAgentParser-0.0.1.jar只是在windows系统本地打的包,并且只安装到了windows系统本地的Maven仓库,服务器上实际并没有这个包,所以要将hadoop-learning-1.0-jar-with-dependencies.jar拷贝到服务器使用;不然的话,也要在服务器的Maven仓库安装好UserAgentParser-0.0.1.jar才行。
b. 上传项目到服务器
[hadoop-sny@master mr]$ pwd
/home/hadoop-sny/mr
[hadoop-sny@master mr]$ ll
total 352752
-rw-rw-r--. 1 hadoop-sny hadoop-sny 321100030 Dec 13 18:51 big_file_again.txt
-rw-rw-r--. 1 hadoop-sny hadoop-sny 8837 Mar 22 19:34 hadoop-learning-1.0.jar
-rw-rw-r--. 1 hadoop-sny hadoop-sny 39193853 Mar 27 15:34 hadoop-learning-1.0-jar-with-dependencies.jar
-rw-rw-r--. 1 hadoop-sny hadoop-sny 903971 Dec 19 10:45 mapreduce-course-1.0-SNAPSHOT.jar
-rw-rw-r--. 1 hadoop-sny hadoop-sny 30 Dec 19 17:12 small_file.txt
2. 上传日志文件到HDFS
a. 日志样式,如果没有文件,可以自己复制多几次写成文件来使用:
183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getadv HTTP/1.1" 200 813 "www.imooc.com" "-" cid=0&timestamp=1478707261865&uid=2871142&marking=androidbanner&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=f51e97d1cb1a9caac669ea8acc162b96 "mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.134.244:80 200 0.027 0.027
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
117.35.88.11 - - [10/Nov/2016:00:01:02 +0800] "GET /article/ajaxcourserecommends?id=124 HTTP/1.1" 200 2345 "www.imooc.com" "http://www.imooc.com/code/1852" - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36" "-" 10.100.136.65:80 200 0.616 0.616
182.106.215.93 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.mukewang.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.004 0.004
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/userdynamic HTTP/1.1" 200 19501 "www.imooc.com" "-" cid=0&timestamp=1478707261847&uid=2871142&touid=2871142&page=1&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=3837a5bf27ea718fe18bda6c53fbbc14 "mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.136.65:80 200 0.195 0.195
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
114.248.161.26 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getcourseintro HTTP/1.1" 200 2510 "www.imooc.com" "-" cid=283&secrect=86b720f312c2b25da3b20e59e7c89780&timestamp=1478707261951&token=4c144b3f4314178b9527d1e91ecc0fac&uid=3372975 "mukewang/5.0.2 (iPhone; iOS 8.4.1; Scale/2.00)" "-" 10.100.136.65:80 200 0.007 0.008
120.52.94.105 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getmediainfo_ver2 HTTP/1.1" 200 633 "www.imooc.com" "-" cid=608&secrect=e25994750eb2bbc7ade1a36708b999a5&timestamp=1478707261945&token=9bbdba949aec02735e59e0868b538e19&uid=4203162 "mukewang/5.0.2 (iPhone; iOS 10.0.1; Scale/3.00)" "-" 10.100.136.65:80 200 0.049 0.049
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
112.10.136.45 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.mukewang.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.006 0.006
211.162.33.31 - - [10/Nov/2016:00:01:02 +0800] "GET /u/card HTTP/1.1" 200 331 "www.imooc.com" "http://www.imooc.com/code/2053" - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36" "-" 10.100.136.65:80 200 0.371 0.371
116.22.196.70 - - [10/Nov/2016:00:01:02 +0800] "POST /course/ajaxmediauser HTTP/1.1" 200 54 "www.imooc.com" "http://www.imooc.com/code/3500" mid=3500&time=60 "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0" "-" 10.100.134.244:80 200 0.026 0.026
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
113.47.86.12 - - [10/Nov/2016:00:01:02 +0800] "GET /socket.io/1/websocket/eHBhkZC47oY64iLMMeXm HTTP/1.1" 101 125 "chat.mukewang.com" "-" - "-" "-" 10.100.15.239:80 101 277.433 277.433
119.130.229.90 - - [10/Nov/2016:00:01:02 +0800] "POST /course/ajaxmediauser HTTP/1.1" 200 54 "www.imooc.com" "http://www.imooc.com/code/547" mid=547&time=60 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" "-" 10.100.136.65:80 200 0.021 0.021
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
120.52.94.105 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getrelevantcourse HTTP/1.1" 200 774 "www.imooc.com" "-" cid=608&secrect=e25994750eb2bbc7ade1a36708b999a5&timestamp=1478707262003&token=2b865e78535436df02fd3f986bb0cc08&uid=4203162 "mukewang/5.0.2 (iPhone; iOS 10.0.1; Scale/3.00)" "-" 10.100.136.65:80 200 0.048 0.048
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
183.44.115.163 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/savemediafinish HTTP/1.1" 200 103 "www.imooc.com" "-" is_offline=0&time=0&mid=2312&secrect=cc8506ee27115cd3c9d617730ea600d9&cid=0&plat_id=5&timestamp=1478707261086&uid=4356276&stay_time=0&token=22e4a2ec2c40a7c4375651c5020e7023 "mukewang/5.0.1 (Android 5.0.2; Xiaomi Redmi Note 2 Build/LRX22G),Network WIFI" "-" 10.100.136.64:80 200 0.068 0.068
211.162.33.31 - - [10/Nov/2016:00:01:02 +0800] "POST /course/ajaxusermediasstatus?cid=9 HTTP/1.1" 200 2954 "www.imooc.com" "http://www.imooc.com/code/2053" - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36" "-" 10.100.136.64:80 200 0.030 0.030
218.58.205.220 - - [10/Nov/2016:00:01:02 +0800] "HEAD /favicon.ico HTTP/1.1" 404 0 "chat.mukewang.com" "-" - "Go-http-client/1.1" "-" 10.100.15.239:80 404 0.009 0.009
114.246.57.116 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/userinfo HTTP/1.1" 200 151 "www.imooc.com" "-" secrect=9455e4679d68f107477a27d69cdf753c&timestamp=1478707262002&token=73bdcb218e48acd4869826afa320baf4&uid=4132795&uuid=0dd9c37bf4ac75031158349738b7612b "mukewang/5.0.2 (iPhone; iOS 10.1.1; Scale/2.00)" "-" 10.100.136.64:80 200 0.070 0.071
218.58.205.245 - - [10/Nov/2016:00:01:02 +0800] "HEAD /favicon.ico HTTP/1.1" 404 0 "chat.mukewang.com" "-" - "Go-http-client/1.1" "-" 10.100.15.239:80 404 0.002 0.002
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
112.253.38.168 - - [10/Nov/2016:00:01:02 +0800] "HEAD /favicon.ico HTTP/1.1" 404 0 "chat.mukewang.com" "-" - "Go-http-client/1.1" "-" 10.100.15.239:80 404 0.024 0.024
218.58.205.204 - - [10/Nov/2016:00:01:02 +0800] "HEAD /favicon.ico HTTP/1.1" 404 0 "chat.mukewang.com" "-" - "Go-http-client/1.1" "-" 10.100.15.239:80 404 0.023 0.023
112.253.38.159 - - [10/Nov/2016:00:01:02 +0800] "HEAD /favicon.ico HTTP/1.1" 404 0 "chat.mukewang.com" "-" - "Go-http-client/1.1" "-" 10.100.15.239:80 404 0.024 0.024
218.58.205.252 - - [10/Nov/2016:00:01:02 +0800] "HEAD /favicon.ico HTTP/1.1" 404 0 "chat.mukewang.com" "-" - "Go-http-client/1.1" "-" 10.100.15.239:80 404 0.025 0.025
119.184.176.131 - - [10/Nov/2016:00:01:02 +0800] "HEAD /favicon.ico HTTP/1.1" 404 0 "chat.mukewang.com" "-" - "Go-http-client/1.1" "-" 10.100.15.239:80 404 0.003 0.003
223.104.31.75 - - [10/Nov/2016:00:01:02 +0800] "GET /socket.io/1/websocket/szGk1G7hrpIe6RWHMfLK HTTP/1.1" 101 91 "chat.mukewang.com" "-" - "-" "-" 10.100.15.239:80 101 30.068 30.068
218.58.205.216 - - [10/Nov/2016:00:01:02 +0800] "HEAD /favicon.ico HTTP/1.1" 404 0 "chat.mukewang.com" "-" - "Go-http-client/1.1" "-" 10.100.15.239:80 404 0.022 0.022
183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/beta HTTP/1.1" 200 16950 "www.imooc.com" "-" cid=0&timestamp=1478707261842&uid=2871142&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=4ea00393c5ac3588c5317cf9f28013fa "mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.136.65:80 200 0.377 0.377
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
106.39.41.166 - - [10/Nov/2016:00:01:02 +0800] "POST /course/ajaxmediauser/ HTTP/1.1" 200 54 "www.imooc.com" "http://www.imooc.com/video/8701" mid=8701&time=120.0010000000002&learn_time=16.1 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0" "-" 10.100.136.64:80 200 0.016 0.016
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/searchindex HTTP/1.1" 200 1484 "www.imooc.com" "-" cid=0&words=premiere&timestamp=1478707261876&uid=2871142&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=1b4fcde08cb054e9077b2f316a7da0b0 "mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.136.65:80 200 0.110 0.110
39.186.247.142 - - [10/Nov/2016:00:01:02 +0800] "GET /video/3237 HTTP/1.1" 200 7227 "www.imooc.com" "http://www.imooc.com/ceping/4191" - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36" "-" 10.100.136.64:80 200 0.198 0.198
113.140.11.123 - - [10/Nov/2016:00:01:02 +0800] "POST /course/ajaxmediauser/ HTTP/1.1" 200 54 "www.imooc.com" "http://www.imooc.com/video/5915/0" mid=5915&time=60.01200000000006&learn_time=284.9 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393" "-" 10.100.134.244:80 200 0.029 0.029
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
b. 上传日志到HDFS的/files目录(如果/files目录不存在,请先执行 hadoop fs -mkdir -p /files 创建):hadoop fs -put access.log /files
3. 执行项目
a. 请确保HDFS与YARN已启动,并确保HDFS的/files目录下有access.log文件
b. 进入jar包所在目录(此处为:/home/hadoop-sny/mr
)cd /home/hadoop-sny/mr
c. 执行:hadoop jar ./hadoop-learning-1.0-jar-with-dependencies.jar com.shaonaiyi.hadoop.project.ParseUserAgentApp /files/access.log /projectout
d. 查看统计结果hadoop fs -cat /projectout/*
结果显示:
[hadoop-sny@master mr]$ hadoop fs -cat /projectout/*
Chrome 2775
Firefox 327
MSIE 78
Safari 115
Unknown 6705
[hadoop-sny@master mr]$
0xFF 总结
- 同样,也可以进入YARN的WebUI界面 http://master:8088 ,查看执行的作业
- 实战的代码其实是此教程:MapReduce入门例子之单词计数 的进阶版,请跳转学习,一步一步升级打怪!
- 思考题:请尝试挖掘更多的业务,实现更多的需求,此处只是统计了一个浏览器的次数,其实通过我们的测试类,可以发现我们的业务不只有统计日志中浏览器出现的次数。
作者简介:邵奈一
大学大数据讲师、大学市场洞察者、专栏编辑
公众号、微博、CSDN:邵奈一
复制粘贴玩转大数据系列专栏已经更新完成,请跳转学习!