When analyzing URLs from nginx logs in Hive, you frequently need to parse the URL.
Hive's built-in parse_url function can do this, but it is strict about the input format and cannot be applied directly to the request field of an nginx log.
hive -e "select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST') from dual"
facebook.com
hive -e "select parse_url('facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST') from dual"
NULL
It can also be done with regexp_extract, but that requires writing a regular expression by hand, and the performance is not great.
hive -e "select regexp_extract('GET /vips-mobile/router.do?api_key=24415b921531551cb2ba756b885ce783&app_version=1.8.6&fields=sku_id HTTP/1.1','.+? +(.+?)app_version=(.+?)&(.+) .+?',2) from dual"
1.8.6
So I decided to write my own UDF, modeled on the built-in parse_url UDF. The code is as follows:
package com.hive.myudf;

import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.hive.ql.exec.UDF;

public class UDFNginxParseUrl extends UDF {
    // nginx logs omit the scheme, so prepend one before handing the URL to java.net.URL
    private static final String SCHEME = "http://";
    // matches an nginx request field: "METHOD /path?query PROTOCOL"
    private static final Pattern REQUEST_PATTERN = Pattern.compile("(.+?) +(.+?) (.+)");

    private Pattern p = null;
    private String lastKey = null;

    public UDFNginxParseUrl() {
    }

    public String evaluate(String host, String urlStr, String partToExtract) {
        if (host == null || urlStr == null || partToExtract == null) {
            return null;
        }
        Matcher m1 = REQUEST_PATTERN.matcher(urlStr);
        if (!m1.matches()) {
            return null;
        }
        URL url;
        try {
            // rebuild a full URL from the host column plus the request path
            url = new URL(SCHEME + host + m1.group(2));
        } catch (Exception e) {
            return null;
        }
        if (partToExtract.equals("HOST")) {
            return url.getHost();
        }
        if (partToExtract.equals("PATH")) {
            return url.getPath();
        }
        if (partToExtract.equals("QUERY")) {
            return url.getQuery();
        }
        if (partToExtract.equals("REF")) {
            return url.getRef();
        }
        if (partToExtract.equals("PROTOCOL")) {
            return url.getProtocol();
        }
        if (partToExtract.equals("FILE")) {
            return url.getFile();
        }
        if (partToExtract.equals("AUTHORITY")) {
            return url.getAuthority();
        }
        if (partToExtract.equals("USERINFO")) {
            return url.getUserInfo();
        }
        return null;
    }

    public String evaluate(String host, String urlStr, String partToExtract, String key) {
        if (!partToExtract.equals("QUERY")) {
            return null;
        }
        String query = this.evaluate(host, urlStr, partToExtract);
        if (query == null) {
            return null;
        }
        // cache the compiled key pattern across rows as long as the key does not change
        if (!key.equals(lastKey)) {
            p = Pattern.compile("(&|^)" + key + "=([^&]*)");
        }
        lastKey = key;
        Matcher m = p.matcher(query);
        if (m.find()) {
            return m.group(2);
        }
        return null;
    }
}
Register the UDF with add jar and create function, then test it:
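The registration statements might look like the following sketch; the jar path is a placeholder for wherever you package the class, while the function name and class name match the code above:

hive -e "add jar /path/to/nginx-parse-udf.jar;
create temporary function nginx_url_parse as 'com.hive.myudf.UDFNginxParseUrl';"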
hive -e "select nginx_url_parse('test.test.com','GET /vips-mobile/router.do?api_key=24415&app_version=1.8.6&fields=sku_id HTTP/1.1','HOST') FROM dual;"
test.test.com
hive -e "select nginx_url_parse('test.test.com','GET /vips-mobile/router.do?api_key=24415&app_version=1.8.6&fields=sku_id HTTP/1.1','QUERY','api_key') FROM dual;"
24415
With this, the UDF can be applied directly to nginx logs.
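As a sketch of that usage, assuming an nginx access-log table named nginx_log with host and request columns (both names are hypothetical and will differ per deployment), counting requests per api_key might look like:

hive -e "select nginx_url_parse(host, request, 'QUERY', 'api_key') as api_key, count(*) as pv
from nginx_log
group by nginx_url_parse(host, request, 'QUERY', 'api_key')"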
This article was originally published by 菜菜光 on the 51CTO blog. Original link: http://blog.51cto.com/caiguangguang/1350463. Please contact the original author for reprint permission.