案例:
使用电商网站的用户行为日志进行统计分析
一:准备
1.指标
PV:网页流浪量
UV:独立访客数
VV:访客的访问数,session次数
IP:独立的IP数
2.上传测试数据
3.查看第一条记录
注意点(字符显示):
二:程序
1.分析
省份ID-》key
value-》1
-》 <proviced,list(1,1,1)>
2.数据类型
key:Text
value:IntWritable
3.map 端的业务
4.reduce端的业务
5.整合运行
6.结果
三:计数器
1.程序
2.结果
结果完全吻合。
四:完整程序
1.PV程序
package com.senior.network; import java.io.IOException; import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner; public class WebPvCount extends Configured implements Tool{
//Mapper
public static class WebPvCountMapper extends Mapper<LongWritable,Text,IntWritable,IntWritable>{
private IntWritable mapoutputkey=new IntWritable();
private static final IntWritable mapoutputvalue=new IntWritable(1);
@Override
protected void cleanup(Context context) throws IOException,InterruptedException { }
@Override
protected void setup(Context context) throws IOException,InterruptedException { } @Override
protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {
String lineValue=value.toString();
String[] strs=lineValue.split("\t");
if(30>strs.length){
return;
}
String priviceIdValue=strs[23];
String urlValue=strs[1];
if(StringUtils.isBlank(priviceIdValue)){
return;
}
if(StringUtils.isBlank(urlValue)){
return;
}
Integer priviceId=Integer.MAX_VALUE;
try{
priviceId=Integer.valueOf(priviceIdValue);
}catch(Exception e){
e.printStackTrace();
}
mapoutputkey.set(priviceId);
context.write(mapoutputkey, mapoutputvalue);
} } //Reducer
public static class WebPvCountReducer extends Reducer<IntWritable,IntWritable,IntWritable,IntWritable>{
private IntWritable outputvalue=new IntWritable();
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values,Context context)throws IOException, InterruptedException {
int sum=0;
for(IntWritable value : values){
sum+=value.get();
}
outputvalue.set(sum);
context.write(key, outputvalue);
} } //Driver
public int run(String[] args)throws Exception{
Configuration conf=this.getConf();
Job job=Job.getInstance(conf,this.getClass().getSimpleName());
job.setJarByClass(WebPvCount.class);
//input
Path inpath=new Path(args[0]);
FileInputFormat.addInputPath(job, inpath); //output
Path outpath=new Path(args[1]);
FileOutputFormat.setOutputPath(job, outpath); //map
job.setMapperClass(WebPvCountMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class); //shuffle //reduce
job.setReducerClass(WebPvCountReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class); //submit
boolean isSucess=job.waitForCompletion(true);
return isSucess?0:1;
} //main
public static void main(String[] args)throws Exception{
Configuration conf=new Configuration();
//compress
conf.set("mapreduce.map.output.compress", "true");
conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
args=new String[]{
"hdfs://linux-hadoop01.ibeifeng.com:8020/user/beifeng/mapreduce/wordcount/inputWebData",
"hdfs://linux-hadoop01.ibeifeng.com:8020/user/beifeng/mapreduce/wordcount/outputWebData1"
};
int status=ToolRunner.run(new WebPvCount(), args);
System.exit(status);
} }
2.计数器
这个计数器集中在mapper端。
package com.senior.network; import java.io.IOException; import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner; public class WebPvCount extends Configured implements Tool{
//Mapper
public static class WebPvCountMapper extends Mapper<LongWritable,Text,IntWritable,IntWritable>{
private IntWritable mapoutputkey=new IntWritable();
private static final IntWritable mapoutputvalue=new IntWritable(1);
@Override
protected void cleanup(Context context) throws IOException,InterruptedException { }
@Override
protected void setup(Context context) throws IOException,InterruptedException { } @Override
protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {
String lineValue=value.toString();
String[] strs=lineValue.split("\t");
if(30>strs.length){
context.getCounter("webPvMapper_counter", "length_LT_30").increment(1L);
return;
}
String priviceIdValue=strs[23];
String urlValue=strs[1];
if(StringUtils.isBlank(priviceIdValue)){
context.getCounter("webPvMapper_counter", "priviceIdValue_null").increment(1L);
return; }
if(StringUtils.isBlank(urlValue)){
context.getCounter("webPvMapper_counter", "url_null").increment(1L);
return;
}
Integer priviceId=Integer.MAX_VALUE;
try{
priviceId=Integer.valueOf(priviceIdValue);
}catch(Exception e){
context.getCounter("webPvMapper_counter", "switch_fail").increment(1L);
e.printStackTrace();
}
mapoutputkey.set(priviceId);
context.write(mapoutputkey, mapoutputvalue);
} } //Reducer
public static class WebPvCountReducer extends Reducer<IntWritable,IntWritable,IntWritable,IntWritable>{
private IntWritable outputvalue=new IntWritable();
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values,Context context)throws IOException, InterruptedException {
int sum=0;
for(IntWritable value : values){
sum+=value.get();
}
outputvalue.set(sum);
context.write(key, outputvalue);
} } //Driver
public int run(String[] args)throws Exception{
Configuration conf=this.getConf();
Job job=Job.getInstance(conf,this.getClass().getSimpleName());
job.setJarByClass(WebPvCount.class);
//input
Path inpath=new Path(args[0]);
FileInputFormat.addInputPath(job, inpath); //output
Path outpath=new Path(args[1]);
FileOutputFormat.setOutputPath(job, outpath); //map
job.setMapperClass(WebPvCountMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class); //shuffle //reduce
job.setReducerClass(WebPvCountReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class); //submit
boolean isSucess=job.waitForCompletion(true);
return isSucess?0:1;
} //main
public static void main(String[] args)throws Exception{
Configuration conf=new Configuration();
//compress
conf.set("mapreduce.map.output.compress", "true");
conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
args=new String[]{
"hdfs://linux-hadoop01.ibeifeng.com:8020/user/beifeng/mapreduce/wordcount/inputWebData",
"hdfs://linux-hadoop01.ibeifeng.com:8020/user/beifeng/mapreduce/wordcount/outputWebData2"
};
int status=ToolRunner.run(new WebPvCount(), args);
System.exit(status);
} }