Preface
If the dataset is small and the requirements are simple, we can run a MapReduce program directly in IDEA and read off the result. But when the dataset is large, the requirements pile up, and the code grows complicated, running everything from IDEA wastes a lot of time. In that case we should package the code into a jar and submit it to the Hadoop cluster, which saves considerable time.
I. Writing the MapReduce Program (a word-count-style example: course statistics)
1. Dataset and requirements
Dataset:
computer,huangxiaoming,85,86,41,75,93,42,85
computer,xuzheng,54,52,86,91,42
computer,huangbo,85,42,96,38
english,zhaobenshan,54,52,86,91,42,85,75
english,liuyifei,85,41,75,21,85,96,14
algorithm,liuyifei,75,85,62,48,54,96,15
computer,huangjiaju,85,75,86,85,85
english,liuyifei,76,95,86,74,68,74,48
english,huangdatou,48,58,67,86,15,33,85
algorithm,huanglei,76,95,86,74,68,74,48
algorithm,huangjiaju,85,75,86,85,85,74,86
computer,huangdatou,48,58,67,86,15,33,85
english,zhouqi,85,86,41,75,93,42,85,75,55,47,22
english,huangbo,85,42,96,38,55,47,22
algorithm,liutao,85,75,85,99,66
computer,huangzitao,85,86,41,75,93,42,85
math,wangbaoqiang,85,86,41,75,93,42,85
computer,liujialing,85,41,75,21,85,96,14,74,86
computer,liuyifei,75,85,62,48,54,96,15
computer,liutao,85,75,85,99,66,88,75,91
computer,huanglei,76,95,86,74,68,74,48
english,liujialing,75,85,62,48,54,96,15
math,huanglei,76,95,86,74,68,74,48
math,huangjiaju,85,75,86,85,85,74,86
math,liutao,48,58,67,86,15,33,85
english,huanglei,85,75,85,99,66,88,75,91
math,xuzheng,54,52,86,91,42,85,75
math,huangxiaoming,85,75,85,99,66,88,75,91
math,liujialing,85,86,41,75,93,42,85,75
english,huangxiaoming,85,86,41,75,93,42,85
algorithm,huangdatou,48,58,67,86,15,33,85
algorithm,huangzitao,85,86,41,75,93,42,85,75
Requirement: for each course, count the number of participants and compute the course's average score.
Analysis:
1. First compute each student's average score in the map phase.
2. Since we need a per-course participant count, the key the reducer receives must be the course name.
3. In the reducer, compute each course's average score and participant count. (A worked example follows this list.)
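To make step 1 concrete: for the first record, computer,huangxiaoming,85,86,41,75,93,42,85, the mapper emits the pair (computer, (85+86+41+75+93+42+85)/7 ≈ 72.43). The reducer for the key computer then receives one such per-student average for every computer record in the dataset, so the number of values it iterates over is exactly the participant count, and the mean of those values is the course average.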
2. POM dependencies
To write MapReduce programs in IDEA, the POM needs the following dependencies:
hadoop-hdfs, hadoop-client, hadoop-common
They can be found at https://mvnrepository.com/; make sure the versions match the Hadoop version installed locally.
Here is my pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>MapReduceTest</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
        </dependency>
    </dependencies>
</project>
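A small optional tweak: when the jar is later submitted with hadoop jar, the cluster's own Hadoop libraries are already on the classpath at runtime, so these three dependencies can be given provided scope to keep the packaged jar lean. A minimal sketch for one of them (the other two are analogous); note that to keep running the job inside IDEA afterwards, the run configuration must then include provided-scope dependencies:

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.2</version>
    <!-- assumption: the cluster supplies the Hadoop classes at runtime -->
    <scope>provided</scope>
</dependency>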
3. Writing the MapReduce code
package com.mandy;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class CountCourseNumScore {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Read one line (course,student,score,score,...) and compute the student's average score
            String[] split = value.toString().split(",");
            if (split.length >= 3) {
                double sum = 0;
                for (int i = 2; i < split.length; i++) {
                    // Convert each score to double and accumulate
                    sum += Double.parseDouble(split[i]);
                }
                int counts = split.length - 2;
                double avgScore = sum / counts;
                // Emit (course, student's average score)
                context.write(new Text(split[0]), new DoubleWritable(avgScore));
            }
        }
    }

    // Each reduce call receives all values for one course, so counting the values gives the participant count
    public static class MyReducer extends Reducer<Text, DoubleWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            int counts = 0;
            double sum = 0;
            for (DoubleWritable value : values) {
                counts++;
                // Accumulate the per-student averages for this course to get the course average
                sum += value.get();
            }
            double avgCourse = sum / counts;
            context.write(key, new Text(counts + "\t" + avgCourse));
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(CountCourseNumScore.class);
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DoubleWritable.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        String inputPath = args[0];
        String outputPath = args[1];
        // String inputPath = "datas/stu_courses_info";
        // String outputPath = "output/countCourseNumScore/";

        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Delete the output directory if it already exists; otherwise the job would fail
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }

        boolean b = job.waitForCompletion(true);
        if (b) {
            System.out.println("success~~~");
        } else {
            System.out.println("fail~~~~");
        }
    }
}
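Before packaging, a quick local smoke test is worthwhile: swap in the two commented-out local paths above (or pass the paths as Program arguments in IDEA's run configuration). With no cluster configuration on the classpath, Hadoop falls back to the local job runner and the local filesystem, so the job runs entirely on your machine.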
4. Packaging the code
Open the Maven tool window on the right, double-click clean under Lifecycle to clear the target directory, then double-click package to build the jar. When it finishes, the jar can be found under target on the left, and the console also prints the jar's local path.
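If you prefer a terminal to the Maven tool window, the same two steps can be run from the project root (assuming mvn is on the PATH):
mvn clean package
The jar again ends up under target/.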
II. Submitting to the Hadoop Cluster
1. Upload the jar from Windows to the Linux VM
Use the transfer tool FileZilla Client to copy the files from Windows to the VM. If you are unsure how, see my other post (installing and configuring pseudo-distributed Hadoop on a CentOS VM); there is a walkthrough at the end of that article.
Here I upload the jar and the dataset data.txt to the home directory of the ordinary user qingqing on my VM.
2. Run the MapReduce jar on Hadoop
Command for running an MR jar:
hadoop jar <local path of the jar on Linux> <fully qualified main class (package.ClassName)> <input path on HDFS> <output path on HDFS>
A quick way to get package.ClassName:
double-click the class name in the code and press Ctrl+Alt+Shift+C (Copy Reference); the fully qualified name is then on the clipboard, ready to paste.
Start the Hadoop services first
start-all.sh
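You can verify the daemons are up with jps; before touching HDFS it should list processes such as NameNode, DataNode, SecondaryNameNode and, for YARN, ResourceManager and NodeManager.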
Upload the dataset to the HDFS root directory
hadoop fs -put /home/qingqing/data.txt /
Run the jar
hadoop jar /home/qingqing/MapReduceTest-1.0-SNAPSHOT.jar com.mandy.CountCourseNumScore /data.txt /output1
Run result
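After the job prints success~~~, the output can be inspected with:
hadoop fs -cat /output1/part-r-*
Each line has the form course<TAB>participant count<TAB>course average. For this dataset there should be four lines, one each for algorithm, computer, english and math; for example, the computer line reports 10 participants.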