Hadoop Cluster Setup

1. Set the hostnames (run the matching command on each of the three machines)

hostnamectl set-hostname hadoop102   # on hadoop102
hostnamectl set-hostname hadoop103   # on hadoop103
hostnamectl set-hostname hadoop104   # on hadoop104
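
An optional sanity check on each machine, the static hostname reported should match the one just set:

# verify the new hostname took effect
hostnamectl status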

2. Edit the hosts file (run on all three machines)

vim /etc/hosts
# append the following entries
10.206.16.3 hadoop102
10.206.16.5 hadoop103
10.206.16.14 hadoop104
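
A quick check that the aliases resolve (hadoop103 here is just an example; any of the three works):

# should resolve to 10.206.16.5 and get a reply
ping -c 1 hadoop103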

3. Install required software

# epel-release adds the Extra Packages for Enterprise Linux repository
yum install -y epel-release
# vim is a text editor; needed on minimal installs
yum install -y vim
# net-tools provides networking utilities such as ifconfig; needed on minimal installs
yum install -y net-tools
# rsync is the file distribution tool (used by the xsync script below)
yum install -y rsync

4. Disable the firewall (not needed on Tencent Cloud; open the ports in the console instead)

# check the firewall status
systemctl status firewalld
# stop the firewall
systemctl stop firewalld
# keep the firewall from starting at boot
systemctl disable firewalld

5. Remove the bundled JDK (not needed on minimal installs or on Tencent Cloud)

rpm -qa | grep -i java | xargs -n1 rpm -e --nodeps
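
After removal, the same query should come back empty:

# no output means no leftover JDK packages
rpm -qa | grep -i java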

6. Install the JDK (run on hadoop102)

mkdir -p /opt/software
mkdir -p /opt/module
cd /opt/software
# download OpenJDK
wget https://download.java.net/openjdk/jdk8u41/ri/openjdk-8u41-b04-linux-x64-14_jan_2020.tar.gz
# extract
tar -zxvf openjdk-8u41-b04-linux-x64-14_jan_2020.tar.gz -C /opt/module/
# edit the environment variables
vim /etc/profile

# append at the end of the file
export JAVA_HOME=/opt/module/java-se-8u41-ri
export PATH=$PATH:$JAVA_HOME/bin

# reload
source /etc/profile
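
To confirm the JDK is on the PATH (it should report a 1.8.0_41 runtime):

# print the installed Java version
java -version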

7. Install Hadoop 3.3.1 (run on hadoop102)

cd /opt/software
# download Hadoop 3.3.1
wget --no-check-certificate https://downloads.apache.org/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz
# extract
tar -zxvf /opt/software/hadoop-3.3.1.tar.gz -C /opt/module/
# edit the environment variables
vim /etc/profile

# append at the end of the file
export HADOOP_HOME=/opt/module/hadoop-3.3.1
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin

# reload
source /etc/profile
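
As above, a quick check that the Hadoop binaries are on the PATH:

# should report Hadoop 3.3.1
hadoop version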

8. Configure passwordless SSH login

ssh-keygen -t rsa
# press Enter three times; id_rsa (private key) and id_rsa.pub (public key) are generated under the user's home directory
# copy the key to hadoop102: type yes, then enter the password
ssh-copy-id hadoop102
# copy the key to hadoop103: type yes, then enter the password
ssh-copy-id hadoop103
# copy the key to hadoop104: type yes, then enter the password
ssh-copy-id hadoop104
Repeat the steps above on hadoop103 and hadoop104 as well.
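
To verify, a remote command should now run without a password prompt:

# should print "hadoop103" without asking for a password
ssh hadoop103 hostname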

9. The xsync distribution script

vim xsync
#!/bin/bash
#1. check the argument count
if [ $# -lt 1 ]
then
    echo "Not Enough Arguments!"
    exit
fi
#2. loop over every machine in the cluster
for host in hadoop102 hadoop103 hadoop104
do
    echo ====================  $host  ====================
    #3. loop over all files and directories, sending them one by one
    for file in "$@"
    do
        #4. check that the file exists
        if [ -e "$file" ]
            then
                #5. get the parent directory (resolving symlinks)
                pdir=$(cd -P "$(dirname "$file")"; pwd)

                #6. get the file name
                fname=$(basename "$file")
                ssh $host "mkdir -p $pdir"
                rsync -av "$pdir/$fname" $host:"$pdir"
            else
                echo "$file does not exist!"
        fi
    done
done
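
Make the script executable before using it; as a harmless first test, the hosts file from section 2 can be distributed:

chmod +x xsync
# example: push /etc/hosts to all three machines
./xsync /etc/hosts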

10. Cluster layout rules

Do not place the NameNode and the SecondaryNameNode on the same server.
The ResourceManager is also resource-hungry; do not place it on the same server as the NameNode or the SecondaryNameNode.

        hadoop102       hadoop103        hadoop104
HDFS    NameNode                         SecondaryNameNode
        DataNode        DataNode         DataNode
YARN                    ResourceManager
        NodeManager     NodeManager      NodeManager

11. Core configuration file

cd /opt/module/hadoop-3.3.1/etc/hadoop
vim core-site.xml
<configuration>
	<!-- NameNode address -->
	<property>
		<name>fs.defaultFS</name>
		<value>hdfs://hadoop102:8020</value>
	</property>
	<!-- directory where Hadoop stores its data -->
	<property>
		<name>hadoop.tmp.dir</name>
		<value>/opt/module/hadoop-3.3.1/data</value>
	</property>
</configuration>

12. HDFS configuration file

vim hdfs-site.xml
<configuration>
	<!-- NameNode web UI address -->
	<property>
		<name>dfs.namenode.http-address</name>
		<value>hadoop102:9870</value>
	</property>
	<!-- SecondaryNameNode web UI address -->
	<property>
		<name>dfs.namenode.secondary.http-address</name>
		<value>hadoop104:9868</value>
	</property>
</configuration>

13. YARN configuration file

vim yarn-site.xml
<configuration>
	<!-- enable the MapReduce shuffle auxiliary service -->
	<property>
		<name>yarn.nodemanager.aux-services</name>
		<value>mapreduce_shuffle</value>
	</property>
	<!-- ResourceManager address -->
	<property>
		<name>yarn.resourcemanager.hostname</name>
		<value>hadoop103</value>
	</property>
	<!-- classpath for YARN applications -->
	<property>
		<name>yarn.application.classpath</name>
		<value>/opt/module/hadoop-3.3.1/etc/hadoop:/opt/module/hadoop-3.3.1/share/hadoop/common/lib/*:/opt/module/hadoop-3.3.1/share/hadoop/common/*:/opt/module/hadoop-3.3.1/share/hadoop/hdfs:/opt/module/hadoop-3.3.1/share/hadoop/hdfs/lib/*:/opt/module/hadoop-3.3.1/share/hadoop/hdfs/*:/opt/module/hadoop-3.3.1/share/hadoop/mapreduce/*:/opt/module/hadoop-3.3.1/share/hadoop/yarn:/opt/module/hadoop-3.3.1/share/hadoop/yarn/lib/*:/opt/module/hadoop-3.3.1/share/hadoop/yarn/*</value>
	</property>
	<!-- enable log aggregation -->
	<property>
		<name>yarn.log-aggregation-enable</name>
		<value>true</value>
	</property>
	<!-- log aggregation server URL -->
	<property>
		<name>yarn.log.server.url</name>
		<value>http://hadoop102:19888/jobhistory/logs</value>
	</property>
	<!-- keep logs for 7 days (in seconds) -->
	<property>
		<name>yarn.log-aggregation.retain-seconds</name>
		<value>604800</value>
	</property>
</configuration>
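
With log aggregation enabled, container logs for a finished application can also be pulled from the command line; the application ID below is only a placeholder for one reported by YARN:

# fetch the aggregated container logs for one application
yarn logs -applicationId application_1234567890123_0001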

14. MapReduce configuration file

vim mapred-site.xml
<configuration>
	<!-- run MapReduce jobs on YARN -->
	<property>
		<name>mapreduce.framework.name</name>
		<value>yarn</value>
	</property>
	<!-- history server address -->
	<property>
		<name>mapreduce.jobhistory.address</name>
		<value>hadoop102:10020</value>
	</property>
	<!-- history server web UI address -->
	<property>
		<name>mapreduce.jobhistory.webapp.address</name>
		<value>hadoop102:19888</value>
	</property>
</configuration>

15. Configure workers

vim workers
# delete the original localhost entry and replace it with the three hostnames;
# entries must not have trailing spaces, and the file must not contain blank lines
hadoop102
hadoop103
hadoop104

16. Add the following parameters at the top of start-dfs.sh and stop-dfs.sh (in the sbin directory of the Hadoop installation)

vim /opt/module/hadoop-3.3.1/sbin/start-dfs.sh

HDFS_DATANODE_USER=root
HDFS_DATANODE_SECURE_USER=hdfs 
HDFS_NAMENODE_USER=root 
HDFS_SECONDARYNAMENODE_USER=root
vim /opt/module/hadoop-3.3.1/sbin/stop-dfs.sh

HDFS_DATANODE_USER=root
HDFS_DATANODE_SECURE_USER=hdfs 
HDFS_NAMENODE_USER=root 
HDFS_SECONDARYNAMENODE_USER=root

17. Add the following parameters at the top of start-yarn.sh and stop-yarn.sh (in the sbin directory of the Hadoop installation)

vim /opt/module/hadoop-3.3.1/sbin/start-yarn.sh

YARN_RESOURCEMANAGER_USER=root
HADOOP_SECURE_DN_USER=yarn
YARN_NODEMANAGER_USER=root
vim /opt/module/hadoop-3.3.1/sbin/stop-yarn.sh

YARN_RESOURCEMANAGER_USER=root
HADOOP_SECURE_DN_USER=yarn
YARN_NODEMANAGER_USER=root

18. Set the JAVA_HOME path in hadoop-env.sh

vim /opt/module/hadoop-3.3.1/etc/hadoop/hadoop-env.sh

export JAVA_HOME=/opt/module/java-se-8u41-ri

19. Distribute the configuration to hadoop103 and hadoop104

./xsync /etc/profile
./xsync /opt/module/hadoop-3.3.1/
./xsync /opt/module/java-se-8u41-ri/

# run on hadoop103 and hadoop104
source /etc/profile
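
A quick check on hadoop103 and hadoop104 that the distribution worked:

# both should succeed now that /etc/profile has been reloaded
java -version
hadoop version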

20. Cluster startup script

vim startHadoop.sh
#!/bin/bash
echo " =================== starting the Hadoop cluster ==================="
echo " --------------- starting HDFS ---------------"
ssh hadoop102 "/opt/module/hadoop-3.3.1/sbin/start-dfs.sh"
echo " --------------- starting YARN ---------------"
ssh hadoop103 "/opt/module/hadoop-3.3.1/sbin/start-yarn.sh"
echo " --------------- starting the history server ---------------"
ssh hadoop102 "/opt/module/hadoop-3.3.1/bin/mapred --daemon start historyserver"
chmod +x startHadoop.sh

21. Cluster shutdown script

vim stopHadoop.sh
#!/bin/bash
echo " =================== stopping the Hadoop cluster ==================="
echo " --------------- stopping the history server ---------------"
ssh hadoop102 "/opt/module/hadoop-3.3.1/bin/mapred --daemon stop historyserver"
echo " --------------- stopping YARN ---------------"
ssh hadoop103 "/opt/module/hadoop-3.3.1/sbin/stop-yarn.sh"
echo " --------------- stopping HDFS ---------------"
ssh hadoop102 "/opt/module/hadoop-3.3.1/sbin/stop-dfs.sh"
chmod +x stopHadoop.sh

22. Start the cluster and test

# format the NameNode before the first startup only
hdfs namenode -format
# run the startup script
./startHadoop.sh
# once everything is up, the NameNode web UI is reachable at hadoop102:9870
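
To confirm the layout from section 10, running jps on each node should show roughly these daemons:

# hadoop102: NameNode, DataNode, NodeManager, JobHistoryServer, Jps
# hadoop103: ResourceManager, DataNode, NodeManager, Jps
# hadoop104: SecondaryNameNode, DataNode, NodeManager, Jps
jps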

23. Word count

hdfs dfs -mkdir /software
hdfs dfs -mkdir /wcinput

vim user.txt

zou zou zou
hhh hhh
abcd
yuan yuan yuan
123 123 123 123

# upload the file
hdfs dfs -put user.txt /wcinput

# run the job
hadoop jar /opt/module/hadoop-3.3.1/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.1.jar wordcount /wcinput /wcoutput
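
When the job finishes, the counts land in /wcoutput; for the user.txt above, the single-reducer output file should read:

hdfs dfs -cat /wcoutput/part-r-00000
# 123	4
# abcd	1
# hhh	2
# yuan	3
# zou	3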