Hadoop High Availability
1.1 Installation
Unpack the archive:
tar -zvxf hadoop-2.7.3.tar.gz
Rename the directory:
mv hadoop-2.7.3 hadoop
1.2 Configuration files
cd /usr/local/software/hadoop/etc/hadoop
Edit the following configuration files.
core-site.xml
<property>
<name>hadoop.tmp.dir</name>
<value>file:/usr/local/software/hadoop/hdfs/tmp</value>
<description>A base for other temporary directories.</description>
</property>
<property>
<name>io.file.buffer.size</name>
<value>131072</value>
</property>
<!-- HDFS connection address; cluster (HA) mode -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://cluster</value>
</property>
<!-- ZooKeeper ensemble addresses and ports. The node count must be odd and at least three -->
<property>
<name>ha.zookeeper.quorum</name>
<value>10.202.80.109:2181,10.202.80.110:2181,10.202.80.196:2181</value>
</property>
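Once the cluster is running (section 1.5), clients reach HDFS through the logical name hdfs://cluster instead of a single NameNode address. A minimal sanity check, assuming the environment variables from 1.3 are in place:
# the client should resolve the HA nameservice from core-site.xml
hdfs getconf -confKey fs.defaultFS        # expected: hdfs://cluster
# read and write through the logical URI; the client locates the active NameNode itself
hdfs dfs -mkdir -p /tmp/ha-smoke-test
hdfs dfs -ls hdfs://cluster/tmp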
hdfs-site.xml
<configuration>
<!-- Number of HDFS replicas; must not exceed the number of DataNodes -->
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<!-- Logical service name for the NameNode cluster -->
<property>
<name>dfs.nameservices</name>
<value>cluster</value>
</property>
<!-- NameNodes contained in this nameservice, with an ID for each -->
<property>
<name>dfs.ha.namenodes.cluster</name>
<value>nn01,nn02</value>
</property>
<!-- RPC address and port of NameNode nn01; RPC is used to talk to the DataNodes -->
<property>
<name>dfs.namenode.rpc-address.cluster.nn01</name>
<value>10.202.80.109:9000</value>
</property>
<!-- HTTP address and port of NameNode nn01, used by web clients -->
<property>
<name>dfs.namenode.http-address.cluster.nn01</name>
<value>10.202.80.109:50070</value>
</property>
<!-- RPC address and port of NameNode nn02; RPC is used to talk to the DataNodes -->
<property>
<name>dfs.namenode.rpc-address.cluster.nn02</name>
<value>10.202.80.110:9000</value>
</property>
<!-- HTTP address and port of NameNode nn02, used by web clients -->
<property>
<name>dfs.namenode.http-address.cluster.nn02</name>
<value>10.202.80.110:50070</value>
</property>
<!-- JournalNodes used by the NameNodes to share edit logs -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://10.202.80.109:8485;10.202.80.110:8485;10.202.80.196:8485/cluster</value>
</property>
<!-- Directory on the JournalNodes for storing edits -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/usr/local/software/hadoop/hdfs/journaldata</value>
</property>
<!-- Whether to fail over automatically to the other NameNode when a failure occurs -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- Disable permission checking -->
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<!-- Failover proxy provider used by clients to find the active NameNode -->
<property>
<name>dfs.client.failover.proxy.provider.cluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- Fencing methods used when a NameNode failover is required: try sshfence over SSH first, then fall back to shell(/bin/true) -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence(root:9431)
shell(/bin/true)
</value>
</property>
<!-- Private key used for SSH when fencing via sshfence -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
<!-- SSH connect timeout in milliseconds -->
<property>
<name>dfs.ha.fencing.ssh.connect-timeout</name>
<value>30000</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>/usr/local/software/hadoop/hdfs/name</value>
</property>
<property>
<name>dfs.data.dir</name>
<value>/usr/local/software/hadoop/hdfs/data</value>
</property>
</configuration>
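The properties above reference several local directories. Hadoop creates most of them on first use, but creating them up front on every node avoids permission surprises; a sketch using the paths from core-site.xml and hdfs-site.xml:
mkdir -p /usr/local/software/hadoop/hdfs/tmp
mkdir -p /usr/local/software/hadoop/hdfs/name
mkdir -p /usr/local/software/hadoop/hdfs/data
mkdir -p /usr/local/software/hadoop/hdfs/journaldata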
yarn-site.xml
<configuration>
<!-- Enable ResourceManager HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- Cluster ID for the ResourceManagers -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>yrc</value>
</property>
<!-- Two ResourceManagers are used; list their IDs -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<!-- Address of rm1 -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>10.202.80.109</value>
</property>
<!-- Address of rm2 -->
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>10.202.80.110</value>
</property>
<!-- ZooKeeper ensemble machines -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>10.202.80.109:2181,10.202.80.110:2181,10.202.80.196:2181</value>
</property>
<!-- Auxiliary service running on the NodeManager; the default is mapreduce_shuffle -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
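With ResourceManager HA enabled, only one of rm1/rm2 is active at a time. After the cluster is started (section 1.5), the state of each can be checked with:
yarn rmadmin -getServiceState rm1
yarn rmadmin -getServiceState rm2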
mapred-site.xml
<configuration>
<!-- Use YARN as the resource scheduling framework for MapReduce -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
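To confirm that MapReduce jobs really run on YARN, the examples jar shipped with the distribution can be used as a smoke test once everything is up; the jar path below assumes the 2.7.3 layout under /usr/local/software/hadoop:
hadoop jar /usr/local/software/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.3.jar pi 2 10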
slaves
10.202.80.109
10.202.80.110
10.202.80.196
hadoop-env.sh
# The java implementation to use.
export JAVA_HOME=/usr/local/software/jdk
# Add this if the SSH port is not the default 22
export HADOOP_SSH_OPTS="-p 9431"
# Directory for Hadoop PID files; it is created automatically
export HADOOP_PID_DIR=/usr/local/software/hadoop/pids
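Because SSH runs on a non-default port here, it is worth confirming that passwordless login works on port 9431 between the nodes before starting anything, otherwise the start/stop scripts will hang; for example:
ssh -p 9431 root@10.202.80.110 hostname
ssh -p 9431 root@10.202.80.196 hostname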
1.3 Environment variables
/etc/profile
export HADOOP_HOME=/usr/local/software/hadoop
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=/usr/local/software/hadoop/lib"
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
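Reload the profile and confirm the commands resolve:
source /etc/profile
hadoop version
which start-all.sh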
1.4 Distribute to the other nodes
# run from the software directory
scp -r hadoop root@10.202.80.110:$PWD
scp -r hadoop root@10.202.80.196:$PWD
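If more nodes are added later, the copy generalizes to a loop; a sketch, run from /usr/local/software (add scp's -P option if sshd only listens on the non-default port 9431):
for node in 10.202.80.110 10.202.80.196; do
    scp -r hadoop root@$node:/usr/local/software/
done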
1.5 Start and verify
Before starting Hadoop for the first time, a few formatting steps are needed:
(1) Start the JournalNode on all three machines (only needed for the very first start; afterwards the JournalNodes no longer have to be started by hand)
cd /usr/local/software/hadoop/sbin
sh hadoop-daemon.sh start journalnode
(2) On hadoop-01, format the NameNode and the ZKFC
hdfs namenode -format
hdfs zkfc -formatZK
(3) Synchronize the NameNode metadata to the standby: on hadoop-02 run
bin/hdfs namenode -bootstrapStandby
With these steps done, Hadoop can be started.
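For reference, the whole first-start sequence can also be driven from 10.202.80.109 in one go; a sketch assuming passwordless SSH on port 9431 and the paths used above (note that -bootstrapStandby needs the first NameNode to be running so it can copy the metadata):
# (1) start a JournalNode on every machine (first start only)
for node in 10.202.80.109 10.202.80.110 10.202.80.196; do
    ssh -p 9431 root@$node "/usr/local/software/hadoop/sbin/hadoop-daemon.sh start journalnode"
done
# (2) format the NameNode and the ZKFC state on hadoop-01, then start this NameNode
hdfs namenode -format
hdfs zkfc -formatZK
hadoop-daemon.sh start namenode
# (3) pull the metadata over to hadoop-02
ssh -p 9431 root@10.202.80.110 "/usr/local/software/hadoop/bin/hdfs namenode -bootstrapStandby"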
Original article: https://blog.csdn.net/qq_22917163/article/details/99726899
1.5.1 Start
On every node, change to the /usr/local/software/hadoop directory.
On 10.202.80.109:
start-all.sh
sbin/hadoop-daemon.sh start zkfc
On 10.202.80.110:
sbin/yarn-daemon.sh start resourcemanager
sbin/hadoop-daemon.sh start zkfc
1.5.2 Check the processes
(screenshots of the process lists on each node omitted)
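Instead of screenshots, jps on each node shows the running daemons. Roughly, the two NameNode hosts should show NameNode, DataNode, JournalNode and DFSZKFailoverController (plus ResourceManager/NodeManager and ZooKeeper's QuorumPeerMain), and the third host DataNode, JournalNode, NodeManager and QuorumPeerMain. A quick loop (assumes jps is on the remote PATH):
for node in 10.202.80.109 10.202.80.110 10.202.80.196; do
    echo "== $node =="
    ssh -p 9431 root@$node jps
done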
Check via the web UI:
- 10.202.80.109 is the active node, i.e., the primary NameNode (screenshot omitted)
- 10.202.80.110 is the standby node, i.e., the backup NameNode (screenshot omitted)
1.5.3 Stop commands
On 10.202.80.109:
stop-all.sh
sbin/hadoop-daemon.sh stop zkfc
On 10.202.80.110:
sbin/yarn-daemon.sh stop resourcemanager
sbin/hadoop-daemon.sh stop zkfc
1.6 Testing high availability
(1) Kill the NameNode process on 10.202.80.109, then open the web UI of 10.202.80.110: its state has changed to active, so the HA failover works.
(2) Restart the NameNode process on 10.202.80.109 (sh start-dfs.sh) and open the web UI of 10.202.80.109 again: it now comes back as standby.
A command-line version of the same test is sketched below.
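A sketch of the test from the command line, using the NameNode IDs nn01/nn02 defined in hdfs-site.xml (run the kill on 10.202.80.109):
# kill the active NameNode process on 10.202.80.109
kill -9 $(jps | awk '$2 == "NameNode" {print $1}')
# the standby should take over within a few seconds
hdfs haadmin -getServiceState nn01
hdfs haadmin -getServiceState nn02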
Problems encountered
After killing the primary NameNode, the Hadoop HA standby did not switch to active automatically.
Several things were tried:
- Suspected a passwordless-SSH problem and pointed the fencing key location at
<value>/root/.ssh/authorized_keys</value>
and made a further change (screenshot omitted).
- Because dfs.ha.fencing.methods uses sshfence, the fuser command is required; install it on both NameNode nodes:
yum -y install psmisc
- None of the above helped.
I then suspected ZooKeeper, since ZooKeeper was already running locally and is what the HA configuration points at.
On the standby node, check the log under hadoop/logs/:
cat hadoop-root-zkfc-node2.log
Go into the ZooKeeper bin directory and run:
zkCli.sh
[zk: localhost:2181(CONNECTED) 0] get /hadoop-ha/cluster/ActiveBreadCrumb
clusternn01node1 ...
Cause: the information registered in ZooKeeper is a hostname, so that hostname just needs to be resolvable via /etc/hosts.
Solution:
On the standby server, add the hostname mappings:
vi /etc/hosts
10.202.80.109 node1
10.202.80.110 node2
10.202.80.196 node3
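A quick check that the new mappings resolve:
getent hosts node1 node2 node3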
Then tested again: the failover now succeeds.
Hadoop 3.1.3 configuration
Environment variables
export HADOOP_HOME=/usr/local/software/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HADOOP_HOME/lib
Configuration files
core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/local/software/hadoop/hdfs/tmp</value>
</property>
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>10.202.80.109:2181,10.202.80.110:2181,10.202.80.196:2181</value>
</property>
</configuration>
hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>10.202.80.110:9869</value>
</property>
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>10.202.80.109:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>10.202.80.110:8020</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>10.202.80.109:9870</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>10.202.80.110:9870</value>
</property>
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://10.202.80.109:8485;10.202.80.110:8485/mycluster</value>
</property>
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/usr/local/software/hadoop/hdfs/journalnode</value>
</property>
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
</property>
</configuration>
yarn-site.xml
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>10.202.80.109</value>
</property>
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>10.202.80.109:8088</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
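After the 3.1.3 cluster is started, a quick YARN check is to list the registered NodeManagers and open the ResourceManager web UI configured above:
yarn node -list -all
# web UI: http://10.202.80.109:8088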
hadoop-env.sh
export JAVA_HOME=/usr/local/software/jdk
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export HDFS_ZKFC_USER=root
export HDFS_JOURNALNODE_USER=root
yarn-env.sh
export YARN_RESOURCEMANAGER_USER=root
export HADOOP_SECURE_DN_USER=yarn
export YARN_NODEMANAGER_USER=root
workers
10.202.80.109
10.202.80.110
10.202.80.196
Preparation before starting
- Start ZooKeeper
- Start the JournalNodes
Manually start the JournalNode daemon on every JournalNode host (Node1, Node2):
# Hadoop 2.x
hadoop-daemon.sh start journalnode
# Hadoop 3.x
hdfs --daemon start journalnode
- Format on one of the NameNodes:
hdfs zkfc -formatZK            # format the ZKFC state in ZooKeeper
hdfs namenode -format          # format the NameNode
hdfs --daemon start namenode   # start this NameNode
- Synchronize the formatted metadata to the standby NameNode:
hdfs namenode -bootstrapStandby
- Start everything:
start-all.sh
Check the NameNode states (only nn1 and nn2 are defined in this configuration):
hdfs haadmin -getServiceState nn1
hdfs haadmin -getServiceState nn2