Building a Hadoop high-availability cluster, including the problems encountered, in detail

Hadoop High Availability

1.1 Installation

Unpack the archive

tar -zvxf hadoop-2.7.3.tar.gz

Rename the directory

mv hadoop-2.7.3 hadoop

1.2 Configuration files

cd /usr/local/software/hadoop/etc/hadoop

Modify the following configuration files.

core-site.xml

<configuration>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>file:/usr/local/software/hadoop/hdfs/tmp</value>
    <description>A base for other temporary directories.</description>
  </property>
  <property>
    <name>io.file.buffer.size</name>
    <value>131072</value>
  </property>
  <!-- HDFS connection address: the logical nameservice, cluster mode (high availability) -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://cluster</value>
  </property>
  <!-- Addresses and ports of the ZooKeeper ensemble. Note: the number of nodes must be odd and no fewer than three -->
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>10.202.80.109:2181,10.202.80.110:2181,10.202.80.196:2181</value>
  </property>
</configuration>

hdfs-site.xml

<configuration>
	<!-- Number of HDFS replicas; must not exceed the number of machine nodes -->
	<property>
		<name>dfs.replication</name>
		<value>3</value>
	</property>
	<!-- Logical service name for the NameNode cluster -->
	<property>
		<name>dfs.nameservices</name>
		<value>cluster</value>
	</property>
	<!-- NameNodes contained in this nameservice; give each one a name -->
	<property>
		<name>dfs.ha.namenodes.cluster</name>
		<value>nn01,nn02</value>
	</property>
	<!-- RPC address and port of the NameNode named nn01; RPC is used to talk to the DataNodes -->
	<property>
		<name>dfs.namenode.rpc-address.cluster.nn01</name>
		<value>10.202.80.109:9000</value>
	</property>
	<!-- HTTP address and port of the NameNode named nn01, used by web clients -->
	<property>
		<name>dfs.namenode.http-address.cluster.nn01</name>
		<value>10.202.80.109:50070</value>
	</property>
	<!-- RPC address and port of the NameNode named nn02; RPC is used to talk to the DataNodes -->
	<property>
		<name>dfs.namenode.rpc-address.cluster.nn02</name>
		<value>10.202.80.110:9000</value>
	</property>
	<!-- HTTP address and port of the NameNode named nn02, used by web clients -->
	<property>
		<name>dfs.namenode.http-address.cluster.nn02</name>
		<value>10.202.80.110:50070</value>
	</property>
	<!-- List of JournalNodes the NameNodes use to share edit logs -->
	<property>
		<name>dfs.namenode.shared.edits.dir</name>
		<value>qjournal://10.202.80.109:8485;10.202.80.110:8485;10.202.80.196:8485/cluster</value>
	</property>
	<!-- Directory on the JournalNodes where edits are stored -->
	<property>
		<name>dfs.journalnode.edits.dir</name>
		<value>/usr/local/software/hadoop/hdfs/journaldata</value>
	</property>
	<!-- Automatically fail over to the other NameNode when the active one fails -->
	<property>
		<name>dfs.ha.automatic-failover.enabled</name>
		<value>true</value>
	</property>
	<!-- Disable permission checking -->
	<property>
		<name>dfs.permissions.enabled</name>
		<value>false</value>
	</property>
	<!-- Failover proxy provider used by clients to locate the active NameNode -->
	<property>
		<name>dfs.client.failover.proxy.provider.cluster</name>
		<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
	</property>
	<!-- When a NameNode failover is needed, fence the old active over SSH (port 9431); shell(/bin/true) is a fallback so failover is not blocked if SSH fencing fails -->
	<property>
		<name>dfs.ha.fencing.methods</name>
		<value>sshfence(root:9431)
			shell(/bin/true)
		</value>
	</property>
	<!-- Location of the private key used for SSH during fencing -->
	<property>
		<name>dfs.ha.fencing.ssh.private-key-files</name>
		<value>/root/.ssh/id_rsa</value>
	</property>
	<!-- SSH connect timeout for fencing, in milliseconds -->
	<property>
		<name>dfs.ha.fencing.ssh.connect-timeout</name>
		<value>30000</value>
	</property>
	<property>
		<name>dfs.name.dir</name>
		<value>/usr/local/software/hadoop/hdfs/name</value>
	</property>
	<property>
		<name>dfs.data.dir</name>
		<value>/usr/local/software/hadoop/hdfs/data</value>
	</property>
</configuration>
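Because sshfence is the first fencing method, failover requires that each NameNode can SSH to the other with the key configured above and that the fuser command exists there (it ships with the psmisc package; see the troubleshooting section below). A quick pre-check from nn01, assuming the non-default SSH port 9431 used in this setup:

ssh -p 9431 -i /root/.ssh/id_rsa root@10.202.80.110 "which fuser"   # should print a path such as /usr/sbin/fuser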

yarn-site.xml

<configuration>
	<!-- Enable ResourceManager HA -->
	<property>
		<name>yarn.resourcemanager.ha.enabled</name>
		<value>true</value>
	</property>
	<!-- Cluster id for the ResourceManagers -->
	<property>
		<name>yarn.resourcemanager.cluster-id</name>
		<value>yrc</value>
	</property>
	<!-- Two ResourceManagers are used; name them here -->
	<property>
		<name>yarn.resourcemanager.ha.rm-ids</name>
		<value>rm1,rm2</value>
	</property>
	<!-- Address of rm1 -->
	<property>
		<name>yarn.resourcemanager.hostname.rm1</name>
		<value>10.202.80.109</value>
	</property>
	<!-- Address of rm2 -->
	<property>
		<name>yarn.resourcemanager.hostname.rm2</name>
		<value>10.202.80.110</value>
	</property>
	<!-- ZooKeeper ensemble used by the ResourceManagers -->
	<property>
		<name>yarn.resourcemanager.zk-address</name>
		<value>10.202.80.109:2181,10.202.80.110:2181,10.202.80.196:2181</value>
	</property>
	<!-- Auxiliary service to run on the NodeManagers; the default is mapreduce_shuffle -->
	<property>
		<name>yarn.nodemanager.aux-services</name>
		<value>mapreduce_shuffle</value>
	</property>
</configuration>

mapred-site.xml

<configuration>
	<!-- Use YARN as the resource scheduling framework for MapReduce -->
	<property>
		<name>mapreduce.framework.name</name>
		<value>yarn</value>
	</property>
</configuration>

slaves

10.202.80.109
10.202.80.110
10.202.80.196

hadoop-env.sh

# The java implementation to use.
export JAVA_HOME=/usr/local/software/jdk

# If the SSH port is not the default 22, add this setting
export HADOOP_SSH_OPTS="-p 9431"

# Directory where the Hadoop PID files are stored; it is created automatically
export HADOOP_PID_DIR=/usr/local/software/hadoop/pids

1.3 Environment variables

/etc/profile

export HADOOP_HOME=/usr/local/software/hadoop
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=/usr/local/software/hadoop/lib"
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
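After editing /etc/profile, reload it and confirm the Hadoop commands are on the PATH:

source /etc/profile
hadoop version            # should report Hadoop 2.7.3
which hdfs start-dfs.sh   # both should resolve under /usr/local/software/hadoop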

1.4 Distribute to the other nodes

# run these from the /usr/local/software directory
scp -r hadoop root@10.202.80.110:$PWD
scp -r hadoop root@10.202.80.196:$PWD

1.5 Start and verify

Before starting Hadoop for the first time, a few formatting steps are needed:
(1) Start the JournalNode on all three machines (this is only required the first time Hadoop is started; afterwards the JournalNodes no longer need to be started manually).

cd /usr/local/software/hadoop/sbin
sh hadoop-daemon.sh start journalnode
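Before moving on, it is worth confirming on each of the three machines that the JournalNode is up and listening on its default port 8485 (use netstat if ss is not available):

jps | grep JournalNode
ss -lntp | grep 8485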

(2) On hadoop-01 (the first NameNode, 10.202.80.109), run the format operations: format the NameNode and the ZKFC.

hdfs namenode -format
hdfs zkfc -formatZK
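If hdfs zkfc -formatZK succeeds it creates a /hadoop-ha/cluster znode in ZooKeeper. You can confirm this from the ZooKeeper bin directory (the same zkCli.sh used later in the troubleshooting section); a quick check, assuming the quorum configured above:

./zkCli.sh -server 10.202.80.109:2181 ls /hadoop-ha   # should list [cluster]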

(3) Synchronize the NameNode metadata from the primary: run the sync command on hadoop-02 (10.202.80.110).

bin/hdfs namenode -bootstrapStandby
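bootstrapStandby copies the freshly formatted namespace from hadoop-01 into the local name directory on hadoop-02. A quick check that it landed in the dfs.name.dir configured above:

ls /usr/local/software/hadoop/hdfs/name/current   # expect a VERSION file and fsimage_* files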

Once these steps are complete, Hadoop can be started.

Original article: https://blog.csdn.net/qq_22917163/article/details/99726899

1.5.1 Start

On each server, change into the /usr/local/software/hadoop directory.

Server 10.202.80.109:

start-all.sh
sbin/hadoop-daemon.sh start zkfc

Server 10.202.80.110:

sbin/yarn-daemon.sh start resourcemanager
sbin/hadoop-daemon.sh start zkfc

1.5.2 Check the processes

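Roughly what the original screenshots showed (the exact set of daemons differs per node): jps should list the HDFS, YARN, ZKFC and ZooKeeper processes.

jps
# On 10.202.80.109, for example, expect something like:
#   NameNode, DataNode, JournalNode, DFSZKFailoverController,
#   ResourceManager, NodeManager, QuorumPeerMain (ZooKeeper)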

Check via the web UI

  • 10.202.80.109:50070

    10.202.80.109 is the active node, i.e. the primary NameNode.

  • 10.202.80.110:50070

    10.202.80.110 is the standby node, i.e. the backup NameNode.

  • 10.202.80.109:8088

    The YARN ResourceManager web UI.
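The same active/standby information is available from the command line (nn01 and nn02 are the NameNode names from hdfs-site.xml, rm1 a ResourceManager id from yarn-site.xml):

hdfs haadmin -getServiceState nn01   # expect: active
hdfs haadmin -getServiceState nn02   # expect: standby
yarn rmadmin -getServiceState rm1    # expect: active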

1.5.3 Stop commands

10.202.80.109

stop-all.sh
sbin/hadoop-daemon.sh stop zkfc

10.202.80.110

sbin/yarn-daemon.sh stop resourcemanager
sbin/hadoop-daemon.sh stop zkfc

1.6 Testing high availability

(1) Kill the NameNode process on 10.202.80.109, then check the status of 10.202.80.110 in the browser: it has changed to active, which shows that the HA failover works.
(2) Restart the NameNode process on 10.202.80.109 (sh start-dfs.sh) and open 10.202.80.109 in the browser; its status is now standby.
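The same test can be scripted; a minimal sketch, run on 10.202.80.109 while nn01 is the active node:

kill -9 $(jps | grep -w NameNode | awk '{print $1}')   # kill the local NameNode
sleep 10                                               # give the ZKFC time to fail over
hdfs haadmin -getServiceState nn02                     # expect: active
hadoop-daemon.sh start namenode                        # bring nn01 back (or use sh start-dfs.sh as above)
hdfs haadmin -getServiceState nn01                     # expect: standby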

Problems encountered

After killing the active NameNode, the Hadoop HA standby would not automatically switch to active.

I tried many things:

  • I thought it was a passwordless-SSH problem, so I pointed the fencing key setting (dfs.ha.fencing.ssh.private-key-files) at my own authorized_keys file:

        <value>/root/.ssh/authorized_keys</value>

    and made a few other related changes (screenshot not preserved).

  • Because the value of dfs.ha.fencing.methods is sshfence, the fuser command is required; install it with the following command on both NameNode nodes:

    yum -y install psmisc
    
  • None of the above helped.

I suspected it was a ZooKeeper problem, because I already had ZooKeeper running locally and it is configured again here.

Check the log under hadoop/logs/ on the standby node:

cat hadoop-root-zkfc-node2.log 


Go into the ZooKeeper bin directory and run the following:

zkCli.sh

[zk: localhost:2181(CONNECTED) 0] get /hadoop-ha/cluster/ActiveBreadCrumb 

clusternn01node1 �F(�>

Cause: the node information registered in ZooKeeper is a hostname, so that hostname just needs to be added to /etc/hosts.

Solution

Add the hostname mappings on the standby server:

vi /etc/hosts

10.202.80.109 node1
10.202.80.110 node2
10.202.80.196 node3

Then I tested again, and the failover succeeded.
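A quick way to confirm the mappings are in effect before retesting (getent reads /etc/hosts the same way the resolver does):

getent hosts node1 node2 node3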


Hadoop 3.1.3 configuration

Environment variables

export HADOOP_HOME=/usr/local/software/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HADOOP_HOME/lib

Configuration files

core-site.xml
<configuration>
<property>
        <name>fs.defaultFS</name>
        <value>hdfs://mycluster</value>
</property>
<property>
        <name>hadoop.tmp.dir</name>
        <value>/usr/local/software/hadoop/hdfs/tmp</value>
</property>
<property>
        <name>hadoop.http.staticuser.user</name>
        <value>root</value>
</property> 
<property>
   		<name>ha.zookeeper.quorum</name>
  	    <value>10.202.80.109:2181,10.202.80.110:2181,10.202.80.196:2181</value>
 </property>
</configuration>

hdfs-site.xml
<configuration>
<property>
  <name>dfs.replication</name>
  <value>2</value>
</property>
<property>
  <name>dfs.namenode.secondary.http-address</name>
  <value>10.202.80.110:9869</value>
</property>
<property>
   <name>dfs.nameservices</name>
   <value>mycluster</value>
</property>
<property>
  <name>dfs.ha.namenodes.mycluster</name>
  <value>nn1,nn2</value>
</property>
<property>
  <name>dfs.namenode.rpc-address.mycluster.nn1</name>
  <value>10.202.80.109:8020</value>
</property>
<property>
  <name>dfs.namenode.rpc-address.mycluster.nn2</name>
  <value>10.202.80.110:8020</value>
</property>
<property>
  <name>dfs.namenode.http-address.mycluster.nn1</name>
  <value>10.202.80.109:9870</value>
</property>
<property>
  <name>dfs.namenode.http-address.mycluster.nn2</name>
  <value>10.202.80.110:9870</value>
</property>
<property>
  <name>dfs.namenode.shared.edits.dir</name>
  <value>qjournal://10.202.80.109:8485;10.202.80.110:8485/mycluster</value>
</property>
<property>
  <name>dfs.client.failover.proxy.provider.mycluster</name>
 <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<property>
      <name>dfs.ha.fencing.methods</name>
      <value>sshfence</value>
</property>
<property>
      <name>dfs.ha.fencing.ssh.private-key-files</name>
      <value>/root/.ssh/id_rsa</value>
</property>
<property>
  <name>dfs.journalnode.edits.dir</name>
  <value>/usr/local/software/hadoop/hdfs/journalnode</value>
</property>
<property>
   <name>dfs.ha.automatic-failover.enabled</name>
   <value>true</value>
</property>
</configuration>

mapred-site.xml
<configuration>
   <property>
       <name>mapreduce.framework.name</name>
       <value>yarn</value>
   </property>
   <property>
     	<name>mapreduce.application.classpath</name>
     	<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
   </property>
</configuration>

yarn-site.xml
<configuration>
	<property>
        <name>yarn.resourcemanager.hostname</name>
        <value>10.202.80.109</value>
    </property>
    <property>
      	  <name>yarn.nodemanager.env-whitelist</name>
          <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
   <property>
            <name>yarn.resourcemanager.webapp.address</name>
            <value>10.202.80.109:8088</value>
    </property>
    <property>
            <name>yarn.nodemanager.aux-services</name>
            <value>mapreduce_shuffle</value>
    </property>
</configuration>	

hadoop-env.sh
export JAVA_HOME=/usr/local/software/jdk
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export HDFS_ZKFC_USER=root
export HDFS_JOURNALNODE_USER=root

yarn-env.sh
export YARN_RESOURCEMANAGER_USER=root
export HADOOP_SECURE_DN_USER=yarn
export YARN_NODEMANAGER_USER=root

workers
10.202.80.109
10.202.80.110
10.202.80.196

Preparation before the first start

  1. Start ZooKeeper.

  2. Start the JournalNodes.

     # manually start the JournalNode daemon on every JournalNode host (node1, node2)
     # Hadoop 2.x:
     hadoop-daemon.sh start journalnode
     # Hadoop 3.x:
     hdfs --daemon start journalnode

  3. Format on one of the NameNodes.

     hdfs zkfc -formatZK            # format the ZKFC state in ZooKeeper
     hdfs namenode -format          # format the NameNode
     hdfs --daemon start namenode   # start this NameNode

  4. Synchronize the other NameNode with the formatted one.

     hdfs namenode -bootstrapStandby

  5. Start everything.

     start-all.sh
    

Check:

hdfs haadmin -getServiceState nn1
hdfs haadmin -getServiceState nn2
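YARN can be checked in a similar way; this 3.1.3 configuration uses a single ResourceManager on 10.202.80.109, so a simple check is to list the NodeManagers:

yarn node -list   # should show the three workers; the web UI is at http://10.202.80.109:8088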


