15.1 Modify the hosts file (and sync it to all nodes)
[root@nn01 local]# vim /etc/hosts
192.168.0.10 nn01
192.168.0.11 node1
192.168.0.12 node2
192.168.0.21 node3
192.168.0.22 nn02
192.168.0.16 nfsgw
[root@nn01 local]# for i in 192.168.0.{11,12,21,22,26}
> do
> scp /etc/hosts root@$i:/etc/
> done
hosts 100% 272 583.5KB/s 00:00
hosts 100% 272 631.5KB/s 00:00
hosts 100% 272 359.7KB/s 00:00
hosts 100% 272 475.8KB/s 00:00
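To double-check that every node now resolves the cluster hostnames the same way, something like the following can be run from nn01 (a quick sketch; it assumes passwordless SSH to these hosts is already configured):
[root@nn01 local]# for i in node{1..3} nn02; do echo $i; ssh $i "getent hosts nn01 nn02"; done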
15.2 Stop every service on all nodes except ZooKeeper, including Hadoop and Kafka
[root@nn01 hadoop]# ./sbin/stop-all.sh
[root@nn01 hadoop]# for i in node{1..3} nn01 nn02; do ssh $i echo $i; ssh $i jps; done
node1
3793 Jps
1493 QuorumPeerMain
node2
1205 QuorumPeerMain
2229 Jps
node3
1489 QuorumPeerMain
2255 Jps
nn01
4284 Jps
nn02
1444 Jps
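The transcript above only shows stopping Hadoop with stop-all.sh; if Kafka brokers are still running on node1 to node3, they can be stopped in the same pass. A minimal sketch, assuming Kafka is installed under /usr/local/kafka on those nodes:
[root@nn01 hadoop]# for i in node{1..3}; do ssh $i /usr/local/kafka/bin/kafka-server-stop.sh; done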
15.3 Initialize the HDFS cluster: delete /var/hadoop/* on all nodes
[root@nn01 hadoop]# rm -rf /var/hadoop/*
[root@nn01 hadoop]# ssh node1 rm -rf /var/hadoop/*
[root@nn01 hadoop]# ssh node2 rm -rf /var/hadoop/*
[root@nn01 hadoop]# ssh node3 rm -rf /var/hadoop/*
[root@nn01 hadoop]# ssh nn02 rm -rf /var/hadoop/*
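The same cleanup can also be done in a single loop from nn01 (equivalent to the per-node commands above; the quotes make the glob expand on each remote host rather than locally):
[root@nn01 hadoop]# for i in node{1..3} nn02; do ssh $i 'rm -rf /var/hadoop/*'; done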
15.4 On nn02, disable SSH host key checking and deploy the private key
[root@nn02 ~]# vim /etc/ssh/ssh_config
Host *
GSSAPIAuthentication yes
StrictHostKeyChecking no
[root@nn01 hadoop]# scp /root/.ssh/id_rsa nn02:/root/.ssh/
id_rsa 100% 1675 1.6MB/s 00:00
[root@nn02 ~]# ssh-keygen
[root@nn02 ~]# ssh-copy-id root@nn01
[root@nn02 ~]# ssh-copy-id root@node1
[root@nn02 ~]# ssh-copy-id root@node2
[root@nn02 ~]# ssh-copy-id root@node3
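Passwordless login from nn02 to the other nodes can be verified before moving on (a quick check):
[root@nn02 ~]# for i in nn01 node{1..3}; do ssh $i hostname; done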
15.5.1 Modify core-site.xml as follows
[root@nn01 ~]# vim /usr/local/hadoop/etc/hadoop/core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/var/hadoop</value>
</property>
<property>
<name>hadoop.proxyuser.test.groups</name>
<value>*</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>node1:2181,node2:2181,node3:2181</value>
</property>
</configuration>
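After saving the file, the effective value can be checked with hdfs getconf; it should print hdfs://mycluster (a quick sketch, run against the local configuration):
[root@nn01 ~]# /usr/local/hadoop/bin/hdfs getconf -confKey fs.defaultFS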
15.5.2 Modify hdfs-site.xml
[root@nn01 ~]# vim /usr/local/hadoop/etc/hadoop/hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>nn01:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>nn02:8020</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>nn01:50070</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>nn02:50070</value>
</property>
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://node1:8485;node2:8485;node3:8485/mycluster</value>
</property>
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/var/hadoop/journal</value>
</property>
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
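Before syncing the file, the HA naming can be sanity-checked from the command line; the two commands below should report mycluster and the two NameNode hosts nn01 and nn02 respectively (a quick sketch):
[root@nn01 ~]# /usr/local/hadoop/bin/hdfs getconf -confKey dfs.nameservices
[root@nn01 ~]# /usr/local/hadoop/bin/hdfs getconf -namenodes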
15.5.3 Explanation of the hdfs-site.xml settings
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
–The SecondaryNameNode has no role in an HA setup, so it is not used here
–The NameNodes themselves are defined below
–Set the HDFS nameservice name to mycluster
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
–Name the cluster's two NameNodes nn1 and nn2 (these are logical identifiers)
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
–Configure the RPC addresses of nn1 and nn2
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>nn01:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>nn02:8020</value>
</property>
–Configure the HTTP addresses of nn1 and nn2
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>nn01:50070</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>nn02:50070</value>
</property>
–Specify the JournalNode quorum URI where the NameNodes keep their shared edit log
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://node1:8485;node2:8485;node3:8485/mycluster</value>
</property>
–Specify the local directory where the JournalNodes store their edit files
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/var/hadoop/journal</value>
</property>
–Specify the Java class HDFS clients use to locate the active NameNode
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
–Set the fencing method to sshfence
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
–Specify the location of the SSH private key
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
–Enable automatic failover
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
15.6 YARN high availability
–ResourceManager high availability
–RM high availability works the same way as NameNode HA and also relies on ZooKeeper, so the principle is not repeated here; only the key parts of the configuration file are given, and interested readers can study and test it on their own.
–yarn.resourcemanager.hostname
–Likewise, because cluster (HA) mode is used, this plain option should not be set; the per-RM variants yarn.resourcemanager.hostname.rm1/rm2 below are used instead.
15.7 yarn-site.xml configuration
[root@nn01 ~]# vim /usr/local/hadoop/etc/hadoop/yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>node1:2181,node2:2181,node3:2181</value>
</property>
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>yarn-ha</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>nn01</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>nn02</value>
</property>
</configuration>
15.8 Sync the configuration to all hosts
[root@nn01 ~]# for i in node{1..3} nn02; do scp /usr/local/hadoop/etc/hadoop/* $i:/usr/local/hadoop/etc/hadoop/; done
To make troubleshooting easier, clear the old logs
[root@nn01 ~]# ssh node1 rm -rf /usr/local/hadoop/logs
[root@nn01 ~]# ssh node2 rm -rf /usr/local/hadoop/logs
[root@nn01 ~]# ssh node3 rm -rf /usr/local/hadoop/logs
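To confirm that every node actually received identical configuration files, the checksums can be compared against the local copy (a quick sketch):
[root@nn01 ~]# md5sum /usr/local/hadoop/etc/hadoop/hdfs-site.xml
[root@nn01 ~]# for i in node{1..3} nn02; do echo $i; ssh $i md5sum /usr/local/hadoop/etc/hadoop/hdfs-site.xml; done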
15.9 Cluster initialization
15.9.1 Initialize the HA failover state in ZooKeeper on nn01
[root@nn01 hadoop]# ./bin/hdfs zkfc -formatZK
.......
21/09/10 17:34:56 INFO zookeeper.ClientCnxn: Opening socket connection to server node1/192.168.0.11:2181. Will not attempt to authenticate using SASL (unknown error)
......
21/09/10 17:34:56 INFO ha.ActiveStandbyElector: Successfully created /hadoop-ha/mycluster in ZK.
.....
Note: the "Successfully created" line means the initialization succeeded. Also check the "error" mentioned above: if the log level in front of it is INFO, it can be ignored; any other level indicates a problem that needs troubleshooting.
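If desired, the znode created by zkfc -formatZK can be inspected directly with the ZooKeeper client; listing /hadoop-ha should show mycluster. A sketch, assuming ZooKeeper is installed under /usr/local/zookeeper on node1:
[root@node1 ~]# /usr/local/zookeeper/bin/zkCli.sh -server node1:2181 ls /hadoop-ha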
15.9.2 Start the JournalNode service (on node1, node2, and node3)
[root@node1 ~]# cd /usr/local/hadoop/
[root@node1 hadoop]# ./sbin/hadoop-daemon.sh start journalnode
starting journalnode, logging to /usr/local/hadoop/logs/hadoop-root-journalnode-node1.out
[root@node1 hadoop]# jps
4050 Jps
1493 QuorumPeerMain
3999 JournalNode
----------------------------------------------------------
[root@node2 ~]# cd /usr/local/hadoop/
[root@node2 hadoop]# ./sbin/hadoop-daemon.sh start journalnode
starting journalnode, logging to /usr/local/hadoop/logs/hadoop-root-journalnode-node2.out
[root@node2 hadoop]# jps
2385 JournalNode
2436 Jps
1205 QuorumPeerMain
----------------------------------------------------------
[root@node3 kafka]# cd /usr/local/hadoop/
[root@node3 hadoop]# ./sbin/hadoop-daemon.sh start journalnode
starting journalnode, logging to /usr/local/hadoop/logs/hadoop-root-journalnode-node3.out
[root@node3 hadoop]# jps
1489 QuorumPeerMain
2389 JournalNode
2441 Jps
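Instead of logging in to each node, the three JournalNodes can also be started from nn01 in a single loop (equivalent to the per-node commands above):
[root@nn01 hadoop]# for i in node{1..3}; do ssh $i /usr/local/hadoop/sbin/hadoop-daemon.sh start journalnode; done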
15.9.3 Format the NameNode (run on nn01)
[root@nn01 hadoop]# ./bin/hdfs namenode -format
...............
21/09/10 17:48:59 INFO common.Storage: Storage directory /var/hadoop/dfs/name has been successfully formatted.
............................................................
Note: the message above indicates the format succeeded.
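The freshly formatted metadata directory can also be inspected; it should contain an fsimage_* file, seen_txid, and VERSION (a quick check):
[root@nn01 hadoop]# ls /var/hadoop/dfs/name/current/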
15.9.4 On nn02, sync the metadata from nn01 to the local /var/hadoop/dfs
[root@nn02 current]# rsync -aSH nn01:/var/hadoop/dfs /var/hadoop/
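The copy can be verified by checking that the clusterID recorded in the VERSION file matches on both NameNodes (a quick sketch, run from nn02):
[root@nn02 ~]# grep clusterID /var/hadoop/dfs/name/current/VERSION
[root@nn02 ~]# ssh nn01 grep clusterID /var/hadoop/dfs/name/current/VERSION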
15.9.5 Initialize the shared edits (JournalNodes) on nn01
[root@nn01 hadoop]# ./bin/hdfs namenode -initializeSharedEdits
...............
21/09/10 18:02:33 INFO client.QuorumJournalManager: Successfully started new epoch 1
......................
15.9.6 Stop the JournalNode service on node1 through node3
[root@node1 hadoop]# ./sbin/hadoop-daemon.sh stop journalnode
stopping journalnode
[root@node1 hadoop]# jps
1493 QuorumPeerMain
4122 Jps
[root@node2 hadoop]# ./sbin/hadoop-daemon.sh stop journalnode
stopping journalnode
[root@node2 hadoop]# jps
2483 Jps
1205 QuorumPeerMain
[root@node3 hadoop]# ./sbin/hadoop-daemon.sh stop journalnode
stopping journalnode
[root@node3 hadoop]# jps
1489 QuorumPeerMain
2488 Jps
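As with startup, this can be driven from nn01 in one loop instead of logging in to each node (equivalent to the commands above):
[root@nn01 hadoop]# for i in node{1..3}; do ssh $i /usr/local/hadoop/sbin/hadoop-daemon.sh stop journalnode; done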
15.10 Start the cluster
15.10.1 Commands run on nn01
[root@nn01 hadoop]# ./sbin/start-all.sh
15.10.2 Commands run on nn02
[root@nn02 hadoop]# ./sbin/yarn-daemon.sh start resourcemanager
starting resourcemanager, logging to /usr/local/hadoop/logs/yarn-root-resourcemanager-nn02.out
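A quick way to confirm that every daemon came up is to run jps across all nodes from nn01 (a sketch). Roughly, nn01 and nn02 would be expected to show NameNode, DFSZKFailoverController, and ResourceManager, while node1 to node3 would show DataNode, JournalNode, NodeManager, and QuorumPeerMain:
[root@nn01 hadoop]# for i in nn01 nn02 node{1..3}; do echo $i; ssh $i jps; done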
15.11 Check the cluster status
–Get the NameNode state
[root@nn01 hadoop]# cd /usr/local/hadoop/
[root@nn01 hadoop]# ./bin/hdfs haadmin -getServiceState nn1
active
[root@nn01 hadoop]# ./bin/hdfs haadmin -getServiceState nn2
standby
–Get the ResourceManager state
[root@nn01 hadoop]# ./bin/yarn rmadmin -getServiceState rm1
active
[root@nn01 hadoop]# ./bin/yarn rmadmin -getServiceState rm2
standby
–Get node information
[root@nn01 hadoop]# ./bin/hdfs dfsadmin -report
[root@nn01 hadoop]# ./bin/yarn node -list
Total Nodes:3
Node-Id Node-State Node-Http-Address Number-of-Running-Containers
node1:44581 RUNNING node1:8042 0
node3:46272 RUNNING node3:8042 0
node2:41240 RUNNING node2:8042 0
–Active/standby failover test
[root@nn01 hadoop]# ./sbin/hadoop-daemon.sh stop namenode
After stopping it, querying nn1 now returns an error:
[root@nn01 hadoop]# ./bin/hdfs haadmin -getServiceState nn1
21/09/10 18:23:14 INFO ipc.Client: Retrying connect to server: nn01/192.168.0.10:8020. Already tried 0 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=1, sleepTime=1000 MILLISECONDS)
Operation failed: Call From nn01/192.168.0.10 to nn01:8020 failed on connection exception: java.net.ConnectException: Connection refused; For more details see: http://wiki.apache.org/hadoop/ConnectionRefused
Check nn2 again; it has become active:
[root@nn01 hadoop]# ./bin/hdfs haadmin -getServiceState nn2
active
Recovery is simple:
[root@nn01 hadoop]# ./sbin/hadoop-daemon.sh start namenode
starting namenode, logging to /usr/local/hadoop/logs/hadoop-root-namenode-nn01.out
[root@nn01 hadoop]# ./bin/hdfs haadmin -getServiceState nn1
standby
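The same failover behaviour can be exercised for YARN by stopping the active ResourceManager on nn01, checking that rm2 takes over, and then starting it again (a sketch following the same pattern as the NameNode test above):
[root@nn01 hadoop]# ./sbin/yarn-daemon.sh stop resourcemanager
[root@nn01 hadoop]# ./bin/yarn rmadmin -getServiceState rm2
[root@nn01 hadoop]# ./sbin/yarn-daemon.sh start resourcemanager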