Slurm Cluster Setup

1. Environment Preparation

# disable SELinux permanently (takes effect after a reboot)
#vi /etc/sysconfig/selinux
# SELINUX=disabled
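To turn SELinux off for the current boot as well (the config edit above only applies after a reboot), a quick check:

setenforce 0   # permissive until the next reboot; the config edit makes it permanent
getenforce     # should print Permissive (or Disabled after a reboot)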

systemctl stop firewalld
systemctl disable firewalld

yum -y install epel-release
yum repolist
yum install axel yum-axelget
yum install ntp -y
systemctl enable ntpd
ntpdate pool.ntp.org
systemctl start ntpd

# set a unique hostname on each node (master, client01, ...)
hostnamectl --static set-hostname newname

#vi /etc/hosts
1.1.1.100 master
1.1.1.101 client01
1.1.1.102 client02
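A quick reachability check, assuming the hostnames above resolve on every node:

for h in master client01 client02; do
    ping -c1 -W1 $h >/dev/null && echo "$h ok" || echo "$h unreachable"
done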

2. NIS Installation

server

yum -y install ypserv rpcbind
nisdomainname simcloud.com
echo "nisdomainname simcloud.com"  >>/etc/rc.local
echo "NISDOMAIN=simcloud.com" >> /etc/sysconfig/network

#cat /etc/sysconfig/network
YPSERV_ARGS="-p 1011"
     
#/etc/sysconfig/yppasswdd
YPPASSWDD_ARGS="--port 1012"
            
#cat /etc/ypserv.conf
dns: no
files: 30
xfr_check_port: yes
* : * : shadow.byname : port
* : * : passwd.adjunct.byname : port
        
systemctl restart rpcbind
systemctl restart ypserv                     
systemctl restart yppasswdd                  
        
systemctl enable rpcbind
systemctl enable ypserv
systemctl enable yppasswdd
        
rpcinfo -p localhost
rpcinfo -u localhost ypserv
    
/usr/lib64/yp/ypinit -m
   
make -C /var/yp
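To confirm the maps were built and are being served (assuming at least one regular account already exists on the master):

ypcat -d simcloud.com -h localhost passwd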

client

yum install -y rpcbind yp-tools ypbind
nisdomainname simcloud.com
echo "nisdomainname simcloud.com"  >>/etc/rc.local
echo "NISDOMAIN=simcloud.com" >> /etc/sysconfig/network
        
#cat /etc/nsswitch.conf
        
passwd: files nis
shadow: files nis
group:  files nis
hosts:  files nis dns
    
#cat /etc/sysconfig/authconfig
USENIS=yes

#cat /etc/pam.d/system-auth
password    sufficient    pam_unix.so sha512 shadow nis nullok try_first_pass use_authtok

#cat /etc/yp.conf
# point at the NIS master's address
domain simcloud.com server 192.168.18.128
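Instead of editing nsswitch.conf, authconfig, and the PAM stack by hand, authconfig can apply the same changes in one shot (server address as in yp.conf above):

authconfig --enablenis --nisdomain=simcloud.com --nisserver=192.168.18.128 --update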
        
systemctl restart rpcbind
systemctl restart ypbind
        
systemctl enable rpcbind
systemctl enable ypbind

yptest
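If yptest passes, a few more spot checks from the client:

ypwhich        # prints the NIS server this client is bound to
ypcat passwd   # dumps the passwd map from the server
id slurm       # resolves once the slurm account exists (created in step 5)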


3. NFS Installation

server

yum -y install nfs-utils
systemctl enable rpcbind
systemctl enable nfs
systemctl start rpcbind
systemctl start nfs
rpcinfo -p localhost | grep nfs
    100003    3   tcp   2049  nfs
    100003    4   tcp   2049  nfs
    100227    3   tcp   2049  nfs_acl
    100003    3   udp   2049  nfs
    100003    4   udp   2049  nfs
    100227    3   udp   2049  nfs_acl

chmod 755 /home
chmod 755 /opt
# /etc/exports format: shareDir ip(rw,no_root_squash,no_all_squash,sync)
# ip: client range, e.g. 192.168.0.0/24; * means any host, i.e. no restriction
# rw: read-write access
# sync: write changes to the shared directory synchronously
# no_root_squash: remote root keeps root privileges (not squashed to an anonymous user)
# no_all_squash: regular users keep their own identities

#vi /etc/exports
/home *(rw,no_root_squash,sync)
/opt  *(rw,no_root_squash,sync)

systemctl restart nfs
[root@mom01 home]#  showmount -e localhost
Export list for localhost:
/opt  *
/home *
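exportfs lists the active exports together with their effective options:

exportfs -v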

# to make the client mounts permanent, add them to /etc/fstab (see the client section below)
# vi /etc/fstab

client

yum -y install nfs-utils
systemctl enable rpcbind
systemctl start rpcbind
[root@boy01 ~]# showmount -e mom01
Export list for mom01:
/opt  *
/home *
[root@boy01 ~]# mount mom01:/opt /opt
[root@boy01 ~]# mount mom01:/home /home
[root@boy01 ~]# ls /home/
c1  cndaqang  test
#vi /etc/fstab
mom01:/home     /home                   nfs     defaults        0 0
mom01:/opt      /opt                    nfs     defaults        0 0
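The fstab entries can be verified without a reboot; mount -a mounts everything listed in /etc/fstab (unmount the manual mounts from above first):

umount /home /opt
mount -a
df -h /home /opt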

4. MUNGE Installation

yum -y install epel-release
yum -y install python python3 gtk2 gtk2-devel munge munge-devel perl gcc gcc-c++ polkit
systemctl start polkit

mkdir -p /usr/local/etc
echo "
#slurm
USRLOCAL=/usr/local
export LD_LIBRARY_PATH=\${USRLOCAL}/lib:\$LD_LIBRARY_PATH
export LIBRARY_PATH=\${USRLOCAL}/lib:\$LIBRARY_PATH
export LIBRARY_PATH=\${USRLOCAL}/lib64:\$LIBRARY_PATH
export C_INCLUDE_PATH=\${USRLOCAL}/include:\$C_INCLUDE_PATH
export PATH=\${USRLOCAL}/bin:\$PATH
export PATH=\${USRLOCAL}/sbin:\$PATH
" >> /etc/profile

[root@master source]# /usr/sbin/create-munge-key
Generating a pseudo-random key using /dev/urandom completed.
# the key must be identical on every node; copy it to each one
[root@master source]# scp /etc/munge/munge.key node8:/etc/munge

chown munge:munge /etc/munge
chown munge:munge /var/run/munge
chown munge:munge /var/lib/munge
chown munge:munge /var/log/munge
chown munge:munge /etc/munge/munge.key

vi /usr/lib/systemd/system/munge.service

[Unit]
Description=MUNGE authentication service 
Documentation=man:munged(8)
After=network.target
After=syslog.target
After=time-sync.target

[Service]
Type=forking
ExecStart=/usr/sbin/munged --syslog
PIDFile=/var/run/munge/munged.pid
User=munge
Group=munge
Restart=on-abort
ExecStartPre=-/usr/bin/mkdir -m 0755 -p /var/log/munge
ExecStartPre=-/usr/bin/chown -R munge:munge /var/log/munge
ExecStartPre=-/usr/bin/mkdir -m 0755 -p /var/run/munge
ExecStartPre=-/usr/bin/chown -R munge:munge /var/run/munge

[Install]
WantedBy=multi-user.target

systemctl daemon-reload

[root@master ~]# systemctl start munge
[root@master ~]# systemctl status munge
[root@client01 ~]# systemctl status munge
● munge.service - MUNGE authentication service
   Loaded: loaded (/usr/lib/systemd/system/munge.service; enabled; vendor preset: disabled)
   Active: active (running) since Sat 2020-11-07 11:58:51 CST; 10min ago
     Docs: man:munged(8)
  Process: 4684 ExecStart=/usr/sbin/munged --syslog (code=exited, status=0/SUCCESS)
  Process: 4660 ExecStartPre=/usr/bin/chown -R munge:munge /var/run/munge (code=exited, status=0/SUCCESS)
  Process: 4609 ExecStartPre=/usr/bin/mkdir -m 0755 -p /var/run/munge (code=exited, status=0/SUCCESS)
  Process: 4588 ExecStartPre=/usr/bin/chown -R munge:munge /var/log/munge (code=exited, status=1/FAILURE)
  Process: 4546 ExecStartPre=/usr/bin/mkdir -m 0755 -p /var/log/munge (code=exited, status=0/SUCCESS)
 Main PID: 4724 (munged)
    Tasks: 4
   Memory: 672.0K
   CGroup: /system.slice/munge.service
           └─4724 /usr/sbin/munged --syslog

Nov 07 11:58:51 client01 munged[4724]: Found 3 users with supplementary groups in 0.003 seconds
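With munged running everywhere, verify that credentials round-trip both locally and across nodes (the cross-node check assumes the same munge.key was copied to client01):

munge -n | unmunge                # local test, should report STATUS: Success (0)
munge -n | ssh client01 unmunge   # encode on this node, decode on client01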


5. Slurm Installation

[root@master source]# useradd slurm
[root@master source]# passwd slurm
# push the new account into the NIS maps
[root@master source]# make -C /var/yp
# clients skip this step; once bound to NIS they pick up the slurm account automatically
rm -rf /var/spool/slurm-llnl
mkdir /var/spool/slurm-llnl
chown -R slurm:slurm /var/spool/slurm-llnl
rm -rf /var/run/slurm-llnl/
mkdir /var/run/slurm-llnl/
chown -R slurm:slurm /var/run/slurm-llnl/

cd /opt/source/
# download the latest release from https://download.schedmd.com/slurm/
wget https://download.schedmd.com/slurm/slurm-20.11.0-0rc1.tar.bz2
tar -jxvf slurm-20.11.0-0rc1.tar.bz2
cd slurm-20.11.0-0rc1/
./configure   # installs to /usr/local by default
make -j90     # note: the build requires python3
make install
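Confirm the freshly built binaries are picked up from /usr/local (the PATH set in step 4):

slurmctld -V
slurmd -V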

cp etc/{slurmctld.service,slurmdbd.service,slurmd.service} /usr/lib/systemd/system

[root@master slurm-20.11.0-0rc1]# cat /usr/lib/systemd/system/slurmctld.service 
[Unit]
Description=Slurm controller daemon
After=network.target munge.service
ConditionPathExists=/usr/local/etc/slurm.conf

[Service]
Type=simple
EnvironmentFile=-/etc/sysconfig/slurmctld
ExecStart=/usr/local/sbin/slurmctld -D $SLURMCTLD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
LimitNOFILE=65536


[Install]
WantedBy=multi-user.target
[root@master slurm-20.11.0-0rc1]# cat /usr/lib/systemd/system/slurmd.service 
[Unit]
Description=Slurm node daemon
After=munge.service network.target remote-fs.target
#ConditionPathExists=/usr/local/etc/slurm.conf

[Service]
Type=simple
EnvironmentFile=-/etc/sysconfig/slurmd
ExecStart=/usr/local/sbin/slurmd -D $SLURMD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
LimitNOFILE=131072
LimitMEMLOCK=infinity
LimitSTACK=infinity
Delegate=yes


[Install]
WantedBy=multi-user.target

cat << EOF > /usr/local/etc/slurm.conf
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
SlurmctldHost=master
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=999999
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=1
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=5000
#MaxStepCount=40000
#MaxTasksPerNode=128
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/pgid
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/spool/slurm-llnl/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/spool/slurm-llnl/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm-llnl
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurm-llnl
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
AccountingStoreJobComment=YES
ClusterName=cluster
#DebugFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=info
#SlurmctldLogFile=
SlurmdDebug=info
#SlurmdLogFile=
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=master,client01 CPUs=96 State=UNKNOWN
PartitionName=long Nodes=master,client01 Default=YES MaxTime=INFINITE State=UP
EOF
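CPUs=96 above matches this particular cluster; on any node, slurmd can print the hardware line it detects, which can be pasted into slurm.conf:

slurmd -C   # prints NodeName=... CPUs=... Boards=... RealMemory=... for this machine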

chown slurm:slurm /usr/local/etc/slurm.conf

systemctl start slurmd
systemctl enable slurmd
systemctl start slurmctld
systemctl enable slurmctld

[root@master slurm-20.11.0-0rc1]# systemctl status slurmctld
● slurmctld.service - Slurm controller daemon
   Loaded: loaded (/usr/lib/systemd/system/slurmctld.service; enabled; vendor preset: disabled)
   Active: active (running) since Fri 2020-11-06 22:41:46 CST; 8s ago
 Main PID: 101889 (slurmctld)
   CGroup: /system.slice/slurmctld.service
           └─101889 /usr/local/sbin/slurmctld -D

Nov 06 22:41:46 master systemd[1]: Started Slurm controller daemon.
[root@master slurm-20.11.0-0rc1]# systemctl status slurmd
● slurmd.service - Slurm node daemon
   Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor preset: disabled)
   Active: active (running) since Fri 2020-11-06 22:41:36 CST; 20s ago
 Main PID: 101848 (slurmd)
   CGroup: /system.slice/slurmd.service
           └─101848 /usr/local/sbin/slurmd -D

Nov 06 22:41:36 master systemd[1]: Started Slurm node daemon.
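A quick functional test once slurmctld and slurmd are up on all nodes (srun -N2 assumes both nodes joined the partition):

sinfo               # partition "long" should list master and client01 as idle
srun -N2 hostname   # runs hostname on two nodes through the scheduler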

6. Miscellaneous

# Firewall configuration (only needed if firewalld must stay enabled; step 1 disabled it)

systemctl start firewalld
firewall-cmd --list-all
firewall-cmd --permanent --add-service=mountd --add-service=nfs --add-service=rpcbind
firewall-cmd --permanent --add-port=177/udp
firewall-cmd --permanent --add-rich-rule='rule family="ipv4" source address="10.10.10.103" port port="6818" protocol="tcp" accept'
firewall-cmd --reload   # --permanent rules take effect only after a reload


References:
https://cndaqiang.github.io/2020/11/06/slurm-Centos7/
https://cndaqiang.github.io/2019/09/19/Centos7-CC19/
https://www.cnblogs.com/liuyongqian/articles/10789946.html
https://qizhanming.com/blog/2018/08/08/how-to-install-nfs-on-centos-7
