Working Slurm cluster with test job execution
- Fix MUNGE key generation and distribution
- Add Slurm config generated with the easy version of the config tool
- Ensure JOIN_TIMEOUT is passed to the provisioning script
- Run the hostname command as a Slurm job for verification
parent f5f601af13
commit 9a402948c8
Vagrantfile (vendored): 1 change
@@ -51,6 +51,7 @@ Vagrant.configure(2) do |vm_config|
   # Install and Setup Slurm
   config.vm.provision "shell", inline: <<-SHELL
+    export JOIN_TIMEOUT=#{JOIN_TIMEOUT}
     /bin/bash /vagrant/provision.sh
   SHELL
 end
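Note: JOIN_TIMEOUT is exported into the provisioning shell so provision.sh can bound how long worker nodes wait to join the cluster. As an illustrative sketch only (the variable and key-file names mirror this repo, but the loop itself is hypothetical, not the code in provision.sh), a wait loop honoring JOIN_TIMEOUT could look like:

    # Hypothetical sketch: wait up to JOIN_TIMEOUT seconds for the shared key
    elapsed=0
    while [ ! -f /vagrant/munge.key ]; do
        if [ "$elapsed" -ge "${JOIN_TIMEOUT:-300}" ]; then
            echo "Timed out waiting for munge.key" >&2
            exit 1
        fi
        sleep 5
        elapsed=$((elapsed + 5))
    done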
provision.sh: 60 changes
@@ -16,15 +16,36 @@ apt-get install -y chrony
 systemctl start chrony
 systemctl enable chrony
 
-# Install MUNGE
-apt-get update
-apt-get install -y munge
-
 # Create a dedicated non-privileged user account for MUNGE
 getent group munge > /dev/null || groupadd -r -g 900 munge
 id -u munge &>/dev/null || \
     useradd -r -u 900 -g munge -d /var/lib/munge -s /usr/sbin/nologin munge
 
+# Create slurm user
+getent group slurm > /dev/null || groupadd -g 1001 slurm
+id -u slurm &>/dev/null || \
+    useradd -m -u 1001 -g slurm -s /bin/bash slurm
+
+# Create job 'submit' user
+getent group submit > /dev/null || groupadd -g 1002 submit
+id -u submit &>/dev/null || \
+    useradd -m -u 1002 -g submit -s /bin/bash submit
+
+# Install MUNGE, stop the service, and remove the default key; the shared key is put in place later
+apt-get update
+apt-get install -y munge
+systemctl stop munge
+rm -f /etc/munge/munge.key
+
+# Create directories for Slurm
+mkdir -p /var/spool/slurm /var/log/slurm /etc/slurm
+chown slurm:slurm /var/spool/slurm /var/log/slurm /etc/slurm
+
+# Copy slurm.conf
+cp -u /vagrant/slurm.conf /etc/slurm/slurm.conf
+chown slurm:slurm /etc/slurm/slurm.conf
+chmod 644 /etc/slurm/slurm.conf
+
 # node1 = manager
 if [ "$(hostname)" == "node1" ]; then
     # Create common MUNGE key on the manager node
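Note: MUNGE and Slurm both require matching uid/gid values on every node, which is why the users above are created with fixed ids (900, 1001, 1002). A quick manual check on each node (illustrative follow-up, not part of this diff):

    # Confirm the fixed uids/gids line up on this node
    for u in munge slurm submit; do id "$u"; done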
@@ -42,6 +63,17 @@ if [ "$(hostname)" == "node1" ]; then
     systemctl enable munge.service
     systemctl start munge.service
     munge -n | unmunge
+
+    # Install Slurm Workload Manager and the doc package for the Slurm config tool
+    apt-get install -y slurm-wlm slurm-wlm-doc
+
+    # Create directories for slurmctld
+    mkdir -p /var/spool/slurmctld
+    chown slurm:slurm /var/spool/slurmctld
+
+    # Start Slurm controller
+    systemctl enable slurmctld
+    systemctl start slurmctld
 else
     # Initial delay
     sleep 5
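Note: once slurmctld is running on node1, the standard slurm-wlm client commands can confirm the controller is healthy (shown here as an illustrative follow-up, not part of this diff):

    # On node1: check that the controller responds, then list partitions/nodes
    scontrol ping
    sinfo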
@@ -65,8 +97,26 @@ else
     done
 
     # Enable/start/test munge service
-    cp /vagrant/munge.key /etc/munge/munge.key
+    cp -f /vagrant/munge.key /etc/munge/munge.key
+    chown munge:munge /etc/munge/munge.key
+    chmod 400 /etc/munge/munge.key
     systemctl enable munge.service
     systemctl start munge.service
     munge -n | unmunge
+
+    # Submit job as 'submit' on node2
+    if [ "$(hostname)" == "node2" ]; then
+        # Install Slurm client tools
+        apt-get install -y slurm-client
+
+        # Submit a test job as the 'submit' user
+        sleep 10
+        sudo -u submit bash -c 'sbatch -N2 --wrap="srun hostname"'
+        sudo -u submit squeue
+    else
+        # Install the Slurm compute node daemon on node3+
+        apt-get install -y slurmd
+        systemctl enable slurmd
+        systemctl start slurmd
+    fi
 fi
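Note: the test job's queue state can be polled per user, and full job details listed with scontrol (illustrative follow-up commands, not part of this diff):

    # Check the test job from node2
    sudo -u submit squeue -u submit
    scontrol show job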
slurm.conf (new file): 49 lines
@@ -0,0 +1,49 @@
+#slurm.conf file generated by configurator easy.html.
+# Put this file on all nodes of your cluster.
+# See the slurm.conf man page for more information.
+#
+ClusterName=vcluster
+SlurmctldHost=node1
+#
+#MailProg=/bin/mail
+MpiDefault=none
+#MpiParams=ports=#-#
+ProctrackType=proctrack/cgroup
+ReturnToService=1
+SlurmctldPidFile=/var/run/slurmctld.pid
+#SlurmctldPort=6817
+SlurmdPidFile=/var/run/slurmd.pid
+#SlurmdPort=6818
+SlurmdSpoolDir=/var/spool/slurmd
+SlurmUser=slurm
+#SlurmdUser=root
+StateSaveLocation=/var/spool/slurmctld
+SwitchType=switch/none
+TaskPlugin=task/affinity
+#
+#
+# TIMERS
+#KillWait=30
+#MinJobAge=300
+#SlurmctldTimeout=120
+#SlurmdTimeout=300
+#
+#
+# SCHEDULING
+SchedulerType=sched/backfill
+SelectType=select/cons_tres
+#
+#
+# LOGGING AND ACCOUNTING
+AccountingStorageType=accounting_storage/none
+#JobAcctGatherFrequency=30
+JobAcctGatherType=jobacct_gather/none
+#SlurmctldDebug=info
+SlurmctldLogFile=/var/log/slurmctld.log
+#SlurmdDebug=info
+SlurmdLogFile=/var/log/slurmd.log
+#
+#
+# COMPUTE NODES
+NodeName=node[3-4] CPUs=2 State=UNKNOWN
+PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
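Note: the NodeName line must match what slurmd actually detects on the compute nodes; slurmd -C prints the detected hardware in slurm.conf syntax, which is an easy way to confirm CPUs=2 is correct (illustrative check, not part of this diff):

    # On node3/node4: print the detected node configuration in slurm.conf format
    slurmd -C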