Working Slurm cluster with test job execution

- Fix MUNGE key generation and distribution
- Add slurm.conf generated with the 'configurator easy' version of the Slurm config tool
- Ensure JOIN_TIMEOUT is passed to the provisioning script
- Execute hostname command as Slurm job for verification
Kris Lamoureux 2024-08-11 13:55:40 -04:00
parent f5f601af13
commit 9a402948c8
Signed by: kris
GPG Key ID: 3EDA9C3441EDA925
3 changed files with 105 additions and 5 deletions
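
The MUNGE fix in provision.sh below boils down to creating a single key on the manager, publishing it through the Vagrant synced folder, and installing it on every worker. The manager-side key creation commands are collapsed between the hunks, so the sketch here is an assumption about how that step might look (dd from /dev/urandom is one documented way to create a key; create-munge-key would also work):

# Manager (node1): create a key and publish it via the synced folder (hypothetical sketch)
dd if=/dev/urandom bs=1 count=1024 of=/etc/munge/munge.key
chown munge:munge /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
cp /etc/munge/munge.key /vagrant/munge.key

# Workers: install the shared key, then round-trip a credential to verify the daemon
cp -f /vagrant/munge.key /etc/munge/munge.key
chown munge:munge /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
systemctl enable --now munge.service
munge -n | unmunge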

Vagrantfile (vendored): 1 addition

@@ -51,6 +51,7 @@ Vagrant.configure(2) do |vm_config|
# Install and Setup Slurm
config.vm.provision "shell", inline: <<-SHELL
export JOIN_TIMEOUT=#{JOIN_TIMEOUT}
/bin/bash /vagrant/provision.sh
SHELL
end
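
The exported JOIN_TIMEOUT ends up in the environment of provision.sh, where the worker branch uses it to bound how long it waits for the manager's shared MUNGE key. The wait loop itself is collapsed in the hunk below (only its closing done is visible), so this is a sketch of what such a loop could look like; the poll interval and the 300-second fallback are illustrative assumptions:

# Hypothetical wait loop: give up after JOIN_TIMEOUT seconds if the key never appears
elapsed=0
until [ -f /vagrant/munge.key ]; do
    if [ "$elapsed" -ge "${JOIN_TIMEOUT:-300}" ]; then
        echo "Timed out waiting for /vagrant/munge.key" >&2
        exit 1
    fi
    sleep 5
    elapsed=$((elapsed + 5))
done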

provision.sh: 55 additions, 5 deletions

@@ -16,15 +16,36 @@ apt-get install -y chrony
systemctl start chrony
systemctl enable chrony
# Install MUNGE
apt-get update
apt-get install -y munge
# Create a dedicated non-privileged user account for MUNGE
getent group munge > /dev/null || groupadd -r -g 900 munge
id -u munge &>/dev/null || \
useradd -r -u 900 -g munge -d /var/lib/munge -s /usr/sbin/nologin munge
# Create slurm user
getent group slurm > /dev/null || groupadd -g 1001 slurm
id -u slurm &>/dev/null || \
useradd -m -u 1001 -g slurm -s /bin/bash slurm
# Create job 'submit' user
getent group submit > /dev/null || groupadd -g 1002 submit
id -u submit &>/dev/null || \
useradd -m -u 1002 -g submit -s /bin/bash submit
# Install MUNGE, stop the service, and remove any default key so the shared key can be put in place later
apt-get update
apt-get install -y munge
systemctl stop munge
rm -f /etc/munge/munge.key
# Create directories for Slurm
mkdir -p /var/spool/slurm /var/log/slurm /etc/slurm
chown slurm:slurm /var/spool/slurm /var/log/slurm /etc/slurm
# Copy slurm.conf
cp -u /vagrant/slurm.conf /etc/slurm/slurm.conf
chown slurm:slurm /etc/slurm/slurm.conf
chmod 644 /etc/slurm/slurm.conf
# node1 = manager
if [ "$(hostname)" == "node1" ]; then
# Create common MUNGE key on the manager node
@@ -42,6 +63,17 @@ if [ "$(hostname)" == "node1" ]; then
systemctl enable munge.service
systemctl start munge.service
munge -n | unmunge
# Install Slurm Workload Manager and doc package for the Slurm config tool
apt-get install -y slurm-wlm slurm-wlm-doc
# Create directories for slurmctld
mkdir -p /var/spool/slurmctld
chown slurm:slurm /var/spool/slurmctld
# Start Slurm controller
systemctl enable slurmctld
systemctl start slurmctld
else
# Initial delay
sleep 5
@@ -65,8 +97,26 @@ else
done
# Enable/start/test munge service
cp /vagrant/munge.key /etc/munge/munge.key
cp -f /vagrant/munge.key /etc/munge/munge.key
chown munge:munge /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
systemctl enable munge.service
systemctl start munge.service
munge -n | unmunge
# Submit job as 'submit' on node2
if [ "$(hostname)" == "node2" ]; then
# Install Slurm client tools
apt-get install -y slurm-client
# Submit a test job as the 'submit' user
sleep 10
sudo -u submit bash -c 'sbatch -N2 --wrap="srun hostname"'
sudo -u submit squeue
else
# Install SLURM compute node daemon on node3+
apt-get install -y slurmd
systemctl enable slurmd
systemctl start slurmd
fi
fi
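
Once provisioning has finished, the test job at the end of this script can be repeated by hand from node2. A short sketch, assuming the 'submit' user created above and the node[3-4] compute nodes defined in slurm.conf below; slurm-<jobid>.out is sbatch's default output file name:

# Re-run the 2-node test job from submit's home so the output file can be written there
sudo -u submit bash -c 'cd /home/submit && sbatch -N2 --wrap="srun hostname"'
# The job should appear in the queue and finish within a few seconds
sudo -u submit squeue
# Both compute nodes should return to the idle state afterwards
sinfo
# The output file should contain the node3 and node4 hostnames
sudo -u submit bash -c 'cat /home/submit/slurm-*.out'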

slurm.conf (new file): 49 additions

@@ -0,0 +1,49 @@
#slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=vcluster
SlurmctldHost=node1
#
#MailProg=/bin/mail
MpiDefault=none
#MpiParams=ports=#-#
ProctrackType=proctrack/cgroup
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
#SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
#SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
TaskPlugin=task/affinity
#
#
# TIMERS
#KillWait=30
#MinJobAge=300
#SlurmctldTimeout=120
#SlurmdTimeout=300
#
#
# SCHEDULING
SchedulerType=sched/backfill
SelectType=select/cons_tres
#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/none
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
#SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
#SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#
#
# COMPUTE NODES
NodeName=node[3-4] CPUs=2 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
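
The CPUs=2 value on the NodeName line has to agree with the hardware the compute VMs actually expose, otherwise the controller can mark the nodes invalid. A small cross-check, assuming it is run after provisioning:

# On a compute node (node3/node4): print the node line slurmd detects from the hardware
# and compare it with the NodeName entry above
slurmd -C
# From a node with the client tools (node1/node2): list all nodes as the controller sees them
sinfo -Nl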