From 9a402948c82a3c19cf85e718d58a985403dce9cc Mon Sep 17 00:00:00 2001
From: Kris Lamoureux
Date: Sun, 11 Aug 2024 13:55:40 -0400
Subject: [PATCH] Working Slurm cluster with test job execution

- Fix MUNGE key generation and distribution
- Add slurm.conf generated with the easy version of the Slurm configurator tool
- Ensure JOIN_TIMEOUT is passed to the provisioning script
- Execute the hostname command as a Slurm job for verification
---
 Vagrantfile  |  1 +
 provision.sh | 60 +++++++++++++++++++++++++++++++++++++++++++++++-----
 slurm.conf   | 49 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 5 deletions(-)
 create mode 100644 slurm.conf

diff --git a/Vagrantfile b/Vagrantfile
index d930d8b..e994a3f 100644
--- a/Vagrantfile
+++ b/Vagrantfile
@@ -51,6 +51,7 @@ Vagrant.configure(2) do |vm_config|
     # Install and Setup Slurm
     config.vm.provision "shell", inline: <<-SHELL
+      export JOIN_TIMEOUT=#{JOIN_TIMEOUT}
       /bin/bash /vagrant/provision.sh
     SHELL
   end
 end
diff --git a/provision.sh b/provision.sh
index e00627a..a0fe65f 100755
--- a/provision.sh
+++ b/provision.sh
@@ -16,15 +16,36 @@ apt-get install -y chrony
 systemctl start chrony
 systemctl enable chrony
 
-# Install MUNGE
-apt-get update
-apt-get install -y munge
-
 # Create a dedicated non-privileged user account for MUNGE
 getent group munge > /dev/null || groupadd -r -g 900 munge
 id -u munge &>/dev/null || \
     useradd -r -u 900 -g munge -d /var/lib/munge -s /usr/sbin/nologin munge
 
+# Create the slurm user
+getent group slurm > /dev/null || groupadd -g 1001 slurm
+id -u slurm &>/dev/null || \
+    useradd -m -u 1001 -g slurm -s /bin/bash slurm
+
+# Create the 'submit' user for job submission
+getent group submit > /dev/null || groupadd -g 1002 submit
+id -u submit &>/dev/null || \
+    useradd -m -u 1002 -g submit -s /bin/bash submit
+
+# Install MUNGE, stop the service, and remove any default key so the shared key can be placed later
+apt-get update
+apt-get install -y munge
+systemctl stop munge
+rm -f /etc/munge/munge.key
+
+# Create directories for Slurm
+mkdir -p /var/spool/slurm /var/log/slurm /etc/slurm
+chown slurm:slurm /var/spool/slurm /var/log/slurm /etc/slurm
+
+# Copy slurm.conf
+cp -u /vagrant/slurm.conf /etc/slurm/slurm.conf
+chown slurm:slurm /etc/slurm/slurm.conf
+chmod 644 /etc/slurm/slurm.conf
+
 # node1 = manager
 if [ "$(hostname)" == "node1" ]; then
     # Create common MUNGE key on the manager node
@@ -42,6 +63,17 @@ if [ "$(hostname)" == "node1" ]; then
     systemctl enable munge.service
     systemctl start munge.service
     munge -n | unmunge
+
+    # Install the Slurm Workload Manager and the doc package for the Slurm config tool
+    apt-get install -y slurm-wlm slurm-wlm-doc
+
+    # Create directories for slurmctld
+    mkdir -p /var/spool/slurmctld
+    chown slurm:slurm /var/spool/slurmctld
+
+    # Start the Slurm controller
+    systemctl enable slurmctld
+    systemctl start slurmctld
 else
     # Initial delay
     sleep 5
@@ -65,8 +97,26 @@ else
     done
 
     # Enable/start/test munge service
-    cp /vagrant/munge.key /etc/munge/munge.key
+    cp -f /vagrant/munge.key /etc/munge/munge.key
+    chown munge:munge /etc/munge/munge.key
+    chmod 400 /etc/munge/munge.key
     systemctl enable munge.service
     systemctl start munge.service
     munge -n | unmunge
+
+    # Submit a test job as 'submit' on node2
+    if [ "$(hostname)" == "node2" ]; then
+        # Install the Slurm client tools
+        apt-get install -y slurm-client
+
+        # Submit a test job as the 'submit' user
+        sleep 10
+        sudo -u submit bash -c 'sbatch -N2 --wrap="srun hostname"'
+        sudo -u submit squeue
+    else
+        # Install the Slurm compute node daemon on node3+
+        apt-get install -y slurmd
+        systemctl enable slurmd
+        systemctl start slurmd
+    fi
 fi
diff --git a/slurm.conf b/slurm.conf
new file mode 100644
index 0000000..cbfb6fb
--- /dev/null
+++ b/slurm.conf
@@ -0,0 +1,49 @@
+#slurm.conf file generated by configurator easy.html.
+# Put this file on all nodes of your cluster.
+# See the slurm.conf man page for more information.
+#
+ClusterName=vcluster
+SlurmctldHost=node1
+#
+#MailProg=/bin/mail
+MpiDefault=none
+#MpiParams=ports=#-#
+ProctrackType=proctrack/cgroup
+ReturnToService=1
+SlurmctldPidFile=/var/run/slurmctld.pid
+#SlurmctldPort=6817
+SlurmdPidFile=/var/run/slurmd.pid
+#SlurmdPort=6818
+SlurmdSpoolDir=/var/spool/slurmd
+SlurmUser=slurm
+#SlurmdUser=root
+StateSaveLocation=/var/spool/slurmctld
+SwitchType=switch/none
+TaskPlugin=task/affinity
+#
+#
+# TIMERS
+#KillWait=30
+#MinJobAge=300
+#SlurmctldTimeout=120
+#SlurmdTimeout=300
+#
+#
+# SCHEDULING
+SchedulerType=sched/backfill
+SelectType=select/cons_tres
+#
+#
+# LOGGING AND ACCOUNTING
+AccountingStorageType=accounting_storage/none
+#JobAcctGatherFrequency=30
+JobAcctGatherType=jobacct_gather/none
+#SlurmctldDebug=info
+SlurmctldLogFile=/var/log/slurmctld.log
+#SlurmdDebug=info
+SlurmdLogFile=/var/log/slurmd.log
+#
+#
+# COMPUTE NODES
+NodeName=node[3-4] CPUs=2 State=UNKNOWN
+PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
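
A quick manual check of the provisioned cluster could look like the sketch below; it is not part of the committed files, it assumes the node names and the 'submit' user defined in this patch, and the exact output will vary:

    # sketch of a post-provision verification pass (hypothetical session)
    vagrant up
    vagrant ssh node1 -c "sinfo"                      # debug partition should list node[3-4]
    vagrant ssh node1 -c "munge -n | unmunge"         # MUNGE round trip on the controller
    vagrant ssh node2 -c "sudo -u submit sbatch -N2 --wrap='srun hostname'"
    vagrant ssh node2 -c "sudo -u submit squeue"      # test job should appear, then complete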