Set up a functional cluster for basic operations

- Include cgroup.conf configuration
- Install slurm-client on compute nodes
- Move home dir to /vagrant for result sharing
- Add Makefile for streamlined setup and cleanup
- Fix idempotency issues
This commit is contained in:
Kris Lamoureux 2024-08-11 23:23:38 -04:00
parent 9a402948c8
commit 943b64fdcf
Signed by: kris
GPG Key ID: 3EDA9C3441EDA925
5 changed files with 64 additions and 38 deletions

1
.gitignore vendored
View File

@ -2,3 +2,4 @@ munge.key
nodes.rb
.settings.yml
.vagrant
vagrantup.log

10
Makefile Normal file
View File

@ -0,0 +1,10 @@
# Convenience wrapper around Vagrant for bringing the cluster up and tearing
# it down again. None of the targets correspond to files, so all are phony.

# bash is required for `set -o pipefail` in the `vagrant` recipe below.
SHELL := bash

.PHONY: all vagrant clean

# Default goal: provision the cluster.
all: vagrant

# Bring the VMs up, mirroring the output into ./vagrantup.log.
# `set -o pipefail` makes the recipe fail when `vagrant up` fails; without it
# the pipeline's status is that of `tee`, which hides provisioning errors.
vagrant:
	set -o pipefail; vagrant up --no-destroy-on-error --no-color | tee ./vagrantup.log

# Destroy the VMs and remove the files listed in the `rm` line below.
clean:
	vagrant destroy -f --no-color
	rm -rf .vagrant vagrantup.log munge.key ./scratch/submit

10
cgroup.conf Normal file
View File

@ -0,0 +1,10 @@
###
# Slurm cgroup support configuration file.
#
# The provisioning script copies this file to /etc/slurm/cgroup.conf.
###
# Cgroup hierarchy: automount is enabled and the mount point is /sys/fs/cgroup.
CgroupAutomount=yes
CgroupMountpoint=/sys/fs/cgroup
# Constraints: cores, devices, RAM and swap are enabled; only the
# kernel-memory constraint is switched off.
ConstrainCores=yes
ConstrainDevices=yes
ConstrainKmemSpace=no # disabled to avoid known kernel issues
ConstrainRAMSpace=yes
ConstrainSwapSpace=yes

View File

@ -16,7 +16,7 @@ apt-get install -y chrony
systemctl start chrony
systemctl enable chrony
# Create a dedicated non-privileged user account for MUNGE
# Create MUNGE user
getent group munge > /dev/null || groupadd -r -g 900 munge
id -u munge &>/dev/null || \
useradd -r -u 900 -g munge -d /var/lib/munge -s /usr/sbin/nologin munge
@ -26,25 +26,34 @@ getent group slurm > /dev/null || groupadd -g 1001 slurm
id -u slurm &>/dev/null || \
useradd -m -u 1001 -g slurm -s /bin/bash slurm
# Create job 'submit' user
# Create submit user
getent group submit > /dev/null || groupadd -g 1002 submit
id -u submit &>/dev/null || \
useradd -m -u 1002 -g submit -s /bin/bash submit
useradd -M -u 1002 -g submit -s /bin/bash -d /vagrant/scratch/submit submit
mkdir -p /vagrant/scratch/submit
# Install MUNGE, remove any default key, and stop to another place key later
# Update APT cache
apt-get update
# Install MUNGE, remove any default key, and stop to place another key later
# The whole block is skipped when `dpkg -s munge` succeeds, so a re-run of the
# script does not stop the service or delete /etc/munge/munge.key again.
if ! dpkg -s munge &>/dev/null; then
apt-get install -y munge
systemctl stop munge
rm -f /etc/munge/munge.key
fi
# Install slurm client tools
dpkg -s slurm-client &>/dev/null || apt-get install -y slurm-client
# Create directories for Slurm
mkdir -p /var/spool/slurm /var/log/slurm /etc/slurm
chown slurm:slurm /var/spool/slurm /var/log/slurm /etc/slurm
# Copy slurm.conf
# Copy slurm.conf and cgroup.conf
cp -u /vagrant/slurm.conf /etc/slurm/slurm.conf
chown slurm:slurm /etc/slurm/slurm.conf
chmod 644 /etc/slurm/slurm.conf
cp -u /vagrant/cgroup.conf /etc/slurm/cgroup.conf
chown slurm:slurm /etc/slurm/slurm.conf /etc/slurm/cgroup.conf
chmod 644 /etc/slurm/slurm.conf /etc/slurm/cgroup.conf
# node1 = manager
if [ "$(hostname)" == "node1" ]; then
@ -53,34 +62,36 @@ if [ "$(hostname)" == "node1" ]; then
sudo -u munge /usr/sbin/mungekey --verbose
fi
# Set MUNGE key perms
chmod 600 /etc/munge/munge.key
# Copy to shared directory for other nodes
cp /etc/munge/munge.key /vagrant/munge.key
# Enable/start/test munge service
systemctl enable munge.service
systemctl start munge.service
munge -n | unmunge
chmod 400 /etc/munge/munge.key
systemctl enable munge
systemctl start munge
# Install Slurm Workload Manager and doc package for the Slurm config tool
# Everything up to the closing `fi` runs only while `dpkg -s slurm-wlm` fails;
# once the package is installed, a re-run skips the install, the spool
# directory setup and the enable/start of slurmctld.
if ! dpkg -s slurm-wlm &>/dev/null; then
apt-get install -y slurm-wlm slurm-wlm-doc
# Create directories for slurmctld
# NOTE(review): the stop presumably handles the package starting the daemon
# on install, before its spool directory exists — confirm.
systemctl stop slurmctld
mkdir -p /var/spool/slurmctld
chown slurm:slurm /var/spool/slurmctld
chmod 755 /var/spool/slurmctld
# Start Slurm controller
systemctl enable slurmctld
systemctl start slurmctld
fi
else
# Initial delay
sleep 5
# Wait up to JOIN_TIMEOUT seconds for the munge.key file before giving up
START_TIME="$(date +%s)"
# Wait until the munge.key can be found via Vagrant provider file sharing /vagrant
# Wait until the munge.key can be found via Vagrant provider file sharing
while [ ! -f /vagrant/munge.key ]; do
CURRENT_TIME="$(date +%s)"
DIFF_TIME="$((CURRENT_TIME - START_TIME))"
@ -100,21 +111,13 @@ else
cp -f /vagrant/munge.key /etc/munge/munge.key
chown munge:munge /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
systemctl enable munge.service
systemctl start munge.service
munge -n | unmunge
systemctl enable munge
systemctl start munge
# Submit job as 'submit' on node2
if [ "$(hostname)" == "node2" ]; then
# Install Slurm client tools
apt-get install -y slurm-client
# Submit a test job as the 'submit' user
sleep 10
sudo -u submit bash -c 'sbatch -N2 --wrap="srun hostname"'
sudo -u submit squeue
else
# Install SLURM compute node daemon on node3+
# Install SLURM compute node daemon on node[3-4]
if [[ $(hostname) == node[3-4] ]]; then
mkdir -p /var/spool/slurmd
chown slurm:slurm /var/spool/slurmd
apt-get install -y slurmd
systemctl enable slurmd
systemctl start slurmd

2
scratch/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*
!.gitignore