#!/bin/bash

#######################################
### Install and setup Slurm cluster ###
#######################################

# Trace every command as it is executed (xtrace) and abort the script as
# soon as a command exits with a non-zero status (errexit).
set -o xtrace -o errexit

# Raise APT's retry count and its HTTP/FTP timeouts so that provisioning is
# more reliable. The drop-in file is rewritten on every run.
apt_retry_conf=/etc/apt/apt.conf.d/99custom-retries
printf '%s\n' \
  'Acquire::Retries "5";' \
  'Acquire::http::Timeout "120";' \
  'Acquire::ftp::Timeout "120";' \
  > "${apt_retry_conf}"

# Suppress interactive prompts for every package installation that follows.
DEBIAN_FRONTEND=noninteractive
export DEBIAN_FRONTEND

# Keep system clocks in sync: install chrony, then start and enable it.
apt-get update
apt-get install -y chrony
for chrony_action in start enable; do
  systemctl "${chrony_action}" chrony
done

# Create the MUNGE system group and system user (both with ID 900), skipping
# whichever of the two already exists.
if ! getent group munge > /dev/null; then
  groupadd -r -g 900 munge
fi
if ! id -u munge &> /dev/null; then
  useradd -r -u 900 -g munge -d /var/lib/munge -s /usr/sbin/nologin munge
fi

# Create the slurm group and user (both with ID 1001, home directory
# created by useradd), skipping whichever of the two already exists.
if ! getent group slurm > /dev/null; then
  groupadd -g 1001 slurm
fi
if ! id -u slurm &> /dev/null; then
  useradd -m -u 1001 -g slurm -s /bin/bash slurm
fi

# Create the submit group and user (both with ID 1002), skipping whichever
# already exists. useradd is told not to create the home directory (-M);
# the directory under /vagrant/scratch is created explicitly afterwards.
submit_home=/vagrant/scratch/submit
if ! getent group submit > /dev/null; then
  groupadd -g 1002 submit
fi
if ! id -u submit &> /dev/null; then
  useradd -M -u 1002 -g submit -s /bin/bash -d "${submit_home}" submit
fi
mkdir -p "${submit_home}"

# Refresh the APT package index.
apt-get update

# Install MUNGE only when the package is not present yet. Right after a
# fresh install the service is stopped and the packaged default key is
# removed, so that another key can be put in place later.
munge_present=yes
dpkg -s munge &> /dev/null || munge_present=no
if [[ "${munge_present}" == no ]]; then
  apt-get install -y munge
  systemctl stop munge
  rm -f /etc/munge/munge.key
fi

# Install the Slurm client tools unless the package is already present.
if ! dpkg -s slurm-client &> /dev/null; then
  apt-get install -y slurm-client
fi

# Create the Slurm spool and configuration directories and hand them over
# to the slurm user and group.
slurm_dirs=(/var/spool/slurm /etc/slurm)
mkdir -p "${slurm_dirs[@]}"
chown slurm:slurm "${slurm_dirs[@]}"

# Copy slurm.conf and cgroup.conf from the shared /vagrant directory into
# /etc/slurm (overwriting any existing copies), then make them owned by
# slurm:slurm with mode 644.
for conf_name in slurm.conf cgroup.conf; do
  cp -f "/vagrant/${conf_name}" "/etc/slurm/${conf_name}"
done
chown slurm:slurm /etc/slurm/slurm.conf /etc/slurm/cgroup.conf
chmod 644 /etc/slurm/slurm.conf /etc/slurm/cgroup.conf

# Role-specific setup.
#   node1      = manager: creates the shared MUNGE key, publishes it in
#                /vagrant and runs the Slurm controller (slurmctld).
#   other host = waits for the key to appear in /vagrant, installs it and
#                starts munge; node3 and node4 additionally run slurmd.
if [[ "$(hostname)" == "node1" ]]; then
  # Create common MUNGE key on the manager node (only if none exists yet)
  if [[ ! -f /etc/munge/munge.key ]]; then
    sudo -u munge /usr/sbin/mungekey --verbose
  fi

  # Copy to shared directory for other nodes
  cp /etc/munge/munge.key /vagrant/munge.key

  # Enable/start munge service
  chmod 400 /etc/munge/munge.key
  systemctl enable munge
  systemctl start munge

  # Install Slurm Workload Manager and doc package for the Slurm config tool.
  # Everything below runs only on the first install of slurm-wlm.
  if ! dpkg -s slurm-wlm &> /dev/null; then
    apt-get install -y slurm-wlm slurm-wlm-doc

    # Create directories for slurmctld (service is stopped while doing so)
    systemctl stop slurmctld
    mkdir -p /var/spool/slurmctld
    chown slurm:slurm /var/spool/slurmctld
    chmod 755 /var/spool/slurmctld

    # Start Slurm controller
    systemctl enable slurmctld
    systemctl start slurmctld
  fi
else
  # Maximum number of seconds to wait for the manager's munge.key.
  # A value supplied through the environment is honoured; without one the
  # comparison below would fail on an empty string and the loop would
  # never time out, so fall back to a default and insist on an integer.
  JOIN_TIMEOUT="${JOIN_TIMEOUT:-300}"
  if [[ ! "${JOIN_TIMEOUT}" =~ ^[0-9]+$ ]]; then
    echo "[ERROR]: JOIN_TIMEOUT must be a non-negative integer, got '${JOIN_TIMEOUT}'" >&2
    exit 1
  fi

  # Initial delay
  sleep 5

  # Waits JOIN_TIMEOUT seconds to find the munge.key file before giving up
  START_TIME="$(date +%s)"

  # Wait until the munge.key can be found via Vagrant provider file sharing
  while [[ ! -f /vagrant/munge.key ]]; do
    CURRENT_TIME="$(date +%s)"
    DIFF_TIME="$((CURRENT_TIME - START_TIME))"

    # Timeout: report on stderr and abort provisioning of this node
    if ((DIFF_TIME >= JOIN_TIMEOUT)); then
      echo "[ERROR]: $(hostname) waited $DIFF_TIME/$JOIN_TIMEOUT seconds" >&2
      exit 1
    fi

    # Waiting
    echo "Waiting ($DIFF_TIME/$JOIN_TIMEOUT seconds) for /vagrant/munge.key file"
    sleep 10
  done

  # Short pause after the key file became visible, before copying it
  sleep 3

  # Install the shared key with the ownership and mode munge expects
  cp -f /vagrant/munge.key /etc/munge/munge.key
  chown munge:munge /etc/munge/munge.key
  chmod 400 /etc/munge/munge.key

  # Enable munge and (re)start it. restart instead of start so that a
  # daemon that is already running picks up the key that was just copied;
  # on a stopped service restart behaves like start.
  systemctl enable munge
  systemctl restart munge

  # Install SLURM compute node daemon on node[3-4]
  if [[ "$(hostname)" == node[3-4] ]]; then
    mkdir -p /var/spool/slurmd
    chown slurm:slurm /var/spool/slurmd
    apt-get install -y slurmd
    systemctl enable slurmd
    systemctl start slurmd
  fi
fi