Set up a functional cluster for basic operations
- Include cgroup.conf configuration
- Install slurm-client on compute nodes
- Move home dir to /vagrant for result sharing
- Add Makefile for streamlined setup and cleanup
- Fix idempotency issues
This commit is contained in:
parent
9a402948c8
commit
943b64fdcf
1
.gitignore
vendored
1
.gitignore
vendored
@ -2,3 +2,4 @@ munge.key
|
|||||||
nodes.rb
|
nodes.rb
|
||||||
.settings.yml
|
.settings.yml
|
||||||
.vagrant
|
.vagrant
|
||||||
|
vagrantup.log
|
||||||
|
10
Makefile
Normal file
10
Makefile
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
.PHONY: all vagrant clean
|
||||||
|
|
||||||
|
all: vagrant
|
||||||
|
|
||||||
|
vagrant:
|
||||||
|
vagrant up --no-destroy-on-error --no-color | tee ./vagrantup.log
|
||||||
|
|
||||||
|
clean:
|
||||||
|
vagrant destroy -f --no-color
|
||||||
|
rm -rf .vagrant vagrantup.log munge.key ./scratch/submit
|
10
cgroup.conf
Normal file
10
cgroup.conf
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
###
|
||||||
|
# Slurm cgroup support configuration file.
|
||||||
|
###
|
||||||
|
CgroupAutomount=yes
|
||||||
|
CgroupMountpoint=/sys/fs/cgroup
|
||||||
|
ConstrainCores=yes
|
||||||
|
ConstrainDevices=yes
|
||||||
|
ConstrainKmemSpace=no # avoid known kernel issues
|
||||||
|
ConstrainRAMSpace=yes
|
||||||
|
ConstrainSwapSpace=yes
|
65
provision.sh
65
provision.sh
@ -16,7 +16,7 @@ apt-get install -y chrony
|
|||||||
systemctl start chrony
|
systemctl start chrony
|
||||||
systemctl enable chrony
|
systemctl enable chrony
|
||||||
|
|
||||||
# Create a dedicated non-privileged user account for MUNGE
|
# Create MUNGE user
|
||||||
getent group munge > /dev/null || groupadd -r -g 900 munge
|
getent group munge > /dev/null || groupadd -r -g 900 munge
|
||||||
id -u munge &>/dev/null || \
|
id -u munge &>/dev/null || \
|
||||||
useradd -r -u 900 -g munge -d /var/lib/munge -s /usr/sbin/nologin munge
|
useradd -r -u 900 -g munge -d /var/lib/munge -s /usr/sbin/nologin munge
|
||||||
@ -26,25 +26,34 @@ getent group slurm > /dev/null || groupadd -g 1001 slurm
|
|||||||
id -u slurm &>/dev/null || \
|
id -u slurm &>/dev/null || \
|
||||||
useradd -m -u 1001 -g slurm -s /bin/bash slurm
|
useradd -m -u 1001 -g slurm -s /bin/bash slurm
|
||||||
|
|
||||||
# Create job 'submit' user
|
# Create submit user
|
||||||
getent group submit > /dev/null || groupadd -g 1002 submit
|
getent group submit > /dev/null || groupadd -g 1002 submit
|
||||||
id -u submit &>/dev/null || \
|
id -u submit &>/dev/null || \
|
||||||
useradd -m -u 1002 -g submit -s /bin/bash submit
|
useradd -M -u 1002 -g submit -s /bin/bash -d /vagrant/scratch/submit submit
|
||||||
|
mkdir -p /vagrant/scratch/submit
|
||||||
|
|
||||||
# Install MUNGE, remove any default key, and stop to another place key later
|
# Update APT cache
|
||||||
apt-get update
|
apt-get update
|
||||||
apt-get install -y munge
|
|
||||||
systemctl stop munge
|
# Install MUNGE, remove any default key, and stop to place another key later
|
||||||
rm -f /etc/munge/munge.key
|
if ! dpkg -s munge &>/dev/null; then
|
||||||
|
apt-get install -y munge
|
||||||
|
systemctl stop munge
|
||||||
|
rm -f /etc/munge/munge.key
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Install slurm client tools
|
||||||
|
dpkg -s slurm-client &>/dev/null || apt-get install -y slurm-client
|
||||||
|
|
||||||
# Create directories for Slurm
|
# Create directories for Slurm
|
||||||
mkdir -p /var/spool/slurm /var/log/slurm /etc/slurm
|
mkdir -p /var/spool/slurm /var/log/slurm /etc/slurm
|
||||||
chown slurm:slurm /var/spool/slurm /var/log/slurm /etc/slurm
|
chown slurm:slurm /var/spool/slurm /var/log/slurm /etc/slurm
|
||||||
|
|
||||||
# Copy slurm.conf
|
# Copy slurm.conf and cgroup.conf
|
||||||
cp -u /vagrant/slurm.conf /etc/slurm/slurm.conf
|
cp -u /vagrant/slurm.conf /etc/slurm/slurm.conf
|
||||||
chown slurm:slurm /etc/slurm/slurm.conf
|
cp -u /vagrant/cgroup.conf /etc/slurm/cgroup.conf
|
||||||
chmod 644 /etc/slurm/slurm.conf
|
chown slurm:slurm /etc/slurm/slurm.conf /etc/slurm/cgroup.conf
|
||||||
|
chmod 644 /etc/slurm/slurm.conf /etc/slurm/cgroup.conf
|
||||||
|
|
||||||
# node1 = manager
|
# node1 = manager
|
||||||
if [ "$(hostname)" == "node1" ]; then
|
if [ "$(hostname)" == "node1" ]; then
|
||||||
@ -53,34 +62,36 @@ if [ "$(hostname)" == "node1" ]; then
|
|||||||
sudo -u munge /usr/sbin/mungekey --verbose
|
sudo -u munge /usr/sbin/mungekey --verbose
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Set MUNGE key perms
|
|
||||||
chmod 600 /etc/munge/munge.key
|
|
||||||
|
|
||||||
# Copy to shared directory for other nodes
|
# Copy to shared directory for other nodes
|
||||||
cp /etc/munge/munge.key /vagrant/munge.key
|
cp /etc/munge/munge.key /vagrant/munge.key
|
||||||
|
|
||||||
# Enable/start/test munge service
|
# Enable/start/test munge service
|
||||||
systemctl enable munge.service
|
chmod 400 /etc/munge/munge.key
|
||||||
systemctl start munge.service
|
systemctl enable munge
|
||||||
munge -n | unmunge
|
systemctl start munge
|
||||||
|
|
||||||
# Install Slurm Workload Manager and doc package for the Slurm config tool
|
# Install Slurm Workload Manager and doc package for the Slurm config tool
|
||||||
|
if ! dpkg -s slurm-wlm &>/dev/null; then
|
||||||
apt-get install -y slurm-wlm slurm-wlm-doc
|
apt-get install -y slurm-wlm slurm-wlm-doc
|
||||||
|
|
||||||
# Create directories for slurmctld
|
# Create directories for slurmctld
|
||||||
|
systemctl stop slurmctld
|
||||||
mkdir -p /var/spool/slurmctld
|
mkdir -p /var/spool/slurmctld
|
||||||
chown slurm:slurm /var/spool/slurmctld
|
chown slurm:slurm /var/spool/slurmctld
|
||||||
|
chmod 755 /var/spool/slurmctld
|
||||||
|
|
||||||
# Start Slurm controller
|
# Start Slurm controller
|
||||||
systemctl enable slurmctld
|
systemctl enable slurmctld
|
||||||
systemctl start slurmctld
|
systemctl start slurmctld
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
# Initial delay
|
# Initial delay
|
||||||
sleep 5
|
sleep 5
|
||||||
|
|
||||||
# Wait up to JOIN_TIMEOUT seconds for the munge.key file to appear before giving up
|
# Wait up to JOIN_TIMEOUT seconds for the munge.key file to appear before giving up
|
||||||
START_TIME="$(date +%s)"
|
START_TIME="$(date +%s)"
|
||||||
# Wait until the munge.key can be found via Vagrant provider file sharing /vagrant
|
|
||||||
|
# Wait until the munge.key can be found via Vagrant provider file sharing
|
||||||
while [ ! -f /vagrant/munge.key ]; do
|
while [ ! -f /vagrant/munge.key ]; do
|
||||||
CURRENT_TIME="$(date +%s)"
|
CURRENT_TIME="$(date +%s)"
|
||||||
DIFF_TIME="$((CURRENT_TIME - START_TIME))"
|
DIFF_TIME="$((CURRENT_TIME - START_TIME))"
|
||||||
@ -100,21 +111,13 @@ else
|
|||||||
cp -f /vagrant/munge.key /etc/munge/munge.key
|
cp -f /vagrant/munge.key /etc/munge/munge.key
|
||||||
chown munge:munge /etc/munge/munge.key
|
chown munge:munge /etc/munge/munge.key
|
||||||
chmod 400 /etc/munge/munge.key
|
chmod 400 /etc/munge/munge.key
|
||||||
systemctl enable munge.service
|
systemctl enable munge
|
||||||
systemctl start munge.service
|
systemctl start munge
|
||||||
munge -n | unmunge
|
|
||||||
|
|
||||||
# Submit job as 'submit' on node2
|
# Install SLURM compute node daemon on node[3-4]
|
||||||
if [ "$(hostname)" == "node2" ]; then
|
if [[ $(hostname) == node[3-4] ]]; then
|
||||||
# Install Slurm client tools
|
mkdir -p /var/spool/slurmd
|
||||||
apt-get install -y slurm-client
|
chown slurm:slurm /var/spool/slurmd
|
||||||
|
|
||||||
# Submit a test job as the 'submit' user
|
|
||||||
sleep 10
|
|
||||||
sudo -u submit bash -c 'sbatch -N2 --wrap="srun hostname"'
|
|
||||||
sudo -u submit squeue
|
|
||||||
else
|
|
||||||
# Install SLURM compute node daemon on node3+
|
|
||||||
apt-get install -y slurmd
|
apt-get install -y slurmd
|
||||||
systemctl enable slurmd
|
systemctl enable slurmd
|
||||||
systemctl start slurmd
|
systemctl start slurmd
|
||||||
|
2
scratch/.gitignore
vendored
Normal file
2
scratch/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
*
|
||||||
|
!.gitignore
|
Loading…
Reference in New Issue
Block a user