Set up a functional cluster for basic operations
- Include cgroup.conf configuration
- Install slurm-client on compute nodes
- Move the submit user's home dir under /vagrant for result sharing
- Add Makefile for streamlined setup and cleanup
- Fix idempotency issues
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -2,3 +2,4 @@ munge.key | ||||
| nodes.rb | ||||
| .settings.yml | ||||
| .vagrant | ||||
| vagrantup.log | ||||
|   | ||||
							
								
								
									
										10
									
								
								Makefile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								Makefile
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | ||||
| .PHONY: all vagrant clean | ||||
|  | ||||
| all: vagrant | ||||
|  | ||||
| vagrant: | ||||
| 	vagrant up --no-destroy-on-error --no-color | tee ./vagrantup.log | ||||
|  | ||||
| clean: | ||||
| 	vagrant destroy -f --no-color | ||||
| 	rm -rf .vagrant vagrantup.log munge.key ./scratch/submit | ||||
							
								
								
									
										10
									
								
								cgroup.conf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								cgroup.conf
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | ||||
| ### | ||||
| # Slurm cgroup support configuration file. | ||||
| ### | ||||
| CgroupAutomount=yes | ||||
| CgroupMountpoint=/sys/fs/cgroup | ||||
| ConstrainCores=yes | ||||
| ConstrainDevices=yes | ||||
| ConstrainKmemSpace=no        #avoid known Kernel issues | ||||
| ConstrainRAMSpace=yes | ||||
| ConstrainSwapSpace=yes | ||||
							
								
								
									
										65
									
								
								provision.sh
									
									
									
									
									
								
							
							
						
						
									
										65
									
								
								provision.sh
									
									
									
									
									
								
							| @@ -16,7 +16,7 @@ apt-get install -y chrony | ||||
| systemctl start chrony | ||||
| systemctl enable chrony | ||||
|  | ||||
| # Create a dedicated non-privileged user account for MUNGE | ||||
| # Create MUNGE user | ||||
| getent group munge > /dev/null || groupadd -r -g 900 munge | ||||
| id -u munge &>/dev/null || \ | ||||
|   useradd -r -u 900 -g munge -d /var/lib/munge -s /usr/sbin/nologin munge | ||||
| @@ -26,25 +26,34 @@ getent group slurm > /dev/null || groupadd -g 1001 slurm | ||||
| id -u slurm &>/dev/null || \ | ||||
|   useradd -m -u 1001 -g slurm -s /bin/bash slurm | ||||
|  | ||||
| # Create job 'submit' user | ||||
| # Create submit user | ||||
| getent group submit > /dev/null || groupadd -g 1002 submit | ||||
| id -u submit &>/dev/null || \ | ||||
|   useradd -m -u 1002 -g submit -s /bin/bash submit | ||||
|   useradd -M -u 1002 -g submit -s /bin/bash -d /vagrant/scratch/submit submit | ||||
| mkdir -p /vagrant/scratch/submit | ||||
|  | ||||
| # Install MUNGE, remove any default key, and stop to another place key later | ||||
| # Update APT cache | ||||
| apt-get update | ||||
| apt-get install -y munge | ||||
| systemctl stop munge | ||||
| rm -f /etc/munge/munge.key | ||||
|  | ||||
| # Install MUNGE, remove any default key, and stop to place another key later | ||||
| if ! dpkg -s munge &>/dev/null; then | ||||
|   apt-get install -y munge | ||||
|   systemctl stop munge | ||||
|   rm -f /etc/munge/munge.key | ||||
| fi | ||||
|  | ||||
| # Install slurm client tools | ||||
| dpkg -s slurm-client &>/dev/null || apt-get install -y slurm-client | ||||
|  | ||||
| # Create directories for Slurm | ||||
| mkdir -p /var/spool/slurm /var/log/slurm /etc/slurm | ||||
| chown slurm:slurm /var/spool/slurm /var/log/slurm /etc/slurm | ||||
|  | ||||
| # Copy slurm.conf | ||||
| # Copy slurm.conf and cgroup.conf | ||||
| cp -u /vagrant/slurm.conf /etc/slurm/slurm.conf | ||||
| chown slurm:slurm /etc/slurm/slurm.conf | ||||
| chmod 644 /etc/slurm/slurm.conf | ||||
| cp -u /vagrant/cgroup.conf /etc/slurm/cgroup.conf | ||||
| chown slurm:slurm /etc/slurm/slurm.conf /etc/slurm/cgroup.conf | ||||
| chmod 644 /etc/slurm/slurm.conf /etc/slurm/cgroup.conf | ||||
|  | ||||
| # node1 = manager | ||||
| if [ "$(hostname)" == "node1" ]; then | ||||
| @@ -53,34 +62,36 @@ if [ "$(hostname)" == "node1" ]; then | ||||
|     sudo -u munge /usr/sbin/mungekey --verbose | ||||
|   fi | ||||
|  | ||||
|   # Set MUNGE key perms | ||||
|   chmod 600 /etc/munge/munge.key | ||||
|  | ||||
|   # Copy to shared directory for other nodes | ||||
|   cp /etc/munge/munge.key /vagrant/munge.key | ||||
|  | ||||
|   # Enable/start/test munge service | ||||
|   systemctl enable munge.service | ||||
|   systemctl start munge.service | ||||
|   munge -n | unmunge | ||||
|   chmod 400 /etc/munge/munge.key | ||||
|   systemctl enable munge | ||||
|   systemctl start munge | ||||
|  | ||||
|   # Install Slurm Workload Manager and doc package for the Slurm config tool | ||||
|   if ! dpkg -s slurm-wlm &>/dev/null; then | ||||
|     apt-get install -y slurm-wlm slurm-wlm-doc | ||||
|  | ||||
|     # Create directories for slurmctld | ||||
|     systemctl stop slurmctld | ||||
|     mkdir -p /var/spool/slurmctld | ||||
|     chown slurm:slurm /var/spool/slurmctld | ||||
|     chmod 755 /var/spool/slurmctld | ||||
|  | ||||
|     # Start Slurm controller | ||||
|     systemctl enable slurmctld | ||||
|     systemctl start slurmctld | ||||
|   fi | ||||
| else | ||||
|   # Initial delay | ||||
|   sleep 5 | ||||
|  | ||||
|   # Wait up to JOIN_TIMEOUT seconds for the munge.key file before giving up | ||||
|   START_TIME="$(date +%s)" | ||||
|   # Wait until the munge.key can be found via Vagrant provider file sharing /vagrant | ||||
|  | ||||
|   # Wait until the munge.key can be found via Vagrant provider file sharing | ||||
|   while [ ! -f /vagrant/munge.key ]; do | ||||
|     CURRENT_TIME="$(date +%s)" | ||||
|     DIFF_TIME="$((CURRENT_TIME - START_TIME))" | ||||
| @@ -100,21 +111,13 @@ else | ||||
|   cp -f /vagrant/munge.key /etc/munge/munge.key | ||||
|   chown munge:munge /etc/munge/munge.key | ||||
|   chmod 400 /etc/munge/munge.key | ||||
|   systemctl enable munge.service | ||||
|   systemctl start munge.service | ||||
|   munge -n | unmunge | ||||
|   systemctl enable munge | ||||
|   systemctl start munge | ||||
|  | ||||
|   # Submit job as 'submit' on node2 | ||||
|   if [ "$(hostname)" == "node2" ]; then | ||||
|     # Install Slurm client tools | ||||
|     apt-get install -y slurm-client | ||||
|  | ||||
|     # Submit a test job as the 'submit' user | ||||
|     sleep 10 | ||||
|     sudo -u submit bash -c 'sbatch -N2 --wrap="srun hostname"' | ||||
|     sudo -u submit squeue | ||||
|   else | ||||
|     # Install SLURM compute node daemon on node3+ | ||||
|   # Install SLURM compute node daemon on node[3-4] | ||||
|   if [[ $(hostname) == node[3-4] ]]; then | ||||
|     mkdir -p /var/spool/slurmd | ||||
|     chown slurm:slurm /var/spool/slurmd | ||||
|     apt-get install -y slurmd | ||||
|     systemctl enable slurmd | ||||
|     systemctl start slurmd | ||||
|   | ||||
							
								
								
									
										2
									
								
								scratch/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								scratch/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | ||||
| * | ||||
| !.gitignore | ||||
		Reference in New Issue
	
	Block a user