From f5f601af13ae63eb48c1d9ad900fc750a81f574a Mon Sep 17 00:00:00 2001
From: Kris Lamoureux
Date: Fri, 9 Aug 2024 23:03:55 -0400
Subject: [PATCH] Create Vagrant environment with shared MUNGE key

---
Note: the index hashes below are carried over unchanged and do not match the
revised file contents.

 .gitignore   |  4 +++
 Vagrantfile  | 64 ++++++++++++++++++++++++++++++++++++++++++
 provision.sh | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Vagrantfile
 create mode 100755 provision.sh

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8927f51
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+munge.key
+nodes.rb
+.settings.yml
+.vagrant
diff --git a/Vagrantfile b/Vagrantfile
new file mode 100644
index 0000000..d930d8b
--- /dev/null
+++ b/Vagrantfile
@@ -0,0 +1,64 @@
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+
+# Load override settings
+require 'yaml'
+settings_path = '.settings.yml'
+settings = {}
+
+if File.exist?(settings_path)
+  # An empty settings file parses to nil/false; keep the empty-hash default
+  settings = YAML.load_file(settings_path) || {}
+end
+
+# Default Vagrant settings
+VAGRANT_BOX = settings['VAGRANT_BOX'] || 'debian/bookworm64'
+VAGRANT_CPU = settings['VAGRANT_CPU'] || 2
+VAGRANT_MEM = settings['VAGRANT_MEM'] || 2048
+SSH_FORWARD = settings['SSH_FORWARD'] || false
+
+# Default Slurm settings
+SLURM_NODES = settings['SLURM_NODES'] || 4
+JOIN_TIMEOUT = settings['JOIN_TIMEOUT'] || 60
+
+# Node settings overrides
+if File.exist?('nodes.rb')
+  require_relative 'nodes.rb'
+else
+  # Using all defaults
+  NODES = {}
+end
+
+HOSTS = Array(1..SLURM_NODES)
+Vagrant.configure(2) do |vm_config|
+  HOSTS.each do |count|
+    vm_config.vm.define "node#{count}" do |config|
+      config.vm.hostname = "node#{count}"
+      config.vm.box = NODES.dig("node#{count}", 'BOX') || VAGRANT_BOX
+
+      # Only fall back when unset, so a per-node false overrides a global true
+      node_ssh = NODES.dig("node#{count}", 'SSH')
+      config.ssh.forward_agent = node_ssh.nil? ? SSH_FORWARD : node_ssh
+
+      # Libvirt
+      config.vm.provider :libvirt do |virt|
+        virt.memory = NODES.dig("node#{count}", 'MEM') || VAGRANT_MEM
+        virt.cpus = NODES.dig("node#{count}", 'CPU') || VAGRANT_CPU
+      end
+
+      # VirtualBox
+      config.vm.provider :virtualbox do |vbox|
+        vbox.memory = NODES.dig("node#{count}", 'MEM') || VAGRANT_MEM
+        vbox.cpus = NODES.dig("node#{count}", 'CPU') || VAGRANT_CPU
+      end
+
+      # Install and Setup Slurm
+      # JOIN_TIMEOUT is passed through the environment for provision.sh to read
+      config.vm.provision "shell",
+        env: { 'JOIN_TIMEOUT' => JOIN_TIMEOUT.to_s },
+        inline: <<-SHELL
+        /bin/bash /vagrant/provision.sh
+      SHELL
+    end
+  end
+end
diff --git a/provision.sh b/provision.sh
new file mode 100755
index 0000000..e00627a
--- /dev/null
+++ b/provision.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+#######################################
+### Install and setup Slurm cluster ###
+#######################################
+
+# Print commands and exit on error
+set -xe
+
+# Prevents interactive prompts during package installation
+export DEBIAN_FRONTEND=noninteractive
+
+# Seconds a non-manager node waits for the shared MUNGE key; the default is
+# used when the variable is not supplied through the environment
+JOIN_TIMEOUT="${JOIN_TIMEOUT:-60}"
+
+# Keep system clocks in sync
+apt-get update
+apt-get install -y chrony
+systemctl start chrony
+systemctl enable chrony
+
+# Install MUNGE
+apt-get update
+apt-get install -y munge
+
+# Create a dedicated non-privileged user account for MUNGE
+getent group munge > /dev/null || groupadd -r -g 900 munge
+id -u munge &>/dev/null || \
+    useradd -r -u 900 -g munge -d /var/lib/munge -s /usr/sbin/nologin munge
+
+# node1 = manager
+if [ "$(hostname)" == "node1" ]; then
+    # Create common MUNGE key on the manager node
+    if [ ! -f /etc/munge/munge.key ]; then
+        sudo -u munge /usr/sbin/mungekey --verbose
+    fi
+
+    # Set MUNGE key perms
+    chmod 600 /etc/munge/munge.key
+
+    # Copy to shared directory for other nodes
+    cp /etc/munge/munge.key /vagrant/munge.key
+
+    # Enable/start/test munge service
+    systemctl enable munge.service
+    systemctl start munge.service
+    munge -n | unmunge
+else
+    # Initial delay
+    sleep 5
+
+    # Wait up to JOIN_TIMEOUT seconds for the munge.key file before giving up
+    START_TIME="$(date +%s)"
+    # Wait until the munge.key can be found via Vagrant provider file sharing /vagrant
+    while [ ! -f /vagrant/munge.key ]; do
+        CURRENT_TIME="$(date +%s)"
+        DIFF_TIME="$((CURRENT_TIME - START_TIME))"
+
+        # Timeout
+        if [ "$DIFF_TIME" -ge "$JOIN_TIMEOUT" ]; then
+            echo "[ERROR]: $(hostname) waited $DIFF_TIME/$JOIN_TIMEOUT seconds"
+            exit 1
+        fi
+
+        # Waiting
+        echo "Waiting ($DIFF_TIME/$JOIN_TIMEOUT seconds) for /vagrant/munge.key file"
+        sleep 10
+    done
+
+    # Install the shared key with the owner and mode set explicitly
+    install -o munge -g munge -m 600 /vagrant/munge.key /etc/munge/munge.key
+
+    # Enable/restart/test munge service. Restart rather than start: if the
+    # daemon is already running, start would leave it using its previous key
+    systemctl enable munge.service
+    systemctl restart munge.service
+    munge -n | unmunge
+fi