From 2e1049531cb7389cc5633b47d8a41e602da295f3 Mon Sep 17 00:00:00 2001 From: Ward Vandewege Date: Thu, 16 Dec 2021 14:12:49 -0500 Subject: [PATCH] 18325: add support for Nvidia GPUs as an optional feature to our compute node image builder script, for AWS. Arvados-DCO-1.1-Signed-off-by: Ward Vandewege --- tools/compute-images/arvados-images-aws.json | 5 +++-- tools/compute-images/build.sh | 16 +++++++++++--- tools/compute-images/scripts/base.sh | 22 ++++++++++++++++++++ 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/tools/compute-images/arvados-images-aws.json b/tools/compute-images/arvados-images-aws.json index b1b4c909d2..0865343dc4 100644 --- a/tools/compute-images/arvados-images-aws.json +++ b/tools/compute-images/arvados-images-aws.json @@ -5,10 +5,11 @@ "aws_access_key": "", "aws_profile": "", "aws_secret_key": "", - "aws_source_ami": "ami-04d70e069399af2e9", + "aws_source_ami": "ami-031283ff8a43b021c", "build_environment": "aws", "public_key_file": "", "mksquashfs_mem": "", + "nvidia_gpu_support": "", "reposuffix": "", "resolver": "", "ssh_user": "admin", @@ -77,6 +78,6 @@ "type": "shell", "execute_command": "sudo -S env {{ .Vars }} /bin/bash '{{ .Path }}'", "script": "scripts/base.sh", - "environment_vars": ["RESOLVER={{user `resolver`}}","REPOSUFFIX={{user `reposuffix`}}","MKSQUASHFS_MEM={{user `mksquashfs_mem`}}"] + "environment_vars": ["RESOLVER={{user `resolver`}}","REPOSUFFIX={{user `reposuffix`}}","MKSQUASHFS_MEM={{user `mksquashfs_mem`}}","NVIDIA_GPU_SUPPORT={{user `nvidia_gpu_support`}}"] }] } diff --git a/tools/compute-images/build.sh b/tools/compute-images/build.sh index 526db4906f..fce8b1918b 100755 --- a/tools/compute-images/build.sh +++ b/tools/compute-images/build.sh @@ -57,8 +57,10 @@ Options: Path to the public key file that a-d-c will use to log into the compute node --mksquashfs-mem (default: 256M) Only relevant when using Singularity. This is the amount of memory mksquashfs is allowed to use. 
- --debug - Output debug information (default: false) + --nvidia-gpu-support (default: false) + Install all the necessary tooling for Nvidia GPU support + --debug (default: false) + Output debug information EOF @@ -81,9 +83,10 @@ SSH_USER= AWS_DEFAULT_REGION=us-east-1 PUBLIC_KEY_FILE= MKSQUASHFS_MEM=256M +NVIDIA_GPU_SUPPORT= PARSEDOPTS=$(getopt --name "$0" --longoptions \ - help,json-file:,arvados-cluster-id:,aws-source-ami:,aws-profile:,aws-secrets-file:,aws-region:,aws-vpc-id:,aws-subnet-id:,gcp-project-id:,gcp-account-file:,gcp-zone:,azure-secrets-file:,azure-resource-group:,azure-location:,azure-sku:,azure-cloud-environment:,ssh_user:,resolver:,reposuffix:,public-key-file:,mksquashfs-mem:,debug \ + help,json-file:,arvados-cluster-id:,aws-source-ami:,aws-profile:,aws-secrets-file:,aws-region:,aws-vpc-id:,aws-subnet-id:,gcp-project-id:,gcp-account-file:,gcp-zone:,azure-secrets-file:,azure-resource-group:,azure-location:,azure-sku:,azure-cloud-environment:,ssh_user:,resolver:,reposuffix:,public-key-file:,mksquashfs-mem:,nvidia-gpu-support,debug \ -- "" "$@") if [ $? -ne 0 ]; then exit 1 @@ -160,6 +163,9 @@ while [ $# -gt 0 ]; do --mksquashfs-mem) MKSQUASHFS_MEM="$2"; shift ;; + --nvidia-gpu-support) + NVIDIA_GPU_SUPPORT=1 + ;; --debug) # If you want to debug a build issue, add the -debug flag to the build # command in question. 
@@ -265,6 +271,10 @@ fi
 if [[ "$MKSQUASHFS_MEM" != "" ]]; then
   EXTRA2+=" -var mksquashfs_mem=$MKSQUASHFS_MEM"
 fi
+if [[ "$NVIDIA_GPU_SUPPORT" != "" ]]; then
+  EXTRA2+=" -var nvidia_gpu_support=$NVIDIA_GPU_SUPPORT"
+fi
+
 
 
 echo
diff --git a/tools/compute-images/scripts/base.sh b/tools/compute-images/scripts/base.sh
index 0ab51223b9..f180f81c49 100644
--- a/tools/compute-images/scripts/base.sh
+++ b/tools/compute-images/scripts/base.sh
@@ -149,3 +149,25 @@ $SUDO chmod 755 /usr/local/bin/ensure-encrypted-partitions.sh
 $SUDO chown root:root /usr/local/bin/ensure-encrypted-partitions.sh
 $SUDO mv /tmp/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg /etc/cloud/cloud.cfg.d/07_compute_arvados_dispatch_cloud.cfg
 $SUDO chown root:root /etc/cloud/cloud.cfg.d/07_compute_arvados_dispatch_cloud.cfg
+
+if [ "$NVIDIA_GPU_SUPPORT" == "1" ]; then
+  DIST=$(. /etc/os-release; echo "$ID$VERSION_ID")  # distro id + version; selects the NVIDIA repo paths below
+  # We need a kernel and matching headers (privileged steps use $SUDO, like the rest of this script)
+  $SUDO apt-get -y install linux-image-cloud-amd64 linux-headers-cloud-amd64
+
+  # Install CUDA from NVIDIA's apt repository for $DIST; the "contrib" component is enabled as well
+  $SUDO apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/$DIST/x86_64/7fa2af80.pub"
+  $SUDO apt-get -y install software-properties-common
+  $SUDO add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/$DIST/x86_64/ /"
+  $SUDO add-apt-repository contrib
+  $SUDO apt-get update
+  $SUDO apt-get -y install cuda
+
+  # Install libnvidia-container, the tooling for Docker/Singularity (signing key first, then the apt source list)
+  curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | \
+    $SUDO apt-key add -
+  curl -s -L "https://nvidia.github.io/libnvidia-container/$DIST/libnvidia-container.list" | \
+    $SUDO tee /etc/apt/sources.list.d/libnvidia-container.list
+  $SUDO apt-get update
+  $SUDO apt-get -y install libnvidia-container1 libnvidia-container-tools nvidia-container-toolkit
+fi
-- 
2.30.2