From 2638fb1852ec48f5b65364768ab9978dda818efa Mon Sep 17 00:00:00 2001 From: Brett Smith Date: Tue, 19 Nov 2024 09:25:18 -0500 Subject: [PATCH] 22317: Replace compute image base.sh with an Ansible playbook The hope is that this will be more maintainable and extendable, and allow us to start building towards a general Ansible installer. This Ansible playbook was written and tested against Ansible 8.7.0, because Ansible 8.x seems to be the version that supports most of the Pythons we care about. This might need refinement in the future. The Ansible playbook supports all the same deployment configurations as base.sh, but there are some differences in the way it goes about things: * It no longer configures GRUB for cgroups v1 because Arvados supports cgroups v2 now. * It installs software to /opt instead of /var/lib/arvados, because software doesn't belong in /var and none of this is Arvados software specifically looking for that path. * It only installs either Docker or Singularity as required, not both. * It builds Singularity in a temporary directory that it cleans up afterwards to save space in the AMI. * It sets up the encrypted partition script as a systemd service rather than a cloud-init script so we can be more explicit about the ordering requirements. The service orders itself before SSH so it has a chance to finish before the Crunch dispatcher is able to SSH in. If it fails, that will be reflected in systemctl's system state, and therefore our default BootProbeCommand as well. 
Arvados-DCO-1.1-Signed-off-by: Brett Smith --- tools/compute-images/1078ECD7.asc | 30 -- .../ansible/build-compute-image.yml | 70 +++++ .../roles/arvados_apt/defaults/main.yml | 7 + .../ansible/roles/arvados_apt/tasks/main.yml | 25 ++ .../arvados_apt/templates/arvados.pref.j2 | 8 + .../roles/compute_docker/defaults/main.yml | 16 ++ .../compute_docker/files/arvados-docker.pref | 14 + .../roles/compute_docker/tasks/main.yml | 59 ++++ .../compute_encrypt_tmp/defaults/main.yml | 7 + ...rvados-ensure-encrypted-partitions.service | 19 ++ .../files/ebs-autoscale.conf | 7 + .../files/ensure-encrypted-partitions.sh} | 54 +--- .../compute_encrypt_tmp/tasks/aws_ebs.yml | 26 ++ .../roles/compute_encrypt_tmp/tasks/main.yml | 46 ++++ .../roles/compute_nvidia/defaults/main.yml | 6 + .../compute_nvidia/files/arvados-nvidia.pref | 18 ++ .../compute_nvidia/files/detect-gpu.conf} | 2 +- .../roles/compute_nvidia/files/detect-gpu.sh} | 0 .../roles/compute_nvidia/tasks/main.yml | 83 ++++++ .../compute_singularity/defaults/main.yml | 8 + .../roles/compute_singularity/tasks/main.yml | 79 ++++++ .../roles/compute_user/defaults/main.yml} | 5 +- .../ansible/roles/compute_user/tasks/main.yml | 33 +++ .../roles/distro_apt/defaults/main.yml | 8 + .../ansible/roles/distro_apt/tasks/debian.yml | 11 + .../ansible/roles/distro_apt/tasks/main.yml | 13 + tools/compute-images/arvados-images-aws.json | 65 +---- .../compute-images/arvados-images-azure.json | 58 +--- tools/compute-images/build.sh | 92 +++---- tools/compute-images/scripts/base.sh | 258 ------------------ .../etc-apt-preferences.d-arvados.pref | 32 --- ...fg.d-07_compute_arvados_dispatch_cloud.cfg | 9 - ...-encrypted-partitions-aws-ebs-autoscale.sh | 72 ----- 33 files changed, 625 insertions(+), 615 deletions(-) delete mode 100644 tools/compute-images/1078ECD7.asc create mode 100644 tools/compute-images/ansible/build-compute-image.yml create mode 100644 tools/compute-images/ansible/roles/arvados_apt/defaults/main.yml create mode 100644 
tools/compute-images/ansible/roles/arvados_apt/tasks/main.yml create mode 100644 tools/compute-images/ansible/roles/arvados_apt/templates/arvados.pref.j2 create mode 100644 tools/compute-images/ansible/roles/compute_docker/defaults/main.yml create mode 100644 tools/compute-images/ansible/roles/compute_docker/files/arvados-docker.pref create mode 100644 tools/compute-images/ansible/roles/compute_docker/tasks/main.yml create mode 100644 tools/compute-images/ansible/roles/compute_encrypt_tmp/defaults/main.yml create mode 100644 tools/compute-images/ansible/roles/compute_encrypt_tmp/files/arvados-ensure-encrypted-partitions.service create mode 100644 tools/compute-images/ansible/roles/compute_encrypt_tmp/files/ebs-autoscale.conf rename tools/compute-images/{scripts/usr-local-bin-ensure-encrypted-partitions.sh => ansible/roles/compute_encrypt_tmp/files/ensure-encrypted-partitions.sh} (74%) create mode 100644 tools/compute-images/ansible/roles/compute_encrypt_tmp/tasks/aws_ebs.yml create mode 100644 tools/compute-images/ansible/roles/compute_encrypt_tmp/tasks/main.yml create mode 100644 tools/compute-images/ansible/roles/compute_nvidia/defaults/main.yml create mode 100644 tools/compute-images/ansible/roles/compute_nvidia/files/arvados-nvidia.pref rename tools/compute-images/{scripts/etc-systemd-system-systemd-modules-load.service.d-detect-gpu.conf => ansible/roles/compute_nvidia/files/detect-gpu.conf} (68%) rename tools/compute-images/{scripts/usr-local-bin-detect-gpu.sh => ansible/roles/compute_nvidia/files/detect-gpu.sh} (100%) create mode 100644 tools/compute-images/ansible/roles/compute_nvidia/tasks/main.yml create mode 100644 tools/compute-images/ansible/roles/compute_singularity/defaults/main.yml create mode 100644 tools/compute-images/ansible/roles/compute_singularity/tasks/main.yml rename tools/compute-images/{.licenseignore => ansible/roles/compute_user/defaults/main.yml} (53%) create mode 100644 tools/compute-images/ansible/roles/compute_user/tasks/main.yml 
create mode 100644 tools/compute-images/ansible/roles/distro_apt/defaults/main.yml create mode 100644 tools/compute-images/ansible/roles/distro_apt/tasks/debian.yml create mode 100644 tools/compute-images/ansible/roles/distro_apt/tasks/main.yml delete mode 100644 tools/compute-images/scripts/base.sh delete mode 100644 tools/compute-images/scripts/etc-apt-preferences.d-arvados.pref delete mode 100644 tools/compute-images/scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg delete mode 100644 tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh diff --git a/tools/compute-images/1078ECD7.asc b/tools/compute-images/1078ECD7.asc deleted file mode 100644 index edc62f48ff..0000000000 --- a/tools/compute-images/1078ECD7.asc +++ /dev/null @@ -1,30 +0,0 @@ ------BEGIN PGP PUBLIC KEY BLOCK----- - -mQENBEzhgeoBCAChhoK1dqpWzNyDWqRGEvdFdkJaA9D2HRwKPfBfjAoePX6ZyrpA -ItlUsvt/8s/DRiTiPEFQR4S7VqocmU6whJc3gDEGyOM6b1NF873lIfSVwUoE42QE -a76dO8woOYgLUyxu2mKG+bJgGMumjBJt6ZOndYVjTYB/7sEeVxwmMVulfZe0s6zg -ut0+SoTYg2R36qIqeIcWllYt97sEYnyy1qXMis4/3IZnuWkS/frsPR3aeUI4W+o2 -NDN1kj49+LMe7Fb5b7jZY08rZbAWXi1rU1hQx4jC9RvYqlT4HNld4Bn7os1IvOOA -wNiR0oiVdiuDbBxcMvRPktxMrFVjowusRLq/ABEBAAG0PUN1cm92ZXJzZSwgSW5j -IEF1dG9tYXRpYyBTaWduaW5nIEtleSA8c3lzYWRtaW5AY3Vyb3ZlcnNlLmNvbT6J -ATgEEwECACIFAlNgYIECGwMGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheAAAoJEFcW -WREQeOzXPkEH/jQJDIYI1dxWcYiA+hczmpaZvN2/pc/kwIW/6a03+6zqmSNkebOE -TgoDILacSYc17hy20R1/rWyUstOMKcEgFDBlSehhHyl0f7q/w7d8Ais6MabzsPfx -IceJpsjUg87+BR7qWhgQ0sxmtIF2TKuTFLs+nkGsgSsiBOEF4NvHxuj3HD4y8F27 -HNqrkqwjLS8xJwwH5Gp2uMEVr1AXIH3iSRjJ8X124s8iEP97Q/3IazoYRf9/MCSm -QEx8KzxwDX6t4bW6O4D01K+e9gdkTY70dcMgJoqm5IsX7yxjEubiOunphtlJnZ9d -Oi1yBN5UM3pWKAdcfRj4rcfV9Simvpx9av+5AQ0ETOGB6gEIAMAA0HVMG0BbdnU7 -wWgl5eFdT0AUSrXK/WdcKqVEGGv+c68NETSHWZOJX7O46Eao4gY4cTYprVMBzxpY -/BtQSYLpE0HLvBc1fcFd61Yz4H/9rGSNY0GcIQEbOjbJY5mr8qFsQ1K/mAf3aUL3 -b6ni4sHVicRiRr0Gl4Ihorlskpfu1SHs/C5tvTSVNF9p4vtl5892y1yILQeVpcBs 
-NCR7MUpdS49xCpvnAWsDZX+ij6LTR3lzCm/ZLCg4gNuZkjgU9oqVfGkqysW7WZ8S -OLvzAwUw7i1EIFX8q6QdudGoezxz8m8OgZM1v8AFpYEKlhEPf1W0MSfaRDwrj866 -8nCLruEAEQEAAYkBHwQYAQIACQUCTOGB6gIbDAAKCRBXFlkREHjs199EB/4+p0G1 -3PHxt6rLWSCGXobDOu4ZOA/qnv0D/JhOLroFds5TzQv6vnS8eAkhCTjHVA+b58cm -kXpI0oYcD4ZP+KK1CHKq2rGfwou7HfAF+icnNqYkeBOkjjbCgkvBlcCInuAuU8JX -DZMkfFk52+eBKwTjS/J/fQp0vDru8bHLp98WgdRHWfJQ3mc3gz4A5sR6zhrGPW6/ -ssnROS4dC2Ohp35GpgN1KjD3EmEw5RoSBYlyrARCaMsivgIKMxGUEyFZWhuJt3N1 -2MTddRwz28hbmYCi+MzHYDbRv+cSyUDmvXaWhfkNKBepClBA1rTWBcldit5vvlqr -yPet6wIKrtLGhAqZ -=CLkG ------END PGP PUBLIC KEY BLOCK----- diff --git a/tools/compute-images/ansible/build-compute-image.yml b/tools/compute-images/ansible/build-compute-image.yml new file mode 100644 index 0000000000..fc3748ac50 --- /dev/null +++ b/tools/compute-images/ansible/build-compute-image.yml @@ -0,0 +1,70 @@ +#!/usr/bin/env ansible-playbook +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +- name: Build compute node + # `default` is the name that the Packer Ansible plugin assigns to the + # instance used to create the image. 
+ hosts: default + become: true + tasks: + - name: Bootstrap packages required for Ansible + ansible.builtin.raw: "apt-get -o DPkg::Lock::Timeout=300 -qy {{ item }}" + loop: + - update + - install gnupg python3-apt python3-debian xz-utils + - include_role: + name: distro_apt + - include_role: + name: arvados_apt + - name: Upgrade packages + ansible.builtin.apt: + update_cache: true + upgrade: true + - name: Remove unwanted packages + ansible.builtin.apt: + state: absent + autoremove: true + purge: true + name: + - unattended-upgrades + + - include_role: + name: compute_nvidia + when: "arvados_compute_nvidia|default(false)|bool" + - include_role: + name: "compute_{{ arvados_container_engine|default('docker') }}" + - include_role: + name: compute_encrypt_tmp + - include_role: + name: compute_user + + - name: Install Arvados FUSE driver + ansible.builtin.apt: + name: python3-arvados-fuse + - name: Configure FUSE + ansible.builtin.lineinfile: + path: /etc/fuse.conf + regexp: "^#* *user_allow_other *$" + line: user_allow_other + - name: Configure locale + ansible.builtin.lineinfile: + path: /etc/locale.gen + regexp: "^#* *en_US.UTF-8 +UTF-8 *$" + line: en_US.UTF-8 UTF-8 + notify: locale-gen + - name: Configure DNS + ansible.builtin.lineinfile: + path: /etc/dhcp/dhclient.conf + regexp: "^#* *prepend +domain-name-servers " + line: "prepend domain-name-servers {{ dns_resolver }};" + when: dns_resolver is defined + + handlers: + - name: apt update + ansible.builtin.debug: + msg: Skipping apt update handler before an apt upgrade + verbosity: 1 + - name: locale-gen + ansible.builtin.command: locale-gen diff --git a/tools/compute-images/ansible/roles/arvados_apt/defaults/main.yml b/tools/compute-images/ansible/roles/arvados_apt/defaults/main.yml new file mode 100644 index 0000000000..f1541369a6 --- /dev/null +++ b/tools/compute-images/ansible/roles/arvados_apt/defaults/main.yml @@ -0,0 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +arvados_apt_url: "https://apt.arvados.org" +arvados_apt_suites: "{{ ansible_distribution_release }}" +arvados_pin_version: "3.0.0" diff --git a/tools/compute-images/ansible/roles/arvados_apt/tasks/main.yml b/tools/compute-images/ansible/roles/arvados_apt/tasks/main.yml new file mode 100644 index 0000000000..a6cef47635 --- /dev/null +++ b/tools/compute-images/ansible/roles/arvados_apt/tasks/main.yml @@ -0,0 +1,25 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +- name: Install Arvados package pins + when: "arvados_pin_version != ''" + ansible.builtin.template: + src: arvados.pref.j2 + dest: /etc/apt/preferences.d/arvados.pref + owner: root + group: root + mode: 0644 + +- name: Install Arvados apt repository + ansible.builtin.deb822_repository: + name: arvados + types: deb + uris: "{{ arvados_apt_url }}/{{ ansible_distribution_release }}" + # build.sh takes a `--reposuffix` option. If arvados_apt_suites looks like + # a suffix (i.e., it starts with `-`), prepend the current distro codename. + suites: "{{ ansible_distribution_release if arvados_apt_suites.startswith('-') else '' }}{{ arvados_apt_suites }}" + components: main + signed_by: "{{ arvados_apt_url }}/pubkey.gpg" + notify: + - apt update diff --git a/tools/compute-images/ansible/roles/arvados_apt/templates/arvados.pref.j2 b/tools/compute-images/ansible/roles/arvados_apt/templates/arvados.pref.j2 new file mode 100644 index 0000000000..d6d99ef245 --- /dev/null +++ b/tools/compute-images/ansible/roles/arvados_apt/templates/arvados.pref.j2 @@ -0,0 +1,8 @@ +### This file is managed by Ansible ### +# Copyright (C) The Arvados Authors. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +Package: arvados-* crunch-* keep-* python3-arvados-* keepproxy keepstore libpam-arvados-go python3-crunchstat-summary +Pin: version {{ arvados_pin_version }}-* +Pin-Priority: 995 diff --git a/tools/compute-images/ansible/roles/compute_docker/defaults/main.yml b/tools/compute-images/ansible/roles/compute_docker/defaults/main.yml new file mode 100644 index 0000000000..5184f05594 --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_docker/defaults/main.yml @@ -0,0 +1,16 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +arvados_compute_pin_packages: true +docker_apt_url: "https://download.docker.com/linux/{{ ansible_distribution|lower }}" +docker_daemon: + data-root: /tmp/docker-data + default-ulimits: + nofile: + Name: nofile + Soft: 10000 + Hard: 10000 +docker_cleaner: + Quota: 10G + RemoveStoppedContainers: always diff --git a/tools/compute-images/ansible/roles/compute_docker/files/arvados-docker.pref b/tools/compute-images/ansible/roles/compute_docker/files/arvados-docker.pref new file mode 100644 index 0000000000..2299ef0444 --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_docker/files/arvados-docker.pref @@ -0,0 +1,14 @@ +### This file is managed by Ansible ### +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Pin Docker dependencies to tested and known-good versions + +Package: src:docker-ce +Pin: version 5:27.* +Pin-Priority: 995 + +Package: containerd.io +Pin: version 1.7.* +Pin-Priority: 995 diff --git a/tools/compute-images/ansible/roles/compute_docker/tasks/main.yml b/tools/compute-images/ansible/roles/compute_docker/tasks/main.yml new file mode 100644 index 0000000000..c8d7d2a011 --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_docker/tasks/main.yml @@ -0,0 +1,59 @@ +# Copyright (C) The Arvados Authors. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +- name: Install Docker package pins + ansible.builtin.copy: + src: arvados-docker.pref + dest: /etc/apt/preferences.d/arvados-docker.pref + owner: root + group: root + mode: 0644 + when: "arvados_compute_pin_packages|bool" + +- name: Install Docker apt repository + ansible.builtin.deb822_repository: + name: docker + types: deb + uris: "{{ docker_apt_url }}" + suites: "{{ ansible_distribution_release }}" + components: stable + signed_by: "{{ docker_apt_url }}/gpg" + register: docker_apt_task + +- name: Install Docker + ansible.builtin.apt: + update_cache: "{{ docker_apt_task.changed }}" + name: docker-ce + install_recommends: false + +- name: Configure Docker daemon + ansible.builtin.copy: + content: "{{ docker_daemon|combine({'dns': [dns_resolver]} if dns_resolver is defined else {})|to_json }}" + dest: /etc/docker/daemon.json + owner: root + group: docker + mode: 0640 + +- name: Install Docker cleaner + ansible.builtin.apt: + name: arvados-docker-cleaner + +- name: Create Docker cleaner configuration directories + ansible.builtin.file: + name: "{{ item }}" + state: directory + owner: root + group: root + mode: 0755 + loop: + - /etc/arvados + - /etc/arvados/docker-cleaner + +- name: Configure Docker cleaner + ansible.builtin.copy: + content: "{{ docker_cleaner|to_json }}" + dest: /etc/arvados/docker-cleaner/docker-cleaner.json + owner: root + group: root + mode: 0644 diff --git a/tools/compute-images/ansible/roles/compute_encrypt_tmp/defaults/main.yml b/tools/compute-images/ansible/roles/compute_encrypt_tmp/defaults/main.yml new file mode 100644 index 0000000000..db9d4f847b --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_encrypt_tmp/defaults/main.yml @@ -0,0 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +arvados_compute_encrypted_tmp: "{{ 'aws_ebs' if ansible_system_vendor == 'Amazon EC2' else '' }}" +aws_ebs_autoscale_url: "https://github.com/arvados/amazon-ebs-autoscale.git" +aws_ebs_autoscale_version: "ee323f0751c2b6f733692e805b51b9bf3c251bac" diff --git a/tools/compute-images/ansible/roles/compute_encrypt_tmp/files/arvados-ensure-encrypted-partitions.service b/tools/compute-images/ansible/roles/compute_encrypt_tmp/files/arvados-ensure-encrypted-partitions.service new file mode 100644 index 0000000000..91bb4bcdbd --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_encrypt_tmp/files/arvados-ensure-encrypted-partitions.service @@ -0,0 +1,19 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +[Unit] +Description=Ensure Arvados compute work directories are encrypted +Before=docker.service +Before=docker.socket +Before=ssh.service + +[Install] +WantedBy=docker.service +WantedBy=docker.socket +WantedBy=ssh.service + +[Service] +Type=oneshot +RemainAfterExit=true +ExecStart=/usr/local/sbin/ensure-encrypted-partitions.sh diff --git a/tools/compute-images/ansible/roles/compute_encrypt_tmp/files/ebs-autoscale.conf b/tools/compute-images/ansible/roles/compute_encrypt_tmp/files/ebs-autoscale.conf new file mode 100644 index 0000000000..1d99d56de6 --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_encrypt_tmp/files/ebs-autoscale.conf @@ -0,0 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +[Service] +ExecStart= +ExecStart=/usr/bin/bash /opt/amazon-ebs-autoscale/install.sh --imdsv2 -f lvm.ext4 -m /tmp diff --git a/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions.sh b/tools/compute-images/ansible/roles/compute_encrypt_tmp/files/ensure-encrypted-partitions.sh similarity index 74% rename from tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions.sh rename to tools/compute-images/ansible/roles/compute_encrypt_tmp/files/ensure-encrypted-partitions.sh index 726ff0cdcd..999de594fa 100644 --- a/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions.sh +++ b/tools/compute-images/ansible/roles/compute_encrypt_tmp/files/ensure-encrypted-partitions.sh @@ -115,57 +115,5 @@ echo YES | cryptsetup luksFormat "$LVPATH" "$KEYPATH" cryptsetup --key-file "$KEYPATH" luksOpen "$LVPATH" "$(basename "$CRYPTPATH")" shred -u "$KEYPATH" mkfs.xfs -f "$CRYPTPATH" - -# First make sure docker is not using /tmp, then unmount everything under it. -if [ -d /etc/sv/docker.io ] -then - # TODO: Actually detect Docker state with runit - DOCKER_ACTIVE=true - sv stop docker.io || service stop docker.io || true -else - if systemctl --quiet is-active docker.service docker.socket; then - systemctl stop docker.service docker.socket || true - DOCKER_ACTIVE=true - else - DOCKER_ACTIVE=false - fi -fi - -ensure_umount "$MOUNTPATH/docker/aufs" - -MOUNTOPTIONS="async" -mount -o ${MOUNTOPTIONS} "$CRYPTPATH" "$MOUNTPATH" +mount -o async "$CRYPTPATH" "$MOUNTPATH" chmod a+w,+t "$MOUNTPATH" - -# Make sure docker uses the big partition -cat <<EOF > /etc/docker/daemon.json -{ - "data-root": "$MOUNTPATH/docker-data" -} -EOF - -if ! 
$DOCKER_ACTIVE; then - # Nothing else to do - exit 0 -fi - -# restart docker -if [ -d /etc/sv/docker.io ] -then - ## runit - sv up docker.io -else - systemctl start docker.service docker.socket || true -fi - -end=$((SECONDS+60)) - -while [ $SECONDS -lt $end ]; do - if /usr/bin/docker ps -q >/dev/null; then - exit 0 - fi - sleep 1 -done - -# Docker didn't start within a minute, abort -exit 1 diff --git a/tools/compute-images/ansible/roles/compute_encrypt_tmp/tasks/aws_ebs.yml b/tools/compute-images/ansible/roles/compute_encrypt_tmp/tasks/aws_ebs.yml new file mode 100644 index 0000000000..bbc8bf46be --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_encrypt_tmp/tasks/aws_ebs.yml @@ -0,0 +1,26 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +- name: Install EBS autoscaler dependencies + ansible.builtin.apt: + name: + - awscli + - bash + - git + - jq + - unzip + +- name: Check out EBS autoscaler from Git + ansible.builtin.git: + repo: "{{ aws_ebs_autoscale_url }}" + dest: /opt/amazon-ebs-autoscale + version: "{{ aws_ebs_autoscale_version }}" + +- name: Override encrypted partition service with EBS autoscaler + ansible.builtin.copy: + src: ebs-autoscale.conf + dest: /etc/systemd/system/arvados-ensure-encrypted-partitions.service.d/ebs-autoscale.conf + owner: root + group: root + mode: 0644 diff --git a/tools/compute-images/ansible/roles/compute_encrypt_tmp/tasks/main.yml b/tools/compute-images/ansible/roles/compute_encrypt_tmp/tasks/main.yml new file mode 100644 index 0000000000..92f87e8597 --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_encrypt_tmp/tasks/main.yml @@ -0,0 +1,46 @@ +# Copyright (C) The Arvados Authors. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +- name: Install encrypted partition dependencies + ansible.builtin.apt: + name: + - btrfs-progs + - cryptsetup + - curl + - lvm2 + - xfsprogs + +- name: Install encrypted partition script + ansible.builtin.copy: + src: ensure-encrypted-partitions.sh + dest: /usr/local/sbin/ensure-encrypted-partitions.sh + owner: root + group: root + mode: 0755 + +- name: Define encrypted partition service + ansible.builtin.copy: + src: arvados-ensure-encrypted-partitions.service + dest: /etc/systemd/system/arvados-ensure-encrypted-partitions.service + owner: root + group: root + mode: 0644 + +- name: Prepare encrypted partition service override directory + ansible.builtin.file: + path: /etc/systemd/system/arvados-ensure-encrypted-partitions.service.d + state: directory + owner: root + group: root + mode: 0755 + +- name: Set up AWS EBS-backed encrypted partitions + ansible.builtin.include_tasks: + file: aws_ebs.yml + when: "arvados_compute_encrypted_tmp|lower == 'aws_ebs'" + +- name: Enable encrypted partition service + ansible.builtin.systemd_service: + name: arvados-ensure-encrypted-partitions.service + enabled: true diff --git a/tools/compute-images/ansible/roles/compute_nvidia/defaults/main.yml b/tools/compute-images/ansible/roles/compute_nvidia/defaults/main.yml new file mode 100644 index 0000000000..f13ace404f --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_nvidia/defaults/main.yml @@ -0,0 +1,6 @@ +# Copyright (C) The Arvados Authors. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +arvados_compute_pin_packages: true +nvidia_container_apt_url: "https://nvidia.github.io/libnvidia-container" diff --git a/tools/compute-images/ansible/roles/compute_nvidia/files/arvados-nvidia.pref b/tools/compute-images/ansible/roles/compute_nvidia/files/arvados-nvidia.pref new file mode 100644 index 0000000000..fcabbdbebf --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_nvidia/files/arvados-nvidia.pref @@ -0,0 +1,18 @@ +### This file is managed by Ansible ### +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Pin NVIDIA dependencies to tested and known-good versions + +Package: cuda +Pin: version 12.5.* +Pin-Priority: 995 + +Package: src:libnvidia-container src:nvidia-container-toolkit +Pin: version 1.16.* +Pin-Priority: 995 + +Package: cuda-drivers src:nvidia-graphics-drivers +Pin: version 560.* +Pin-Priority: 995 diff --git a/tools/compute-images/scripts/etc-systemd-system-systemd-modules-load.service.d-detect-gpu.conf b/tools/compute-images/ansible/roles/compute_nvidia/files/detect-gpu.conf similarity index 68% rename from tools/compute-images/scripts/etc-systemd-system-systemd-modules-load.service.d-detect-gpu.conf rename to tools/compute-images/ansible/roles/compute_nvidia/files/detect-gpu.conf index a5c9823ec9..937a576593 100644 --- a/tools/compute-images/scripts/etc-systemd-system-systemd-modules-load.service.d-detect-gpu.conf +++ b/tools/compute-images/ansible/roles/compute_nvidia/files/detect-gpu.conf @@ -3,4 +3,4 @@ # SPDX-License-Identifier: Apache-2.0 [Service] -ExecStartPre=/usr/local/bin/detect-gpu.sh enable +ExecStartPre=/usr/local/sbin/detect-gpu.sh enable diff --git a/tools/compute-images/scripts/usr-local-bin-detect-gpu.sh b/tools/compute-images/ansible/roles/compute_nvidia/files/detect-gpu.sh similarity index 100% rename from tools/compute-images/scripts/usr-local-bin-detect-gpu.sh rename to 
tools/compute-images/ansible/roles/compute_nvidia/files/detect-gpu.sh diff --git a/tools/compute-images/ansible/roles/compute_nvidia/tasks/main.yml b/tools/compute-images/ansible/roles/compute_nvidia/tasks/main.yml new file mode 100644 index 0000000000..09ef6ed218 --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_nvidia/tasks/main.yml @@ -0,0 +1,83 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +- name: Install NVIDIA package pins + ansible.builtin.copy: + src: arvados-nvidia.pref + dest: /etc/apt/preferences.d/arvados-nvidia.pref + owner: root + group: root + mode: 0644 + when: "arvados_compute_pin_packages|bool" + +- name: Install NVIDIA CUDA apt repository + ansible.builtin.apt: + deb: "https://developer.download.nvidia.com/compute/cuda/repos/{{ ansible_distribution|lower }}{{ ansible_distribution_major_version }}{{ ansible_distribution_minor_version if ansible_distribution == 'Ubuntu' else '' }}/{{ ansible_architecture }}/cuda-keyring_1.1-1_all.deb" + register: cuda_apt_task + +- name: Install NVIDIA container toolkit apt repository + ansible.builtin.deb822_repository: + name: nvidia-container-toolkit + types: deb + uris: "{{ nvidia_container_apt_url }}/stable/deb/$(ARCH)" + suites: "/" + signed_by: "{{ nvidia_container_apt_url }}/gpgkey" + register: nvidia_apt_task + +- name: Install NVIDIA packages + ansible.builtin.apt: + update_cache: "{{ cuda_apt_task.changed or nvidia_apt_task.changed }}" + name: + - cuda + - libnvidia-container1 + - libnvidia-container-tools + - nvidia-container-toolkit + +- name: Copy nvidia.conf modules to nvidia.avail + ansible.builtin.copy: + src: /etc/modules-load.d/nvidia.conf + dest: /etc/modules-load.d/nvidia.avail + remote_src: true + +- name: Remove nvidia.conf modules from autoloading + ansible.builtin.file: + dest: /etc/modules-load.d/nvidia.conf + state: absent + +- name: Install dynamic module loading script + ansible.builtin.copy: + src: detect-gpu.sh 
+ dest: /usr/local/sbin/detect-gpu.sh + owner: root + group: root + mode: 0755 + +- name: Prepare systemd-modules-load override directory + ansible.builtin.file: + path: /etc/systemd/system/systemd-modules-load.service.d + state: directory + owner: root + group: root + mode: 0755 + +- name: Install dynamic module load hook + ansible.builtin.copy: + src: detect-gpu.conf + dest: /etc/systemd/system/systemd-modules-load.service.d/arvados-detect-gpu.conf + owner: root + group: root + mode: 0644 + +# crunch-run has its own CUDA initialization code. +# We prefer to use that over NVIDIA's. +- name: Query nvidia-persistenced.service + ansible.builtin.systemd_service: + name: nvidia-persistenced.service + register: nvidia_persistenced + +- name: Disable nvidia-persistenced.service + when: "nvidia_persistenced.status.LoadState != 'not-found'" + ansible.builtin.systemd_service: + name: nvidia-persistenced.service + enabled: false diff --git a/tools/compute-images/ansible/roles/compute_singularity/defaults/main.yml b/tools/compute-images/ansible/roles/compute_singularity/defaults/main.yml new file mode 100644 index 0000000000..985b210b45 --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_singularity/defaults/main.yml @@ -0,0 +1,8 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +compute_go_version: "1.21.10" +compute_singularity_version: "3.10.4" +compute_singularity_url: "https://github.com/sylabs/singularity" +workdir: "{{ ansible_env.get('TMPDIR', '/tmp') }}" diff --git a/tools/compute-images/ansible/roles/compute_singularity/tasks/main.yml b/tools/compute-images/ansible/roles/compute_singularity/tasks/main.yml new file mode 100644 index 0000000000..3ca34c3ee2 --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_singularity/tasks/main.yml @@ -0,0 +1,79 @@ +# Copyright (C) The Arvados Authors. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +- name: Install Singularity dependencies + ansible.builtin.apt: + name: + - build-essential + - git + - libglib2.0-dev + - libseccomp-dev + - squashfs-tools + +- name: Create Singularity build directory + become: no + ansible.builtin.tempfile: + state: directory + path: "{{ workdir }}" + prefix: singularity-build- + register: singularity_build + +- name: Clone Singularity + become: no + ansible.builtin.git: + repo: "{{ compute_singularity_url }}" + dest: "{{ singularity_build.path }}" + version: "v{{ compute_singularity_version }}" + +- name: Create Singularity VERSION file + become: no + ansible.builtin.copy: + content: | + {{ compute_singularity_version }} + dest: "{{ singularity_build.path }}/VERSION" + mode: 0644 + +- name: Install Go + ansible.builtin.unarchive: + src: "https://storage.googleapis.com/golang/go{{ compute_go_version }}.linux-amd64.tar.gz" + dest: "{{ singularity_build.path }}" + remote_src: yes + +- name: Build Singularity + become: no + ansible.builtin.command: + cmd: "{{ item }}" + chdir: "{{ singularity_build.path }}" + environment: + PATH: "{{ singularity_build.path }}/go/bin:{{ ansible_env.PATH }}" + loop: + - ./mconfig --prefix=/opt/singularity + - env -C builddir make + +- name: Install Singularity + ansible.builtin.command: + cmd: make install + chdir: "{{ singularity_build.path }}/builddir" + +- name: Clean Singularity build directory + ansible.builtin.file: + path: "{{ singularity_build.path }}" + state: absent + +- name: Add Singularity commands to PATH + ansible.builtin.file: + state: link + src: "/opt/singularity/bin/{{ item }}" + dest: "/usr/local/bin/{{ item }}" + loop: + - run-singularity + - singularity + +- name: Configure Singularity mksquashfs mem + ansible.builtin.lineinfile: + create: true + path: /opt/singularity/etc/singularity/singularity.conf + regexp: "^ *mksquashfs +mem *=" + line: "mksquashfs mem = {{ compute_mksquashfs_mem }}" + when: compute_mksquashfs_mem is defined 
diff --git a/tools/compute-images/.licenseignore b/tools/compute-images/ansible/roles/compute_user/defaults/main.yml similarity index 53% rename from tools/compute-images/.licenseignore rename to tools/compute-images/ansible/roles/compute_user/defaults/main.yml index 6288dbbc87..a147e9d542 100644 --- a/tools/compute-images/.licenseignore +++ b/tools/compute-images/ansible/roles/compute_user/defaults/main.yml @@ -1,5 +1,6 @@ # Copyright (C) The Arvados Authors. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 -*.json -1078ECD7.asc + +compute_user_account: crunch +compute_user_home: "/home/{{ compute_user_account }}" diff --git a/tools/compute-images/ansible/roles/compute_user/tasks/main.yml b/tools/compute-images/ansible/roles/compute_user/tasks/main.yml new file mode 100644 index 0000000000..90efa47079 --- /dev/null +++ b/tools/compute-images/ansible/roles/compute_user/tasks/main.yml @@ -0,0 +1,33 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +- name: Create compute user account + ansible.builtin.user: + name: "{{ compute_user_account }}" + comment: Crunch user,,,, + home: "{{ compute_user_home }}" + password_lock: true + +- name: Give compute user sudo access + ansible.builtin.lineinfile: + line: "{{ compute_user_account }} ALL=(ALL) NOPASSWD:ALL" + path: /etc/sudoers.d/91-crunch + create: true + owner: root + group: root + mode: 0644 + +- name: Create compute user .ssh directory + ansible.builtin.file: + state: directory + path: "{{ compute_user_home }}/.ssh" + owner: "{{ compute_user_account }}" + mode: 0700 + +- name: Configure compute user authorized keys + ansible.builtin.copy: + src: "{{ compute_authorized_keys }}" + dest: "{{ compute_user_home }}/.ssh/authorized_keys" + owner: "{{ compute_user_account }}" + mode: 0600 diff --git a/tools/compute-images/ansible/roles/distro_apt/defaults/main.yml b/tools/compute-images/ansible/roles/distro_apt/defaults/main.yml new file mode 100644 index 
0000000000..8d0373722f --- /dev/null +++ b/tools/compute-images/ansible/roles/distro_apt/defaults/main.yml @@ -0,0 +1,8 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +debian_apt_mirror: "{{ 'http://cdn-aws.deb.debian.org/debian' if ansible_system_vendor == 'Amazon EC2' else 'http://deb.debian.org/debian' }}" +debian_apt_components: + - main + - contrib diff --git a/tools/compute-images/ansible/roles/distro_apt/tasks/debian.yml b/tools/compute-images/ansible/roles/distro_apt/tasks/debian.yml new file mode 100644 index 0000000000..7e877e0bf4 --- /dev/null +++ b/tools/compute-images/ansible/roles/distro_apt/tasks/debian.yml @@ -0,0 +1,11 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +- name: Set up Debian apt repository + ansible.builtin.deb822_repository: + name: debian + types: deb + uris: "{{ debian_apt_mirror }}" + suites: "{{ ansible_distribution_release }}" + components: "{{ debian_apt_components }}" diff --git a/tools/compute-images/ansible/roles/distro_apt/tasks/main.yml b/tools/compute-images/ansible/roles/distro_apt/tasks/main.yml new file mode 100644 index 0000000000..637bdc13c0 --- /dev/null +++ b/tools/compute-images/ansible/roles/distro_apt/tasks/main.yml @@ -0,0 +1,13 @@ +# Copyright (C) The Arvados Authors. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +- name: Check distribution + ansible.builtin.fail: + msg: "Unsupported distribution: {{ ansible_distribution }}" + when: "ansible_distribution != 'Debian'" + +- name: Set up Debian apt repository + ansible.builtin.include_tasks: + file: debian.yml + when: "ansible_distribution == 'Debian'" diff --git a/tools/compute-images/arvados-images-aws.json b/tools/compute-images/arvados-images-aws.json index 8402e96fd8..6285b84844 100644 --- a/tools/compute-images/arvados-images-aws.json +++ b/tools/compute-images/arvados-images-aws.json @@ -1,25 +1,16 @@ { "variables": { + "ansible_vars_file": "", "arvados_cluster": "", "aws_access_key": "", "aws_profile": "", "aws_secret_key": "", "aws_source_ami": "ami-0a9d5908c7201e91d", - "aws_ebs_autoscale": "", "aws_associate_public_ip_address": "", "aws_ena_support": "", - "build_environment": "aws", - "public_key_file": "", - "mksquashfs_mem": "", - "nvidia_gpu_support": "", - "goversion": "", - "reposuffix": "", - "pin_packages": "true", - "resolver": "", "ssh_user": "admin", "subnet_id": "", - "vpc_id": "", - "workdir": "/tmp" + "vpc_id": "" }, "builders": [{ "type": "amazon-ebs", @@ -71,50 +62,12 @@ } }], "provisioners": [{ - "type": "file", - "source": "1078ECD7.asc", - "destination": "{{user `workdir`}}/1078ECD7.asc" - },{ - "type": "file", - "source": "scripts/etc-apt-preferences.d-arvados.pref", - "destination": "{{user `workdir`}}/etc-apt-preferences.d-arvados.pref" - },{ - "type": "file", - "source": "scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg", - "destination": "{{user `workdir`}}/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg" - },{ - "type": "file", - "source": "scripts/etc-systemd-system-systemd-modules-load.service.d-detect-gpu.conf", - "destination": "{{user `workdir`}}/etc-systemd-system-systemd-modules-load.service.d-detect-gpu.conf" - },{ - "type": "file", - "source": "scripts/usr-local-bin-detect-gpu.sh", - "destination": "{{user 
`workdir`}}/usr-local-bin-detect-gpu.sh" - },{ - "type": "file", - "source": "scripts/usr-local-bin-ensure-encrypted-partitions.sh", - "destination": "{{user `workdir`}}/usr-local-bin-ensure-encrypted-partitions.sh" - },{ - "type": "file", - "source": "scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh", - "destination": "{{user `workdir`}}/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh" - },{ - "type": "file", - "source": "{{user `public_key_file`}}", - "destination": "{{user `workdir`}}/crunch-authorized_keys" - },{ - "type": "shell", - "execute_command": "sudo -S env {{ .Vars }} /bin/bash '{{ .Path }}'", - "script": "scripts/base.sh", - "environment_vars": [ - "RESOLVER={{user `resolver`}}", - "REPOSUFFIX={{user `reposuffix`}}", - "MKSQUASHFS_MEM={{user `mksquashfs_mem`}}", - "NVIDIA_GPU_SUPPORT={{user `nvidia_gpu_support`}}", - "CLOUD=aws", - "AWS_EBS_AUTOSCALE={{user `aws_ebs_autoscale`}}", - "GOVERSION={{user `goversion`}}", - "WORKDIR={{user `workdir`}}" - ] + "type": "ansible", + "playbook_file": "ansible/build-compute-image.yml", + "user": "{{user `ssh_user`}}", + "extra_arguments": [ + "--extra-vars", "@{{ user `ansible_vars_file` }}", + "--scp-extra-args", "'-O'" + ] }] } diff --git a/tools/compute-images/arvados-images-azure.json b/tools/compute-images/arvados-images-azure.json index 925296b395..2bfdd1d487 100644 --- a/tools/compute-images/arvados-images-azure.json +++ b/tools/compute-images/arvados-images-azure.json @@ -1,5 +1,6 @@ { "variables": { + "ansible_vars_file": "", "account_file": "", "arvados_cluster": "", "build_environment": "azure-arm", @@ -9,18 +10,11 @@ "image_sku": "", "location": "centralus", "project_id": "", - "public_key_file": "", - "mksquashfs_mem": "", - "nvidia_gpu_support": "", - "reposuffix": "", - "pin_packages": "true", - "resolver": "", "resource_group": null, "ssh_private_key_file": "{{env `PACKERPRIVKEY`}}", "ssh_user": "packer", "subscription_id": "{{env `ARM_SUBSCRIPTION_ID`}}", - 
"tenant_id": "{{env `ARM_TENANT_ID`}}", - "workdir": "/tmp" + "tenant_id": "{{env `ARM_TENANT_ID`}}" }, "builders": [ { @@ -48,47 +42,13 @@ "vm_size": "Standard_D1_v2" } ], - "provisioners": [{ - "type": "file", - "source": "1078ECD7.asc", - "destination": "/{{user `workdir`}}/1078ECD7.asc" - },{ - "type": "file", - "source": "scripts/etc-apt-preferences.d-arvados.pref", - "destination": "{{user `workdir`}}/etc-apt-preferences.d-arvados.pref" - },{ - "type": "file", - "source": "scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg", - "destination": "/{{user `workdir`}}/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg" - },{ - "type": "file", - "source": "scripts/etc-systemd-system-systemd-modules-load.service.d-detect-gpu.conf", - "destination": "{{user `workdir`}}/etc-systemd-system-systemd-modules-load.service.d-detect-gpu.conf" - },{ - "type": "file", - "source": "scripts/usr-local-bin-detect-gpu.sh", - "destination": "{{user `workdir`}}/usr-local-bin-detect-gpu.sh" - },{ - "type": "file", - "source": "scripts/usr-local-bin-ensure-encrypted-partitions.sh", - "destination": "/{{user `workdir`}}/usr-local-bin-ensure-encrypted-partitions.sh" - },{ - "type": "file", - "source": "{{user `public_key_file`}}", - "destination": "/{{user `workdir`}}/crunch-authorized_keys" - },{ - "type": "shell", - "execute_command": "sudo -S env {{ .Vars }} /bin/bash '{{ .Path }}'", - "script": "scripts/base.sh", - "environment_vars": [ - "RESOLVER={{user `resolver`}}", - "REPOSUFFIX={{user `reposuffix`}}", - "MKSQUASHFS_MEM={{user `mksquashfs_mem`}}", - "NVIDIA_GPU_SUPPORT={{user `nvidia_gpu_support`}}", - "CLOUD=azure", - "GOVERSION={{user `goversion`}}", - "WORKDIR={{user `workdir`}}" - ] + "type": "ansible", + "playbook_file": "ansible/build-compute-image.yml", + "user": "{{user `ssh_user`}}", + "extra_arguments": [ + "--extra-vars", "@{{ user `ansible_vars_file` }}", + "--scp-extra-args", "'-O'" + ] }] } diff --git a/tools/compute-images/build.sh 
b/tools/compute-images/build.sh index 54ca662f91..52b87c4710 100755 --- a/tools/compute-images/build.sh +++ b/tools/compute-images/build.sh @@ -30,8 +30,10 @@ Options: VPC id for AWS, if not specified packer will derive from the subnet id or pick the default one. --aws-subnet-id Subnet id for AWS, if not specified packer will pick the default one for the VPC. - --aws-ebs-autoscale - Install the AWS EBS autoscaler daemon (default: do not install the AWS EBS autoscaler). + --aws-ebs-autoscale, --no-aws-ebs-autoscale + These flags determine whether or not to use an EBS autoscaling volume for + Crunch's working directory. The default is to use this when building an + image on AWS. --aws-associate-public-ip Associate a public IP address with the node used for building the compute image. Required when the machine running packer can not reach the node used for building @@ -81,6 +83,20 @@ EOF set -e -o pipefail +ansible_vars_file="$(mktemp --tmpdir ansible-vars-XXXXXX.yml)" +trap 'rm -f "$ansible_vars_file"' EXIT INT TERM QUIT +# FIXME? We build the compute node image with the same version of Go that +# Arvados uses, but it's not clear that we should: the only thing we use Go +# for is to build Singularity, so what matters is what Singularity wants, not +# what Arvados wants. 
+sed -rn 's/^const +goversion *= */compute_go_version: /p' \ + <../../lib/install/deps.go >>"$ansible_vars_file" + +ansible_set_var() { + eval "$(printf "%s=%q" "$1" "$2")" + echo "$1: $2" >>"$ansible_vars_file" +} + JSON_FILE= ARVADOS_CLUSTER_ID= AWS_PROFILE= @@ -100,15 +116,10 @@ AZURE_LOCATION= AZURE_CLOUD_ENVIRONMENT= DEBUG= SSH_USER= -WORKDIR= AWS_DEFAULT_REGION=us-east-1 -PIN_PACKAGES= -PUBLIC_KEY_FILE= -MKSQUASHFS_MEM=256M -NVIDIA_GPU_SUPPORT= PARSEDOPTS=$(getopt --name "$0" --longoptions \ - help,json-file:,arvados-cluster-id:,aws-source-ami:,aws-profile:,aws-secrets-file:,aws-region:,aws-vpc-id:,aws-subnet-id:,aws-ebs-autoscale,aws-associate-public-ip:,aws-ena-support:,gcp-project-id:,gcp-account-file:,gcp-zone:,azure-secrets-file:,azure-resource-group:,azure-location:,azure-sku:,azure-cloud-environment:,ssh_user:,workdir:,resolver:,reposuffix:,pin-packages,no-pin-packages,public-key-file:,mksquashfs-mem:,nvidia-gpu-support,debug \ + help,json-file:,arvados-cluster-id:,aws-source-ami:,aws-profile:,aws-secrets-file:,aws-region:,aws-vpc-id:,aws-subnet-id:,aws-ebs-autoscale,no-aws-ebs-autoscale,aws-associate-public-ip:,aws-ena-support:,gcp-project-id:,gcp-account-file:,gcp-zone:,azure-secrets-file:,azure-resource-group:,azure-location:,azure-sku:,azure-cloud-environment:,ssh_user:,workdir:,resolver:,reposuffix:,pin-packages,no-pin-packages,public-key-file:,mksquashfs-mem:,nvidia-gpu-support,debug \ -- "" "$@") if [ $? 
-ne 0 ]; then exit 1 @@ -147,7 +158,10 @@ while [ $# -gt 0 ]; do AWS_SUBNET_ID="$2"; shift ;; --aws-ebs-autoscale) - AWS_EBS_AUTOSCALE=1 + ansible_set_var arvados_compute_encrypted_tmp aws_ebs + ;; + --no-aws-ebs-autoscale) + ansible_set_var arvados_compute_encrypted_tmp '""' ;; --aws-associate-public-ip) AWS_ASSOCIATE_PUBLIC_IP="$2"; shift @@ -183,28 +197,29 @@ while [ $# -gt 0 ]; do SSH_USER="$2"; shift ;; --workdir) - WORKDIR="$2"; shift + ansible_set_var workdir "$2"; shift ;; --resolver) - RESOLVER="$2"; shift + ansible_set_var dns_resolver "$2"; shift ;; --reposuffix) - REPOSUFFIX="$2"; shift + ansible_set_var arvados_apt_suites "$2"; shift ;; --pin-packages) - PIN_PACKAGES=true + ansible_set_var arvados_compute_pin_packages true ;; --no-pin-packages) - PIN_PACKAGES=false + ansible_set_var arvados_pin_version '""' + ansible_set_var arvados_compute_pin_packages false ;; --public-key-file) - PUBLIC_KEY_FILE="$2"; shift + ansible_set_var compute_authorized_keys "$(readlink -e "$2")"; shift ;; --mksquashfs-mem) - MKSQUASHFS_MEM="$2"; shift + ansible_set_var compute_mksquashfs_mem "$2"; shift ;; --nvidia-gpu-support) - NVIDIA_GPU_SUPPORT=1 + ansible_set_var arvados_compute_nvidia true ;; --debug) # If you want to debug a build issue, add the -debug flag to the build @@ -224,7 +239,6 @@ while [ $# -gt 0 ]; do shift done - if [[ -z "$JSON_FILE" ]] || [[ ! -f "$JSON_FILE" ]]; then echo >&2 "$helpmessage" echo >&2 @@ -241,7 +255,7 @@ if [[ -z "$ARVADOS_CLUSTER_ID" ]]; then exit 1 fi -if [[ -z "$PUBLIC_KEY_FILE" ]] || [[ ! -f "$PUBLIC_KEY_FILE" ]]; then +if [[ -z "${compute_authorized_keys:-}" || ! -f "$compute_authorized_keys" ]]; then echo >&2 "$helpmessage" echo >&2 echo >&2 "ERROR: public key file file not found" @@ -257,7 +271,6 @@ if [[ ! 
-z "$AZURE_SECRETS_FILE" ]]; then source $AZURE_SECRETS_FILE fi - AWS=0 EXTRA2="" @@ -281,10 +294,6 @@ if [[ -n "$AWS_DEFAULT_REGION" ]]; then EXTRA2+=" -var aws_default_region=$AWS_DEFAULT_REGION" AWS=1 fi -if [[ -n "$AWS_EBS_AUTOSCALE" ]]; then - EXTRA2+=" -var aws_ebs_autoscale=$AWS_EBS_AUTOSCALE" - AWS=1 -fi if [[ $AWS -eq 1 ]]; then EXTRA2+=" -var aws_associate_public_ip_address=$AWS_ASSOCIATE_PUBLIC_IP" EXTRA2+=" -var aws_ena_support=$AWS_ENA_SUPPORT" @@ -313,39 +322,16 @@ fi if [[ -n "$SSH_USER" ]]; then EXTRA2+=" -var ssh_user=$SSH_USER" fi -if [[ -n "$WORKDIR" ]]; then - EXTRA2+=" -var workdir=$WORKDIR" -fi -if [[ -n "$RESOLVER" ]]; then - EXTRA2+=" -var resolver=$RESOLVER" +if [[ -z "${arvados_compute_pin_packages:-}" && "${arvados_apt_suites:-}" = -dev ]]; then + ansible_set_var arvados_pin_version '""' + ansible_set_var arvados_compute_pin_packages false fi -if [[ -n "$REPOSUFFIX" ]]; then - EXTRA2+=" -var reposuffix=$REPOSUFFIX" -fi -if [[ -z "$PIN_PACKAGES" ]]; then - case "$REPOSUFFIX" in - -dev) PIN_PACKAGES=false ;; - *) PIN_PACKAGES=true ;; - esac -fi -EXTRA2+=" -var pin_packages=$PIN_PACKAGES" -if [[ -n "$PUBLIC_KEY_FILE" ]]; then - EXTRA2+=" -var public_key_file=$PUBLIC_KEY_FILE" -fi -if [[ -n "$MKSQUASHFS_MEM" ]]; then - EXTRA2+=" -var mksquashfs_mem=$MKSQUASHFS_MEM" -fi -if [[ -n "$NVIDIA_GPU_SUPPORT" ]]; then - EXTRA2+=" -var nvidia_gpu_support=$NVIDIA_GPU_SUPPORT" -fi - -GOVERSION=$(grep 'const goversion =' ../../lib/install/deps.go |awk -F'"' '{print $2}') -EXTRA2+=" -var goversion=$GOVERSION" logfile=packer-$(date -Iseconds).log echo +cat "$ansible_vars_file" packer version echo -echo packer build$EXTRA -var "arvados_cluster=$ARVADOS_CLUSTER_ID"$EXTRA2 $JSON_FILE | tee -a $logfile -packer build$EXTRA -var "arvados_cluster=$ARVADOS_CLUSTER_ID"$EXTRA2 $JSON_FILE 2>&1 | tee -a $logfile +echo packer build$EXTRA -var "arvados_cluster=$ARVADOS_CLUSTER_ID" -var "ansible_vars_file=$ansible_vars_file" $EXTRA2 $JSON_FILE | tee -a $logfile +packer 
build$EXTRA -var "arvados_cluster=$ARVADOS_CLUSTER_ID" -var "ansible_vars_file=$ansible_vars_file" $EXTRA2 $JSON_FILE 2>&1 | tee -a $logfile diff --git a/tools/compute-images/scripts/base.sh b/tools/compute-images/scripts/base.sh deleted file mode 100644 index c352f94285..0000000000 --- a/tools/compute-images/scripts/base.sh +++ /dev/null @@ -1,258 +0,0 @@ -#!/bin/bash -euxo pipefail - -# Copyright (C) The Arvados Authors. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -set -eu -o pipefail - -SUDO=sudo - -wait_for_apt_locks() { - while $SUDO fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do - echo "APT: Waiting for apt/dpkg locks to be released..." - sleep 1 - done -} - -safe_apt() { - wait_for_apt_locks && - $SUDO env DEBIAN_FRONTEND=noninteractive apt-get -q --yes "$@" -} - -download_and_install() { - local url="$1"; shift - local dest="$1"; shift - curl -fsSL "$url" | $SUDO install "$@" /dev/stdin "$dest" -} - -. /etc/os-release -DISTRO_ID="$ID" -echo "Working directory is '${WORKDIR}'" - -### 1. Configure apt preferences - -# Third-party packages may depend on contrib packages. -# Make sure we have that component enabled for all existing sources. -if [[ "$DISTRO_ID" = debian ]]; then - find /etc/apt -name "*.list" -print0 | - xargs -0r $SUDO sed -ri '/^deb / s/$/ contrib/' - find /etc/apt -name "*.sources" -print0 | - xargs -0r $SUDO sed -ri '/^Components:/ s/$/ contrib/' -fi - -if [[ "${PIN_PACKAGES:-true}" != false ]]; then - $SUDO install -d /etc/apt/preferences.d - $SUDO install -m 0644 \ - "$WORKDIR/etc-apt-preferences.d-arvados.pref" \ - /etc/apt/preferences.d/arvados.pref -fi - -### 2. Install all base packages we need - -safe_apt update -# Add the debian keys (but don't abort if we can't find them, e.g. 
on Ubuntu where we don't need them) -safe_apt install debian-keyring debian-archive-keyring 2>/dev/null || true -safe_apt upgrade -# Install gnupg and dirmgr or gpg key checks will fail -safe_apt install \ - gnupg \ - dirmngr \ - lsb-release \ - cloud-init \ - openssh-server \ - apt-utils \ - git \ - curl \ - libcurl3-gnutls \ - libcurl4-openssl-dev \ - lvm2 \ - cryptsetup \ - xfsprogs \ - jq \ - unzip \ - make \ - build-essential \ - libssl-dev \ - uuid-dev \ - squashfs-tools \ - libglib2.0-dev \ - libseccomp-dev - -safe_apt remove --purge unattended-upgrades - -### 3. Set up third-party apt repositories and install packages we need from them -$SUDO install -d /etc/apt/keyrings - -# Add the Arvados apt source -download_and_install https://apt.arvados.org/pubkey.gpg /etc/apt/keyrings/arvados.asc -$SUDO install -m 644 /dev/stdin /etc/apt/sources.list.d/arvados.sources < /etc/arvados/docker-cleaner/docker-cleaner.json - -# Enable cgroup accounting (forcing cgroups v1) -$SUDO echo 'GRUB_CMDLINE_LINUX="$GRUB_CMDLINE_LINUX cgroup_enable=memory swapaccount=1 systemd.unified_cgroup_hierarchy=0"' >> /etc/default/grub -$SUDO update-grub - -# Make sure user_allow_other is set in fuse.conf -$SUDO sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf - -# Add crunch user with sudo powers -$SUDO adduser --disabled-password --gecos "Crunch user,,,," crunch -# Do not require a password to sudo -echo -e "# for the crunch user\ncrunch ALL=(ALL) NOPASSWD:ALL" | $SUDO tee /etc/sudoers.d/91-crunch -# Set up the ssh public key for the crunch user -$SUDO install -d -m 700 -o crunch -g crunch ~crunch/.ssh -$SUDO install -m 600 -o crunch -g crunch "$WORKDIR/crunch-authorized_keys" ~crunch/.ssh/authorized_keys - -# Make sure we resolve via the provided resolver IP if set. Prepending is good enough because -# unless 'rotate' is set, the nameservers are queried in order (cf. 
man resolv.conf) -if [ -n "${RESOLVER:-}" ]; then - $SUDO sed -i "s/#prepend domain-name-servers 127.0.0.1;/prepend domain-name-servers ${RESOLVER};/" /etc/dhcp/dhclient.conf -fi - -if [ "${AWS_EBS_AUTOSCALE:-}" != "1" ]; then - # Set up the cloud-init script that will ensure encrypted disks - $SUDO install "$WORKDIR/usr-local-bin-ensure-encrypted-partitions.sh" /usr/local/bin/ensure-encrypted-partitions.sh -else - download_and_install "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" "${WORKDIR}/awscliv2.zip" - unzip -q ${WORKDIR}/awscliv2.zip -d ${WORKDIR} && $SUDO ${WORKDIR}/aws/install - EBS_AUTOSCALE_VERSION="ee323f0751c2b6f733692e805b51b9bf3c251bac" - $SUDO env -C /opt git clone https://github.com/arvados/amazon-ebs-autoscale.git - $SUDO git -C /opt/amazon-ebs-autoscale checkout "$EBS_AUTOSCALE_VERSION" - - # Set up the cloud-init script that makes use of the AWS EBS autoscaler - $SUDO install "$WORKDIR/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh" /usr/local/bin/ensure-encrypted-partitions.sh -fi - -$SUDO install -m 644 \ - "$WORKDIR/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg" \ - /etc/cloud/cloud.cfg.d/07_compute_arvados_dispatch_cloud.cfg - -if [ "$NVIDIA_GPU_SUPPORT" == "1" ]; then - # We need a kernel and matching headers - if [[ "$DISTRO_ID" == debian ]]; then - safe_apt install linux-image-cloud-amd64 linux-headers-cloud-amd64 - elif [ "$CLOUD" == "azure" ]; then - safe_apt install linux-image-azure linux-headers-azure - elif [ "$CLOUD" == "aws" ]; then - safe_apt install linux-image-aws linux-headers-aws - fi - safe_apt install cuda libnvidia-container1 libnvidia-container-tools nvidia-container-toolkit - - # Various components fail to start, and cause systemd to boot in degraded - # state, if the system does not actually have an NVIDIA GPU. Configure the - # image to adapt at boot time. - - # Don't load modules unconditionally. - # Instead load them if hardware is detected. 
- if [[ -f /etc/modules-load.d/nvidia.conf ]]; then - $SUDO mv /etc/modules-load.d/nvidia.conf /etc/modules-load.d/nvidia.avail - fi - $SUDO install "$WORKDIR/usr-local-bin-detect-gpu.sh" /usr/local/bin/detect-gpu.sh - $SUDO install -d /etc/systemd/system/systemd-modules-load.service.d - $SUDO install -m 0644 \ - "$WORKDIR/etc-systemd-system-systemd-modules-load.service.d-detect-gpu.conf" \ - /etc/systemd/system/systemd-modules-load.service.d/detect-gpu.conf - - # Don't start the persistence daemon. - # Instead rely on crunch-run's CUDA initialization. - if $SUDO systemctl is-enabled --quiet nvidia-persistenced.service; then - $SUDO systemctl disable nvidia-persistenced.service - fi -fi - -# Get Go and build singularity -mkdir -p /var/lib/arvados -rm -rf /var/lib/arvados/go/ -curl -fsSL https://storage.googleapis.com/golang/go${GOVERSION}.linux-amd64.tar.gz | - tar -C /var/lib/arvados -xz -ln -sf /var/lib/arvados/go/bin/* /usr/local/bin/ - -singularityversion=3.10.4 -cd /var/lib/arvados -git clone --recurse-submodules https://github.com/sylabs/singularity -cd singularity -git checkout v${singularityversion} -echo $singularityversion > VERSION -./mconfig --prefix=/var/lib/arvados -make -C ./builddir -make -C ./builddir install -ln -sf /var/lib/arvados/bin/* /usr/local/bin/ - -# set `mksquashfs mem` in the singularity config file if it is configured -if [ "$MKSQUASHFS_MEM" != "" ]; then - echo "mksquashfs mem = ${MKSQUASHFS_MEM}" >> /var/lib/arvados/etc/singularity/singularity.conf -fi - -# Print singularity version installed -singularity --version - -safe_apt clean diff --git a/tools/compute-images/scripts/etc-apt-preferences.d-arvados.pref b/tools/compute-images/scripts/etc-apt-preferences.d-arvados.pref deleted file mode 100644 index 3802482785..0000000000 --- a/tools/compute-images/scripts/etc-apt-preferences.d-arvados.pref +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (C) The Arvados Authors. All rights reserved. 
-# -# SPDX-License-Identifier: Apache-2.0 - -Explanation: Do not upgrade Arvados packages after they are installed -Explanation: (you should build a new compute image instead). -Package: * -Pin: origin "apt.arvados.org" -Pin-Priority: 50 - -# For every package that `base.sh` installs from a third-party repository, -# pin those packages to the last tested and known good version. - -Package: src:docker-ce -Pin: version 5:27.* -Pin-Priority: 995 - -Package: containerd.io -Pin: version 1.7.* -Pin-Priority: 995 - -Package: cuda -Pin: version 12.5.* -Pin-Priority: 995 - -Package: src:libnvidia-container src:nvidia-container-toolkit -Pin: version 1.16.* -Pin-Priority: 995 - -Package: cuda-drivers src:nvidia-graphics-drivers -Pin: version 560.* -Pin-Priority: 995 diff --git a/tools/compute-images/scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg b/tools/compute-images/scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg deleted file mode 100644 index febeda372e..0000000000 --- a/tools/compute-images/scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (C) The Arvados Authors. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -runcmd: - - /bin/echo "cloudinit runcmd starting" | /usr/bin/logger - - /usr/local/bin/ensure-encrypted-partitions.sh - - /bin/echo "cloudinit runcmd ensure-encrypted-partitions.sh done" | /usr/bin/logger - - /bin/echo "cloudinit runcmd finished" | /usr/bin/logger - - /bin/touch /arvados-compute-node-boot.complete diff --git a/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh b/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh deleted file mode 100644 index d9790fb45c..0000000000 --- a/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -# Copyright (C) The Arvados Authors. 
All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -set -e -set -x - -MOUNTPATH=/tmp - -findmntq() { - findmnt "$@" >/dev/null -} - -ensure_umount() { - if findmntq "$1"; then - umount "$1" - fi -} - -# First make sure docker is not using /tmp, then unmount everything under it. -if [ -d /etc/sv/docker.io ] -then - # TODO: Actually detect Docker state with runit - DOCKER_ACTIVE=true - sv stop docker.io || service stop docker.io || true -else - if systemctl --quiet is-active docker.service docker.socket; then - systemctl stop docker.service docker.socket || true - DOCKER_ACTIVE=true - else - DOCKER_ACTIVE=false - fi -fi - -ensure_umount "$MOUNTPATH/docker/aufs" - -/bin/bash /opt/amazon-ebs-autoscale/install.sh --imdsv2 -f lvm.ext4 -m $MOUNTPATH 2>&1 > /var/log/ebs-autoscale-install.log - -# Make sure docker uses the big partition -cat < /etc/docker/daemon.json -{ - "data-root": "$MOUNTPATH/docker-data" -} -EOF - -if ! $DOCKER_ACTIVE; then - # Nothing else to do - exit 0 -fi - -# restart docker -if [ -d /etc/sv/docker.io ] -then - ## runit - sv up docker.io -else - systemctl start docker.service docker.socket || true -fi - -end=$((SECONDS+60)) - -while [ $SECONDS -lt $end ]; do - if /usr/bin/docker ps -q >/dev/null; then - exit 0 - fi - sleep 1 -done - -# Docker didn't start within a minute, abort -exit 1 -- 2.30.2