From ee0896bf5bf1f7f4ebdc5168e928c28d9f06eaeb Mon Sep 17 00:00:00 2001 From: Ward Vandewege Date: Mon, 14 Jul 2014 00:05:24 -0400 Subject: [PATCH] Compute nodes can run jobs - almost. The final arv-put --raw in the collation step still fails (file descriptor error?). refs #3219 --- docker/api/Dockerfile | 26 +++++---- docker/api/application.yml.in | 3 + docker/api/crunch-dispatch-run.sh | 9 +-- docker/api/keep_server_0.json | 6 ++ docker/api/keep_server_1.json | 7 +++ docker/api/setup.sh.in | 47 +++++++++++++++ docker/api/supervisor.conf | 5 ++ docker/arvdock | 25 ++++---- docker/build_tools/Makefile | 10 ++++ docker/compute/Dockerfile | 21 ++++++- docker/compute/apt.arvados.org.list | 2 + docker/compute/fuse.conf | 10 ++++ docker/compute/setup.sh.in | 18 ++++++ docker/compute/supervisor.conf | 12 ++++ docker/compute/wrapdocker | 90 +++++++++++++++++++++++++++++ docker/slurm/Dockerfile | 2 +- 16 files changed, 265 insertions(+), 28 deletions(-) create mode 100644 docker/api/keep_server_0.json create mode 100644 docker/api/keep_server_1.json create mode 100755 docker/api/setup.sh.in create mode 100644 docker/compute/apt.arvados.org.list create mode 100644 docker/compute/fuse.conf create mode 100755 docker/compute/setup.sh.in create mode 100755 docker/compute/wrapdocker diff --git a/docker/api/Dockerfile b/docker/api/Dockerfile index 8f69b808f5..ce5755d7c8 100644 --- a/docker/api/Dockerfile +++ b/docker/api/Dockerfile @@ -7,21 +7,14 @@ MAINTAINER Tim Pierce # Clone a git repository of Arvados source -- not used to build, but # will be used by the Commit model and anything else that needs to # check a git repo for crunch scripts. 
-# RUN apt-get update && \ apt-get -q -y install procps postgresql postgresql-server-dev-9.1 apache2 slurm-llnl munge \ - supervisor && \ + supervisor sudo libwww-perl libio-socket-ssl-perl libcrypt-ssleay-perl \ + libjson-perl && \ git clone --bare git://github.com/curoverse/arvados.git /var/cache/git/arvados.git -# For crunch-dispatch -#ADD apt.arvados.org.list /etc/apt/sources.list.d/ - -#RUN apt-key adv --keyserver pgp.mit.edu --recv 1078ECD7 && apt-get update && \ -#RUN apt-get -q -y install libjson-perl libwww-perl libio-socket-ssl-perl libipc-system-simple-perl slurm-llnl munge -# apt-get -q -y install arvados-src libjson-perl libwww-perl libio-socket-ssl-perl libipc-system-simple-perl slurm-llnl munge - ADD munge.key /etc/munge/ -RUN chown munge:munge /etc/munge/munge.key +RUN chown munge:munge /etc/munge/munge.key && chmod 600 /etc/munge/munge.key ADD generated/slurm.conf /etc/slurm-llnl/ RUN /usr/local/rvm/bin/rvm-exec default gem install arvados-cli arvados @@ -61,9 +54,22 @@ RUN a2dissite default && \ a2enmod ssl && \ /bin/mkdir /var/run/apache2 +# Install a token for root +RUN mkdir -p /root/.config/arvados; echo "ARVADOS_API_HOST=api" >> /root/.config/arvados/settings.conf && echo "ARVADOS_API_HOST_INSECURE=yes" >> /root/.config/arvados/settings.conf && echo "ARVADOS_API_TOKEN=$(cat /tmp/superuser_token)" >> /root/.config/arvados/settings.conf && chmod 600 /root/.config/arvados/settings.conf + +# Set up directory for job commit repo +RUN mkdir -p /var/lib/arvados +# Add crunch user +RUN addgroup --gid 4005 crunch && mkdir /home/crunch && useradd --uid 4005 --gid 4005 crunch && chown crunch:crunch /home/crunch + +# Create keep and compute node objects +ADD keep_server_0.json /root/ +ADD keep_server_1.json /root/ + # Supervisor. 
ADD supervisor.conf /etc/supervisor/conf.d/arvados.conf ADD ssh.sh /usr/local/bin/ssh.sh +ADD generated/setup.sh /usr/local/bin/setup.sh ADD crunch-dispatch-run.sh /usr/local/bin/crunch-dispatch-run.sh ADD apache2_foreground.sh /etc/apache2/foreground.sh diff --git a/docker/api/application.yml.in b/docker/api/application.yml.in index 056d4b9263..f832b7f3d4 100644 --- a/docker/api/application.yml.in +++ b/docker/api/application.yml.in @@ -50,6 +50,9 @@ production: # Net::HTTP.get(URI("http://169.254.169.254/latest/meta-data/#{iface}-ipv4")).match(/^[\d\.]+$/)[0] # end << '172.16.0.23' # %> + permit_create_collection_with_unsigned_manifest: true + git_repositories_dir: /var/cache/git + crunch_job_wrapper: :slurm_immediate test: uuid_prefix: zzzzz diff --git a/docker/api/crunch-dispatch-run.sh b/docker/api/crunch-dispatch-run.sh index c16a4334c2..f422e78e38 100755 --- a/docker/api/crunch-dispatch-run.sh +++ b/docker/api/crunch-dispatch-run.sh @@ -1,15 +1,16 @@ #!/bin/bash set -e -export PATH="$PATH":/usr/local/arvados/src/services/crunch -export PERLLIB=/usr/local/arvados/src/sdk/perl/lib -export ARVADOS_API_HOST=qr1hi.arvadosapi.com +export PATH="$PATH":/usr/src/arvados/services/crunch +export PERLLIB=/usr/src/arvados/sdk/perl/lib +export ARVADOS_API_HOST=api +export ARVADOS_API_HOST_INSECURE=yes export CRUNCH_DISPATCH_LOCKFILE=/var/lock/crunch-dispatch if [[ ! 
-e $CRUNCH_DISPATCH_LOCKFILE ]]; then touch $CRUNCH_DISPATCH_LOCKFILE fi -export CRUNCH_JOB_BIN=/usr/local/arvados/src/services/crunch/crunch-job +export CRUNCH_JOB_BIN=/usr/src/arvados/services/crunch/crunch-job export HOME=`pwd` fuser -TERM -k $CRUNCH_DISPATCH_LOCKFILE || true diff --git a/docker/api/keep_server_0.json b/docker/api/keep_server_0.json new file mode 100644 index 0000000000..ce02f50865 --- /dev/null +++ b/docker/api/keep_server_0.json @@ -0,0 +1,6 @@ +{ + "service_host": "keep_server_0.keep.dev.arvados", + "service_port": 25107, + "service_ssl_flag": "false", + "service_type": "disk" +} diff --git a/docker/api/keep_server_1.json b/docker/api/keep_server_1.json new file mode 100644 index 0000000000..dbbdd1c31f --- /dev/null +++ b/docker/api/keep_server_1.json @@ -0,0 +1,7 @@ +{ + "service_host": "keep_server_1.keep.dev.arvados", + "service_port": 25107, + "service_ssl_flag": "false", + "service_type": "disk" +} + diff --git a/docker/api/setup.sh.in b/docker/api/setup.sh.in new file mode 100755 index 0000000000..fd4cf07652 --- /dev/null +++ b/docker/api/setup.sh.in @@ -0,0 +1,47 @@ +#!/bin/bash + +set -x + +. /etc/profile.d/rvm.sh + +export ARVADOS_API_HOST=api +export ARVADOS_API_HOST_INSECURE=yes +export ARVADOS_API_TOKEN=@@API_SUPERUSER_SECRET@@ + +# All users group + +prefix=`arv --format=uuid user current | cut -d- -f1` +read -rd $'\000' newgroup < /tmp/keep_service.list + +grep -q keep_server_0 /tmp/keep_service.list +if [[ "$?" != "0" ]]; then + arv keep_service create --keep-service "$(cat /root/keep_server_0.json)" +fi + +grep -q keep_server_1 /tmp/keep_service.list +if [[ "$?" 
!= "0" ]]; then + arv keep_service create --keep-service "$(cat /root/keep_server_1.json)" +fi + diff --git a/docker/api/supervisor.conf b/docker/api/supervisor.conf index 9c4a6a529a..a1dacac92d 100644 --- a/docker/api/supervisor.conf +++ b/docker/api/supervisor.conf @@ -19,6 +19,11 @@ command=/etc/init.d/munge start user=root command=/etc/init.d/slurm-llnl start +[program:setup] +user=root +command=/usr/local/bin/setup.sh +startsecs=0 + [program:crunch-dispatch] user=root command=/usr/local/bin/crunch-dispatch-run.sh diff --git a/docker/arvdock b/docker/arvdock index abc3e8d825..9336ae1583 100755 --- a/docker/arvdock +++ b/docker/arvdock @@ -47,7 +47,8 @@ function start_container { args="$args --hostname api -P --name $name" elif [[ "$name" == "compute" ]]; then name=$name$COMPUTE_COUNTER - args="$args --hostname compute$COMPUTE_COUNTER -P --name $name" + # We need --privileged because we run docker-inside-docker on the compute nodes + args="$args --hostname compute$COMPUTE_COUNTER -P --privileged --name $name" let COMPUTE_COUNTER=$(($COMPUTE_COUNTER + 1)) else args="$args --name $name" @@ -233,6 +234,16 @@ function do_start { start_keep=true fi + if [[ $start_sso != false ]] + then + start_container "$start_sso:443" "sso_server" '' '' "arvados/sso" + fi + + if [[ $start_api != false ]] + then + start_container "$start_api:443" "api_server" '' "sso_server:sso" "arvados/api" + fi + if [[ $start_nameserver != false ]] then # We rely on skydock and skydns for dns discovery between the slurm controller and compute nodes, @@ -246,7 +257,7 @@ function do_start { if [[ "$?" != "0" ]]; then echo "Starting crosbymichael/skydns container..." $DOCKER rm "skydns" 2>/dev/null - $DOCKER run -d -p 172.17.42.1:53:53/udp --name skydns crosbymichael/skydns -nameserver 8.8.8.8:53 -nameserver 8.8.4.4 -domain arvados + $DOCKER run -d -p 172.17.42.1:53:53/udp --name skydns crosbymichael/skydns -nameserver 8.8.8.8:53 -domain arvados fi $DOCKER images | grep skydock >/dev/null if [[ "$?" 
!= "0" ]]; then @@ -261,16 +272,6 @@ function do_start { fi fi - if [[ $start_sso != false ]] - then - start_container "$start_sso:443" "sso_server" '' '' "arvados/sso" - fi - - if [[ $start_api != false ]] - then - start_container "$start_api:443" "api_server" '' "sso_server:sso" "arvados/api" - fi - if [[ $start_compute != false ]] then for i in `seq 0 $(($start_compute - 1))`; do diff --git a/docker/build_tools/Makefile b/docker/build_tools/Makefile index e2fd50d615..fd49b3cf50 100644 --- a/docker/build_tools/Makefile +++ b/docker/build_tools/Makefile @@ -49,12 +49,17 @@ BASE_GENERATED = base/generated/arvados.tar.gz SLURM_GENERATED = slurm/generated/* +COMPUTE_GENERATED = compute/generated/setup.sh + +COMPUTE_GENERATED_IN = compute/setup.sh.in + API_GENERATED = \ api/generated/apache2_vhost \ api/generated/config_databases.sh \ api/generated/database.yml \ api/generated/omniauth.rb \ api/generated/application.yml \ + api/generated/setup.sh \ api/generated/slurm.conf \ api/generated/superuser_token @@ -64,6 +69,7 @@ API_GENERATED_IN = \ api/database.yml.in \ api/omniauth.rb.in \ api/application.yml.in \ + api/setup.sh.in \ api/slurm.conf.in \ api/superuser_token.in @@ -117,6 +123,8 @@ $(API_GENERATED): config.yml $(API_GENERATED_IN) $(WORKBENCH_GENERATED): config.yml $(WORKBENCH_GENERATED_IN) $(CONFIG_RB) +$(COMPUTE_GENERATED): config.yml $(COMPUTE_GENERATED_IN) + $(WAREHOUSE_GENERATED): config.yml $(WAREHOUSE_GENERATED_IN) $(CONFIG_RB) @@ -134,12 +142,14 @@ DOCKER_BUILD = $(DOCKER) build -q --rm=true api-image: passenger-image $(BUILD) $(API_DEPS) mkdir -p api/generated tar -czf api/generated/api.tar.gz -C build/services api + chmod 755 api/generated/setup.sh $(DOCKER_BUILD) -t arvados/api api date >api-image slurm-image: base-image $(SLURM_DEPS) compute-image: slurm-image $(BUILD) $(COMPUTE_DEPS) + chmod 755 compute/generated/setup.sh $(DOCKER_BUILD) -t arvados/compute compute date >compute-image diff --git a/docker/compute/Dockerfile 
b/docker/compute/Dockerfile index 8c403b5a50..4cc02aa824 100644 --- a/docker/compute/Dockerfile +++ b/docker/compute/Dockerfile @@ -3,13 +3,32 @@ FROM arvados/slurm MAINTAINER Ward Vandewege -RUN apt-get update && apt-get -q -y install supervisor +ADD apt.arvados.org.list /etc/apt/sources.list.d/ +RUN apt-key adv --keyserver pgp.mit.edu --recv 1078ECD7 + +RUN apt-get update && apt-get -qqy install supervisor python-pip python-pyvcf python-gflags python-google-api-python-client python-virtualenv libattr1-dev libfuse-dev python-dev python-llfuse fuse crunchstat python-arvados-python-client python-arvados-fuse cron + +ADD fuse.conf /etc/fuse.conf RUN /usr/local/rvm/bin/rvm-exec default gem install arvados-cli arvados +# Install Docker from the Docker Inc. repository +RUN apt-get update -qq && apt-get install -qqy iptables ca-certificates lxc apt-transport-https +RUN echo deb https://get.docker.io/ubuntu docker main > /etc/apt/sources.list.d/docker.list +RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 36A1D7869245C8950F966E92D8576A8BA88D21E9 +RUN apt-get update -qq && apt-get install -qqy lxc-docker + +RUN addgroup --gid 4005 crunch && mkdir /home/crunch && useradd --uid 4005 --gid 4005 crunch && usermod crunch -G fuse,docker && chown crunch:crunch /home/crunch + +# Fix /dev/fuse permissions/ownership +RUN chmod 1660 /dev/fuse && chgrp fuse /dev/fuse + # Supervisor. ADD supervisor.conf /etc/supervisor/conf.d/arvados.conf ADD ssh.sh /usr/local/bin/ssh.sh +ADD generated/setup.sh /usr/local/bin/setup.sh +ADD wrapdocker /usr/local/bin/wrapdocker.sh +VOLUME /var/lib/docker # Start the supervisor. 
CMD ["/usr/bin/supervisord", "-n"] diff --git a/docker/compute/apt.arvados.org.list b/docker/compute/apt.arvados.org.list new file mode 100644 index 0000000000..7eb8716071 --- /dev/null +++ b/docker/compute/apt.arvados.org.list @@ -0,0 +1,2 @@ +# apt.arvados.org +deb http://apt.arvados.org/ wheezy main diff --git a/docker/compute/fuse.conf b/docker/compute/fuse.conf new file mode 100644 index 0000000000..4ed21baae5 --- /dev/null +++ b/docker/compute/fuse.conf @@ -0,0 +1,10 @@ +# Set the maximum number of FUSE mounts allowed to non-root users. +# The default is 1000. +# +#mount_max = 1000 + +# Allow non-root users to specify the 'allow_other' or 'allow_root' +# mount options. +# +user_allow_other + diff --git a/docker/compute/setup.sh.in b/docker/compute/setup.sh.in new file mode 100755 index 0000000000..ade905dc1b --- /dev/null +++ b/docker/compute/setup.sh.in @@ -0,0 +1,18 @@ +#!/bin/bash + +. /etc/profile.d/rvm.sh + +export ARVADOS_API_HOST=api +export ARVADOS_API_HOST_INSECURE=yes +export ARVADOS_API_TOKEN=@@API_SUPERUSER_SECRET@@ + +arv node create --node {} > /tmp/node.json + +UUID=`grep \"uuid\" /tmp//node.json |cut -f4 -d\"` +PING_SECRET=`grep \"ping_secret\" /tmp//node.json |cut -f4 -d\"` + +echo "*/5 * * * * root /usr/bin/curl -k -d ping_secret=$PING_SECRET https://api/arvados/v1/nodes/$UUID/ping" > /etc/cron.d/node_ping + +# Send a ping now +/usr/bin/curl -k -d ping_secret=$PING_SECRET https://api/arvados/v1/nodes/$UUID/ping?ping_secret=$PING_SECRET + diff --git a/docker/compute/supervisor.conf b/docker/compute/supervisor.conf index 6563b547df..af081dfc00 100644 --- a/docker/compute/supervisor.conf +++ b/docker/compute/supervisor.conf @@ -11,4 +11,16 @@ command=/etc/init.d/munge start user=root command=/etc/init.d/slurm-llnl start +[program:cron] +user=root +command=/etc/init.d/cron start + +[program:setup] +user=root +command=/usr/local/bin/setup.sh +startsecs=0 + +[program:docker] +user=root +command=/usr/local/bin/wrapdocker.sh diff --git 
a/docker/compute/wrapdocker b/docker/compute/wrapdocker new file mode 100755 index 0000000000..e714d5bd2d --- /dev/null +++ b/docker/compute/wrapdocker @@ -0,0 +1,90 @@ +#!/bin/bash + +# Borrowed from https://github.com/jpetazzo/dind under Apache2 +# and slightly modified. + +# First, make sure that cgroups are mounted correctly. +CGROUP=/sys/fs/cgroup +: {LOG:=stdio} + +[ -d $CGROUP ] || + mkdir $CGROUP + +mountpoint -q $CGROUP || + mount -n -t tmpfs -o uid=0,gid=0,mode=0755 cgroup $CGROUP || { + echo "Could not make a tmpfs mount. Did you use -privileged?" + exit 1 + } + +if [ -d /sys/kernel/security ] && ! mountpoint -q /sys/kernel/security +then + mount -t securityfs none /sys/kernel/security || { + echo "Could not mount /sys/kernel/security." + echo "AppArmor detection and -privileged mode might break." + } +fi + +# Mount the cgroup hierarchies exactly as they are in the parent system. +for SUBSYS in $(cut -d: -f2 /proc/1/cgroup) +do + [ -d $CGROUP/$SUBSYS ] || mkdir $CGROUP/$SUBSYS + mountpoint -q $CGROUP/$SUBSYS || + mount -n -t cgroup -o $SUBSYS cgroup $CGROUP/$SUBSYS + + # The two following sections address a bug which manifests itself + # by a cryptic "lxc-start: no ns_cgroup option specified" when + # trying to start containers within a container. + # The bug seems to appear when the cgroup hierarchies are not + # mounted on the exact same directories in the host, and in the + # container. + + # Named, control-less cgroups are mounted with "-o name=foo" + # (and appear as such under /proc/<pid>/cgroup) but are usually + # mounted on a directory named "foo" (without the "name=" prefix). + # Systemd and OpenRC (and possibly others) both create such a + # cgroup. To avoid the aforementioned bug, we symlink "foo" to + # "name=foo". This shouldn't have any adverse effect. 
+ echo $SUBSYS | grep -q ^name= && { + NAME=$(echo $SUBSYS | sed s/^name=//) + ln -s $SUBSYS $CGROUP/$NAME + } + + # Likewise, on at least one system, it has been reported that + # systemd would mount the CPU and CPU accounting controllers + # (respectively "cpu" and "cpuacct") with "-o cpuacct,cpu" + # but on a directory called "cpu,cpuacct" (note the inversion + # in the order of the groups). This tries to work around it. + [ $SUBSYS = cpuacct,cpu ] && ln -s $SUBSYS $CGROUP/cpu,cpuacct +done + +# Note: as I write those lines, the LXC userland tools cannot setup +# a "sub-container" properly if the "devices" cgroup is not in its +# own hierarchy. Let's detect this and issue a warning. +grep -q :devices: /proc/1/cgroup || + echo "WARNING: the 'devices' cgroup should be in its own hierarchy." +grep -qw devices /proc/1/cgroup || + echo "WARNING: it looks like the 'devices' cgroup is not mounted." + +# Now, close extraneous file descriptors. +pushd /proc/self/fd >/dev/null +for FD in * +do + case "$FD" in + # Keep stdin/stdout/stderr + [012]) + ;; + # Nuke everything else + *) + eval exec "$FD>&-" + ;; + esac +done +popd >/dev/null + + +# If a pidfile is still around (for example after a container restart), +# delete it so that docker can start. +rm -rf /var/run/docker.pid + +exec docker -d + diff --git a/docker/slurm/Dockerfile b/docker/slurm/Dockerfile index cfd63fc75b..7a60bf66f3 100644 --- a/docker/slurm/Dockerfile +++ b/docker/slurm/Dockerfile @@ -6,6 +6,6 @@ MAINTAINER Ward Vandewege RUN apt-get update && apt-get -q -y install slurm-llnl munge ADD munge.key /etc/munge/ -RUN chown munge:munge /etc/munge/munge.key +RUN chown munge:munge /etc/munge/munge.key && chmod 600 /etc/munge/munge.key ADD generated/slurm.conf /etc/slurm-llnl/ -- 2.30.2