Merge branch 'master' into wtsi-hgi-8087-arv-cli-request-body-from-file
author     radhika <radhika@curoverse.com>
           Wed, 8 Jun 2016 11:23:19 +0000 (07:23 -0400)
committer  radhika <radhika@curoverse.com>
           Wed, 8 Jun 2016 11:23:19 +0000 (07:23 -0400)
282 files changed:
apps/workbench/Gemfile.lock
apps/workbench/app/helpers/application_helper.rb
apps/workbench/app/helpers/pipeline_instances_helper.rb
apps/workbench/app/models/job.rb
apps/workbench/app/models/job_task.rb
apps/workbench/app/models/job_task_work_unit.rb [new file with mode: 0644]
apps/workbench/app/models/job_work_unit.rb [new file with mode: 0644]
apps/workbench/app/models/pipeline_instance.rb
apps/workbench/app/models/pipeline_instance_work_unit.rb [new file with mode: 0644]
apps/workbench/app/models/proxy_work_unit.rb [new file with mode: 0644]
apps/workbench/app/models/work_unit.rb [new file with mode: 0644]
apps/workbench/app/views/jobs/_show_status.html.erb
apps/workbench/app/views/pipeline_instances/_running_component.html.erb
apps/workbench/app/views/pipeline_instances/_show_components.html.erb
apps/workbench/app/views/pipeline_instances/_show_components_json.html.erb
apps/workbench/app/views/work_unit/_component_detail.html.erb [new file with mode: 0644]
apps/workbench/app/views/work_unit/_progress.html.erb [new file with mode: 0644]
apps/workbench/app/views/work_unit/_show_child.html.erb [new file with mode: 0644]
apps/workbench/app/views/work_unit/_show_component.html.erb [new file with mode: 0644]
apps/workbench/test/controllers/pipeline_instances_controller_test.rb
apps/workbench/test/integration/jobs_test.rb
apps/workbench/test/test_helper.rb
apps/workbench/test/unit/work_unit_test.rb [new file with mode: 0644]
backports/python-apache-libcloud/fpm-info.sh [new file with mode: 0644]
backports/python-ciso8601/fpm-info.sh
backports/python-gflags/fpm-info.sh [deleted file]
backports/python-llfuse/fpm-info.sh
backports/python-pycrypto/fpm-info.sh
backports/python-pycurl/fpm-info.sh
build/package-build-dockerfiles/Makefile
build/package-build-dockerfiles/centos6/Dockerfile
build/package-build-dockerfiles/centos7/Dockerfile [new file with mode: 0644]
build/package-build-dockerfiles/debian7/Dockerfile
build/package-build-dockerfiles/debian8/Dockerfile
build/package-build-dockerfiles/ubuntu1204/Dockerfile
build/package-build-dockerfiles/ubuntu1404/Dockerfile
build/package-test-dockerfiles/centos6/Dockerfile
build/package-test-dockerfiles/centos7/Dockerfile [new file with mode: 0644]
build/package-test-dockerfiles/centos7/localrepo.repo [new file with mode: 0644]
build/package-testing/rpm-common-test-packages.sh [new file with mode: 0755]
build/package-testing/test-package-arvados-api-server.sh
build/package-testing/test-package-arvados-sso-server.sh
build/package-testing/test-package-arvados-workbench.sh
build/package-testing/test-packages-centos6.sh [changed from file to symlink]
build/package-testing/test-packages-centos7.sh [new symlink]
build/run-build-docker-jobs-image.sh
build/run-build-packages-one-target.sh
build/run-build-packages-python-and-ruby.sh [new file with mode: 0755]
build/run-build-packages-sso.sh
build/run-build-packages.sh
build/run-build-test-packages-one-target.sh [new file with mode: 0755]
build/run-library.sh
build/run-tests.sh
crunch_scripts/crunchrunner
crunch_scripts/cwl-runner [new file with mode: 0755]
doc/_includes/_note_python27_sc.liquid
doc/api/schema/Job.html.textile.liquid
doc/install/arvbox.html.textile.liquid
doc/install/install-compute-node.html.textile.liquid
doc/install/install-keepstore.html.textile.liquid
doc/install/install-shell-server.html.textile.liquid
docker/build_tools/Makefile
docker/compute/Dockerfile
docker/jobs/Dockerfile
docker/jobs/apt.arvados.org.list
docker/shell/Dockerfile
sdk/cli/arvados-cli.gemspec
sdk/cli/bin/arv
sdk/cli/bin/arv-run-pipeline-instance
sdk/cli/bin/crunch-job
sdk/cli/test/test_arv-collection-create.rb
sdk/cwl/LICENSE-2.0.txt [new file with mode: 0644]
sdk/cwl/MANIFEST.in [new file with mode: 0644]
sdk/cwl/arvados_cwl/__init__.py
sdk/cwl/setup.py
sdk/cwl/test_with_arvbox.sh
sdk/cwl/tests/input/blorp.txt [new file with mode: 0644]
sdk/cwl/tests/matcher.py [new file with mode: 0644]
sdk/cwl/tests/order/empty_order.json [new file with mode: 0644]
sdk/cwl/tests/order/inputs_test_order.json [new file with mode: 0644]
sdk/cwl/tests/submit_test_job.json [new file with mode: 0644]
sdk/cwl/tests/test_job.py
sdk/cwl/tests/test_submit.py [new file with mode: 0644]
sdk/cwl/tests/tool/blub.txt [new file with mode: 0644]
sdk/cwl/tests/tool/submit_tool.cwl [new file with mode: 0644]
sdk/cwl/tests/wf/inputs_test.cwl [new file with mode: 0644]
sdk/cwl/tests/wf/submit_wf.cwl [new file with mode: 0644]
sdk/go/arvados/client.go [new file with mode: 0644]
sdk/go/arvados/client_test.go [new file with mode: 0644]
sdk/go/arvados/collection.go [new file with mode: 0644]
sdk/go/arvados/doc.go [new file with mode: 0644]
sdk/go/arvados/duration.go [new file with mode: 0644]
sdk/go/arvados/keep_block.go [new file with mode: 0644]
sdk/go/arvados/keep_service.go [new file with mode: 0644]
sdk/go/arvados/resource_list.go [new file with mode: 0644]
sdk/go/arvados/resource_list_test.go [new file with mode: 0644]
sdk/go/arvados/user.go [new file with mode: 0644]
sdk/go/arvadosclient/arvadosclient.go
sdk/go/arvadostest/fixtures.go
sdk/go/crunchrunner/crunchrunner.go
sdk/go/dispatch/dispatch.go [new file with mode: 0644]
sdk/go/httpserver/request_limiter.go [new file with mode: 0644]
sdk/go/httpserver/request_limiter_test.go [new file with mode: 0644]
sdk/go/httpserver/responsewriter.go
sdk/go/keepclient/collectionreader.go
sdk/go/keepclient/collectionreader_test.go
sdk/go/keepclient/perms.go
sdk/go/keepclient/perms_test.go
sdk/go/manifest/manifest.go
sdk/go/streamer/transfer.go
sdk/python/arvados/arvfile.py
sdk/python/arvados/commands/arv_copy.py
sdk/python/arvados/commands/keepdocker.py
sdk/python/arvados/commands/put.py
sdk/python/arvados/commands/run.py
sdk/python/arvados/events.py
sdk/python/arvados/keep.py
sdk/python/arvados/retry.py
sdk/python/setup.py
sdk/python/tests/arvados_testutil.py
sdk/python/tests/keepstub.py
sdk/python/tests/slow_test.py [new file with mode: 0644]
sdk/python/tests/test_api.py
sdk/python/tests/test_arv_put.py
sdk/python/tests/test_events.py [new file with mode: 0644]
sdk/python/tests/test_keep_client.py
sdk/python/tests/test_retry.py
sdk/python/tests/test_stream.py
sdk/python/tests/test_websockets.py [deleted file]
sdk/ruby/arvados.gemspec
sdk/ruby/lib/arvados.rb
sdk/ruby/lib/arvados/collection.rb
sdk/ruby/lib/arvados/keep.rb
sdk/ruby/test/test_keep_manifest.rb
services/api/Gemfile
services/api/Gemfile.lock
services/api/app/controllers/application_controller.rb
services/api/app/controllers/arvados/v1/api_client_authorizations_controller.rb
services/api/app/controllers/arvados/v1/containers_controller.rb
services/api/app/controllers/arvados/v1/jobs_controller.rb
services/api/app/middlewares/arvados_api_token.rb
services/api/app/models/blob.rb
services/api/app/models/container.rb
services/api/app/models/container_request.rb
services/api/app/models/group.rb
services/api/app/models/job.rb
services/api/app/models/link.rb
services/api/app/models/user.rb
services/api/config/application.default.yml
services/api/config/initializers/fix_www_decode.rb [new file with mode: 0644]
services/api/config/routes.rb
services/api/db/migrate/20160324144017_add_components_to_job.rb [new file with mode: 0644]
services/api/db/migrate/20160506175108_add_auths_to_container.rb [new file with mode: 0644]
services/api/db/migrate/20160509143250_add_auth_and_lock_to_container_index.rb [new file with mode: 0644]
services/api/db/structure.sql
services/api/lib/current_api_client.rb
services/api/lib/eventbus.rb
services/api/lib/load_param.rb
services/api/lib/record_filters.rb
services/api/lib/whitelist_update.rb
services/api/script/permission-updater.rb [new file with mode: 0755]
services/api/test/fixtures/api_client_authorizations.yml
services/api/test/fixtures/container_requests.yml [new file with mode: 0644]
services/api/test/fixtures/containers.yml
services/api/test/fixtures/job_tasks.yml [new file with mode: 0644]
services/api/test/fixtures/jobs.yml
services/api/test/fixtures/links.yml
services/api/test/fixtures/pipeline_instances.yml
services/api/test/fixtures/users.yml
services/api/test/functional/arvados/v1/api_client_authorizations_controller_test.rb
services/api/test/functional/arvados/v1/containers_controller_test.rb [new file with mode: 0644]
services/api/test/functional/arvados/v1/groups_controller_test.rb
services/api/test/functional/arvados/v1/jobs_controller_test.rb
services/api/test/helpers/time_block.rb
services/api/test/integration/collections_performance_test.rb
services/api/test/integration/database_reset_test.rb
services/api/test/integration/websocket_test.rb
services/api/test/test_helper.rb
services/api/test/unit/blob_test.rb
services/api/test/unit/collection_performance_test.rb
services/api/test/unit/container_request_test.rb
services/api/test/unit/container_test.rb
services/api/test/unit/job_test.rb
services/crunch-dispatch-local/crunch-dispatch-local.go
services/crunch-dispatch-local/crunch-dispatch-local_test.go
services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
services/crunch-dispatch-slurm/crunch-finish-slurm.sh [deleted file]
services/crunch-dispatch-slurm/squeue.go [new file with mode: 0644]
services/crunch-run/crunchrun.go
services/crunch-run/crunchrun_test.go
services/crunch-run/logging_test.go
services/datamanager/collection/collection.go
services/datamanager/loggerutil/loggerutil.go
services/datamanager/summary/pull_list.go
services/dockercleaner/setup.py
services/fuse/arvados_fuse/fusedir.py
services/fuse/arvados_fuse/fusefile.py
services/fuse/setup.py
services/fuse/tests/mount_test_base.py
services/fuse/tests/performance/test_collection_performance.py
services/fuse/tests/slow_test.py [new symlink]
services/fuse/tests/test_cache.py [new file with mode: 0644]
services/fuse/tests/test_mount.py
services/keep-balance/balance.go [new file with mode: 0644]
services/keep-balance/balance_run_test.go [new file with mode: 0644]
services/keep-balance/balance_test.go [new file with mode: 0644]
services/keep-balance/block_state.go [new file with mode: 0644]
services/keep-balance/change_set.go [new file with mode: 0644]
services/keep-balance/change_set_test.go [new file with mode: 0644]
services/keep-balance/collection.go [new file with mode: 0644]
services/keep-balance/integration_test.go [new file with mode: 0644]
services/keep-balance/keep_service.go [new file with mode: 0644]
services/keep-balance/main.go [new file with mode: 0644]
services/keep-balance/main_test.go [new file with mode: 0644]
services/keep-balance/time_me.go [new file with mode: 0644]
services/keep-balance/usage.go [new file with mode: 0644]
services/keep-web/handler.go
services/keep-web/server_test.go
services/keepproxy/keepproxy.go
services/keepstore/azure_blob_volume.go
services/keepstore/azure_blob_volume_test.go
services/keepstore/handler_test.go
services/keepstore/handlers.go
services/keepstore/handlers_with_generic_volume_test.go
services/keepstore/keepstore.go
services/keepstore/keepstore_test.go
services/keepstore/logging_router.go
services/keepstore/logging_router_test.go [new file with mode: 0644]
services/keepstore/perms.go
services/keepstore/perms_test.go
services/keepstore/s3_volume.go
services/keepstore/trash_worker_test.go
services/keepstore/volume.go
services/keepstore/volume_generic_test.go
services/keepstore/volume_test.go
services/keepstore/volume_unix.go
services/keepstore/volume_unix_test.go
services/nodemanager/arvnodeman/baseactor.py
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
services/nodemanager/arvnodeman/computenode/dispatch/transitions.py [new file with mode: 0644]
services/nodemanager/arvnodeman/computenode/driver/__init__.py
services/nodemanager/arvnodeman/computenode/driver/azure.py
services/nodemanager/arvnodeman/computenode/driver/ec2.py
services/nodemanager/arvnodeman/computenode/driver/gce.py
services/nodemanager/arvnodeman/config.py
services/nodemanager/arvnodeman/daemon.py
services/nodemanager/arvnodeman/launcher.py
services/nodemanager/tests/test_computenode_dispatch.py
services/nodemanager/tests/test_computenode_dispatch_slurm.py
services/nodemanager/tests/test_computenode_driver.py [new file with mode: 0644]
services/nodemanager/tests/test_computenode_driver_azure.py
services/nodemanager/tests/test_computenode_driver_gce.py
services/nodemanager/tests/test_daemon.py
services/nodemanager/tests/test_failure.py
services/nodemanager/tests/testutil.py
tools/arvbox/bin/arvbox
tools/arvbox/lib/arvbox/docker/Dockerfile.base
tools/arvbox/lib/arvbox/docker/Dockerfile.demo
tools/arvbox/lib/arvbox/docker/api-setup.sh [new file with mode: 0755]
tools/arvbox/lib/arvbox/docker/common.sh
tools/arvbox/lib/arvbox/docker/createusers.sh
tools/arvbox/lib/arvbox/docker/crunch-setup.sh
tools/arvbox/lib/arvbox/docker/keep-setup.sh
tools/arvbox/lib/arvbox/docker/service/api/run-service
tools/arvbox/lib/arvbox/docker/service/arv-git-httpd/run-service
tools/arvbox/lib/arvbox/docker/service/crunch-dispatch-local/run-service
tools/arvbox/lib/arvbox/docker/service/keep-web/run-service
tools/arvbox/lib/arvbox/docker/service/keepproxy/run-service
tools/arvbox/lib/arvbox/docker/service/sdk/run-service
tools/arvbox/lib/arvbox/docker/service/websockets/log/main/.gitstub [new file with mode: 0644]
tools/arvbox/lib/arvbox/docker/service/websockets/log/run [new symlink]
tools/arvbox/lib/arvbox/docker/service/websockets/run [new symlink]
tools/arvbox/lib/arvbox/docker/service/websockets/run-service [new file with mode: 0755]
tools/crunchstat-summary/crunchstat_summary/reader.py
tools/crunchstat-summary/crunchstat_summary/summarizer.py
tools/keep-block-check/.gitignore [new file with mode: 0644]
tools/keep-block-check/keep-block-check.go [new file with mode: 0644]
tools/keep-block-check/keep-block-check_test.go [new file with mode: 0644]
tools/keep-rsync/keep-rsync.go
tools/keep-rsync/keep-rsync_test.go

index b4e2400beda11a9186dd5f4c04468638f237a517..fdcd375ed2081b0d1a770900c3d0dde5ccae67d2 100644 (file)
@@ -37,16 +37,17 @@ GEM
       minitest (~> 5.1)
       thread_safe (~> 0.1)
       tzinfo (~> 1.1)
-    addressable (2.3.6)
+    addressable (2.4.0)
     andand (1.3.3)
     angularjs-rails (1.3.8)
     arel (5.0.1.20140414130214)
-    arvados (0.1.20150511150219)
-      activesupport (>= 3.2.13)
+    arvados (0.1.20160420143004)
+      activesupport (>= 3, < 4.2.6)
       andand (~> 1.3, >= 1.3.3)
-      google-api-client (~> 0.6.3, >= 0.6.3)
+      google-api-client (>= 0.7, < 0.9)
+      i18n (~> 0)
       json (~> 1.7, >= 1.7.7)
-      jwt (>= 0.1.5, < 1.0.0)
+      jwt (>= 0.1.5, < 2)
     autoparse (0.3.3)
       addressable (>= 2.3.1)
       extlib (>= 0.9.15)
@@ -93,24 +94,33 @@ GEM
     erubis (2.7.0)
     execjs (2.2.2)
     extlib (0.9.16)
-    faraday (0.8.9)
-      multipart-post (~> 1.2.0)
+    faraday (0.9.2)
+      multipart-post (>= 1.2, < 3)
     fast_stack (0.1.0)
       rake
       rake-compiler
     ffi (1.9.10)
     flamegraph (0.1.0)
       fast_stack
-    google-api-client (0.6.4)
-      addressable (>= 2.3.2)
-      autoparse (>= 0.3.3)
-      extlib (>= 0.9.15)
-      faraday (~> 0.8.4)
-      jwt (>= 0.1.5)
-      launchy (>= 2.1.1)
-      multi_json (>= 1.0.0)
-      signet (~> 0.4.5)
-      uuidtools (>= 2.1.0)
+    google-api-client (0.8.6)
+      activesupport (>= 3.2)
+      addressable (~> 2.3)
+      autoparse (~> 0.3)
+      extlib (~> 0.9)
+      faraday (~> 0.9)
+      googleauth (~> 0.3)
+      launchy (~> 2.4)
+      multi_json (~> 1.10)
+      retriable (~> 1.4)
+      signet (~> 0.6)
+    googleauth (0.5.1)
+      faraday (~> 0.9)
+      jwt (~> 1.4)
+      logging (~> 2.0)
+      memoist (~> 0.12)
+      multi_json (~> 1.11)
+      os (~> 0.9)
+      signet (~> 0.7)
     headless (1.0.2)
     highline (1.6.21)
     httpclient (2.6.0.1)
@@ -119,8 +129,7 @@ GEM
       railties (>= 3.0, < 5.0)
       thor (>= 0.14, < 2.0)
     json (1.8.3)
-    jwt (0.1.13)
-      multi_json (>= 1.5)
+    jwt (1.5.4)
     launchy (2.4.3)
       addressable (~> 2.3)
     less (2.6.0)
@@ -129,18 +138,23 @@ GEM
       actionpack (>= 3.1)
       less (~> 2.6.0)
     libv8 (3.16.14.7)
+    little-plugger (1.1.4)
+    logging (2.1.0)
+      little-plugger (~> 1.1)
+      multi_json (~> 1.10)
     mail (2.6.3)
       mime-types (>= 1.16, < 3)
+    memoist (0.14.0)
     metaclass (0.0.4)
     mime-types (2.99)
     mini_portile (0.6.2)
-    minitest (5.7.0)
+    minitest (5.8.4)
     mocha (1.1.0)
       metaclass (~> 0.0.1)
     morrisjs-rails (0.5.1)
       railties (> 3.1, < 5)
-    multi_json (1.11.2)
-    multipart-post (1.2.0)
+    multi_json (1.12.0)
+    multipart-post (2.0.0)
     net-scp (1.2.1)
       net-ssh (>= 2.6.5)
     net-sftp (2.1.2)
@@ -151,6 +165,7 @@ GEM
     nokogiri (1.6.6.4)
       mini_portile (~> 0.6.0)
     oj (2.11.2)
+    os (0.9.6)
     passenger (4.0.57)
       daemon_controller (>= 1.2.0)
       rack
@@ -190,6 +205,7 @@ GEM
       rake
     raphael-rails (2.1.2)
     ref (1.0.5)
+    retriable (1.4.1)
     ruby-debug-passenger (0.2.0)
     ruby-prof (0.15.2)
     rubyzip (1.1.7)
@@ -207,11 +223,11 @@ GEM
       multi_json (~> 1.0)
       rubyzip (~> 1.0)
       websocket (~> 1.0)
-    signet (0.4.5)
-      addressable (>= 2.2.3)
-      faraday (~> 0.8.1)
-      jwt (>= 0.1.5)
-      multi_json (>= 1.0.0)
+    signet (0.7.2)
+      addressable (~> 2.3)
+      faraday (~> 0.9)
+      jwt (~> 1.5)
+      multi_json (~> 1.10)
     simplecov (0.9.1)
       docile (~> 1.1.0)
       multi_json (~> 1.0)
@@ -238,7 +254,6 @@ GEM
     uglifier (2.7.0)
       execjs (>= 0.3.0)
       json (>= 1.8.0)
-    uuidtools (2.1.5)
     websocket (1.2.2)
     websocket-driver (0.5.1)
       websocket-extensions (>= 0.1.0)
@@ -294,3 +309,6 @@ DEPENDENCIES
   therubyracer
   uglifier (>= 1.0.3)
   wiselinks
+
+BUNDLED WITH
+   1.12.1
index 14b1c34d11e0d45821a1929c00254038c36ba50f..a37ecda7041c99ff820b59bd7874d5e8f5b7e9e4 100644 (file)
@@ -498,6 +498,10 @@ module ApplicationHelper
     raw("<span class='utc-date' data-utc-date='#{date}' data-utc-date-opts='noseconds'>#{date}</span>")
   end
 
+  def render_time duration, use_words, round_to_min=true
+    render_runtime duration, use_words, round_to_min
+  end
+
 private
   def is_textile?( object, attr )
     is_textile = object.textile_attributes.andand.include?(attr)
index 8fafbc2022d5873032d1f9565c2385a26f4a794b..1ed9e3ed6b6265aab919481f2d8a9920296b08b1 100644 (file)
@@ -69,9 +69,8 @@ module PipelineInstancesHelper
   def determine_wallclock_runtime jobs
     timestamps = []
     jobs.each do |j|
-      insert_at = 0
-      started_at = j[:started_at]
-      finished_at = (if j[:finished_at] then j[:finished_at] else Time.now end)
+      started_at = (j.started_at if j.respond_to?(:started_at)) || (j[:started_at] if j.is_a?(Hash))
+      finished_at = (j.finished_at if j.respond_to?(:finished_at)) || (j[:finished_at] if j.is_a?(Hash)) || Time.now
       if started_at
         timestamps = merge_range timestamps, started_at, finished_at
       end
index 6566aeb7cd5f82c4aa9e8f16a88b4d287f1afc24..73f1f63be4c7d5dcb5fa33e390d722c87b16e0b0 100644 (file)
@@ -53,4 +53,8 @@ class Job < ArvadosBase
     stderr_log_query(limit).results.reverse.
       flat_map { |log| log.properties[:text].split("\n") rescue [] }
   end
+
+  def work_unit(label=nil)
+    JobWorkUnit.new(self, label)
+  end
 end
index 15fc7fdae39a8fc30d9a4750e4ed022dcc6bd936..9fb04737badb1114547c0a6a581f71e9b2bbd459 100644 (file)
@@ -1,2 +1,5 @@
 class JobTask < ArvadosBase
+  def work_unit(label=nil)
+    JobTaskWorkUnit.new(self, label)
+  end
 end
diff --git a/apps/workbench/app/models/job_task_work_unit.rb b/apps/workbench/app/models/job_task_work_unit.rb
new file mode 100644 (file)
index 0000000..47d53ca
--- /dev/null
@@ -0,0 +1,5 @@
+class JobTaskWorkUnit < ProxyWorkUnit
+  def title
+    "job task"
+  end
+end
diff --git a/apps/workbench/app/models/job_work_unit.rb b/apps/workbench/app/models/job_work_unit.rb
new file mode 100644 (file)
index 0000000..a0a7c87
--- /dev/null
@@ -0,0 +1,90 @@
+class JobWorkUnit < ProxyWorkUnit
+  def children
+    return @my_children if @my_children
+
+    # Jobs components
+    items = []
+    components = get(:components)
+    uuids = components.andand.collect {|_, v| v}
+    return items if (!uuids or uuids.empty?)
+
+    rcs = {}
+    uuids.each do |u|
+      r = ArvadosBase::resource_class_for_uuid(u)
+      rcs[r] = [] unless rcs[r]
+      rcs[r] << u
+    end
+    rcs.each do |rc, ids|
+      rc.where(uuid: ids).each do |obj|
+        items << obj.work_unit(components.key(obj.uuid))
+      end
+    end
+
+    @my_children = items
+  end
+
+  def child_summary
+    if children.any?
+      super
+    else
+      get(:tasks_summary)
+    end
+  end
+
+  def parameters
+    get(:script_parameters)
+  end
+
+  def repository
+    get(:repository)
+  end
+
+  def script
+    get(:script)
+  end
+
+  def script_version
+    get(:script_version)
+  end
+
+  def supplied_script_version
+    get(:supplied_script_version)
+  end
+
+  def docker_image
+    get(:docker_image_locator)
+  end
+
+  def nondeterministic
+    get(:nondeterministic)
+  end
+
+  def runtime_constraints
+    get(:runtime_constraints)
+  end
+
+  def priority
+    get(:priority)
+  end
+
+  def log_collection
+    get(:log)
+  end
+
+  def output
+    get(:output)
+  end
+
+  def can_cancel?
+    state_label.in? ["Queued", "Running"]
+  end
+
+  def uri
+    uuid = get(:uuid)
+    "/jobs/#{uuid}"
+  end
+
+  def title
+    "job"
+  end
+end
index 6e556d5b75cae13626aad29d1c0a816a5a78b583..b51f07c40b36e0324b09e14c81dbf632dbcd7a68 100644 (file)
@@ -132,6 +132,10 @@ class PipelineInstance < ArvadosBase
     end
   end
 
+  def work_unit(label=nil)
+    PipelineInstanceWorkUnit.new(self, label || self.name)
+  end
+
   private
 
   def components_map
diff --git a/apps/workbench/app/models/pipeline_instance_work_unit.rb b/apps/workbench/app/models/pipeline_instance_work_unit.rb
new file mode 100644 (file)
index 0000000..bc2b3e7
--- /dev/null
@@ -0,0 +1,43 @@
+class PipelineInstanceWorkUnit < ProxyWorkUnit
+  def children
+    return @my_children if @my_children
+
+    items = []
+
+    jobs = {}
+    results = Job.where(uuid: @proxied.job_ids.values).results
+    results.each do |j|
+      jobs[j.uuid] = j
+    end
+
+    components = get(:components)
+    components.each do |name, c|
+      if c.is_a?(Hash)
+        job = c[:job]
+        if job
+          if job[:uuid] and jobs[job[:uuid]]
+            items << jobs[job[:uuid]].work_unit(name)
+          else
+            items << JobWorkUnit.new(job, name)
+          end
+        else
+          items << JobWorkUnit.new(c, name)
+        end
+      else
+        @unreadable_children = true
+        break
+      end
+    end
+
+    @my_children = items
+  end
+
+  def uri
+    uuid = get(:uuid)
+    "/pipeline_instances/#{uuid}"
+  end
+
+  def title
+    "pipeline"
+  end
+end
diff --git a/apps/workbench/app/models/proxy_work_unit.rb b/apps/workbench/app/models/proxy_work_unit.rb
new file mode 100644 (file)
index 0000000..7cf0a1b
--- /dev/null
@@ -0,0 +1,324 @@
+class ProxyWorkUnit < WorkUnit
+  require 'time'
+
+  attr_accessor :lbl
+  attr_accessor :proxied
+  attr_accessor :my_children
+  attr_accessor :unreadable_children
+
+  def initialize proxied, label
+    @lbl = label
+    @proxied = proxied
+  end
+
+  def label
+    @lbl
+  end
+
+  def uuid
+    get(:uuid)
+  end
+
+  def modified_by_user_uuid
+    get(:modified_by_user_uuid)
+  end
+
+  def created_at
+    t = get(:created_at)
+    t = Time.parse(t) if (t.andand.class == String)
+    t
+  end
+
+  def started_at
+    t = get(:started_at)
+    t = Time.parse(t) if (t.andand.class == String)
+    t
+  end
+
+  def finished_at
+    t = get(:finished_at)
+    t = Time.parse(t) if (t.andand.class == String)
+    t
+  end
+
+  def state_label
+    state = get(:state)
+    if ["Running", "RunningOnServer", "RunningOnClient"].include? state
+      "Running"
+    else
+      state
+    end
+  end
+
+  def state_bootstrap_class
+    state = get(:state)
+    case state
+    when 'Complete'
+      'success'
+    when 'Failed', 'Cancelled'
+      'danger'
+    when 'Running', 'RunningOnServer', 'RunningOnClient'
+      'info'
+    else
+      'default'
+    end
+  end
+
+  def success?
+    state = get(:state)
+    if state == 'Complete'
+      true
+    elsif state == 'Failed' or state == 'Cancelled'
+      false
+    else
+      nil
+    end
+  end
+
+  def child_summary
+    done = 0
+    failed = 0
+    todo = 0
+    running = 0
+    children.each do |c|
+      case c.state_label
+      when 'Complete'
+        done = done+1
+      when 'Failed', 'Cancelled'
+        failed = failed+1
+      when 'Running'
+        running = running+1
+      else
+        todo = todo+1
+      end
+    end
+
+    summary = {}
+    summary[:done] = done
+    summary[:failed] = failed
+    summary[:todo] = todo
+    summary[:running] = running
+    summary
+  end
+
+  def child_summary_str
+    summary = child_summary
+    summary_txt = ''
+
+    if state_label == 'Running'
+      done = summary[:done] || 0
+      running = summary[:running] || 0
+      failed = summary[:failed] || 0
+      todo = summary[:todo] || 0
+      total = done + running + failed + todo
+
+      if total > 0
+        summary_txt += "#{summary[:done]} #{'child'.pluralize(summary[:done])} done,"
+        summary_txt += "#{summary[:failed]} failed,"
+        summary_txt += "#{summary[:running]} running,"
+        summary_txt += "#{summary[:todo]} pending"
+      end
+    end
+    summary_txt
+  end
+
+  def progress
+    state = get(:state)
+    if state == 'Complete'
+      return 1.0
+    elsif state == 'Failed' or state == 'Cancelled'
+      return 0.0
+    end
+
+    summary = child_summary
+    return 0.0 if summary.nil?
+
+    done = summary[:done] || 0
+    running = summary[:running] || 0
+    failed = summary[:failed] || 0
+    todo = summary[:todo] || 0
+    total = done + running + failed + todo
+    if total > 0
+      (done+failed).to_f / total
+    else
+      0.0
+    end
+  end
+
+  def children
+    []
+  end
+
+  def title
+    "process"
+  end
+
+  def has_unreadable_children
+    @unreadable_children
+  end
+
+  def readable?
+    resource_class = ArvadosBase::resource_class_for_uuid(uuid)
+    resource_class.where(uuid: [uuid]).first rescue nil
+  end
+
+  def link_to_log
+    if state_label.in? ["Complete", "Failed", "Cancelled"]
+      lc = log_collection
+      if lc
+        logCollection = Collection.find? lc
+        if logCollection
+          ApplicationController.helpers.link_to("Log", "#{uri}#Log")
+        else
+          "Log unavailable"
+        end
+      end
+    elsif state_label == "Running"
+      if readable?
+        ApplicationController.helpers.link_to("Log", "#{uri}#Log")
+      else
+        "Log unavailable"
+      end
+    end
+  end
+
+  def walltime
+    if state_label != "Queued"
+      if started_at
+        ((if finished_at then finished_at else Time.now() end) - started_at)
+      end
+    end
+  end
+
+  def cputime
+    if state_label != "Queued"
+      if started_at
+        (runtime_constraints.andand[:min_nodes] || 1) * ((finished_at || Time.now()) - started_at)
+      end
+    end
+  end
+
+  def queuedtime
+    if state_label == "Queued"
+      Time.now - Time.parse(created_at.to_s)
+    end
+  end
+
+  def is_running?
+    state_label == 'Running'
+  end
+
+  def is_paused?
+    state_label == 'Paused'
+  end
+
+  def is_finished?
+    state_label.in? ["Complete", "Failed", "Cancelled"]
+  end
+
+  def is_failed?
+    state_label == 'Failed'
+  end
+
+  def show_runtime
+    runningtime = ApplicationController.helpers.determine_wallclock_runtime(if children.any? then children else [self] end)
+
+    walltime = 0
+    if started_at
+      walltime = if finished_at then (finished_at - started_at) else (Time.now - started_at) end
+    end
+
+    resp = '<p>'
+
+    if started_at
+      resp << "This #{title} started at "
+      resp << ApplicationController.helpers.render_localized_date(started_at)
+      resp << ". It "
+      if state_label == 'Complete'
+        resp << "completed in "
+      elsif state_label == 'Failed'
+         resp << "failed after "
+      else
+        resp << "has been active for "
+      end
+
+      if walltime > runningtime
+        resp << ApplicationController.helpers.render_time(walltime, false)
+      else
+       resp << ApplicationController.helpers.render_time(runningtime, false)
+      end
+
+      if finished_at
+        resp << " at "
+        resp << ApplicationController.helpers.render_localized_date(finished_at)
+      end
+      resp << "."
+    else
+      if state_label
+        resp << "This #{title} is "
+        resp << if state_label == 'Running' then 'active' else state_label.downcase end
+        resp << "."
+      end
+    end
+
+    if is_failed?
+      resp << " Check the Log tab for more detail about why it failed."
+    end
+    resp << "</p>"
+
+    resp << "<p>"
+    if state_label
+      resp << "It "
+      if state_label == 'Running'
+        resp << "has run"
+      else
+        resp << "ran"
+      end
+      resp << " for "
+
+      cpu_time = 0
+      if children.any?
+        cpu_time = children.map { |c|
+          if c.started_at
+             (c.runtime_constraints.andand[:min_nodes] || 1) * ((c.finished_at || Time.now()) - c.started_at)
+          else
+            0
+          end
+        }.reduce(:+) || 0
+      else
+        if started_at
+          cpu_time = (runtime_constraints.andand[:min_nodes] || 1) * ((finished_at || Time.now()) - started_at)
+        end
+      end
+
+      resp << ApplicationController.helpers.render_time(runningtime, false)
+      if (walltime - runningtime) > 0
+        resp << "("
+        resp << ApplicationController.helpers.render_time(walltime - runningtime, false)
+        resp << "queued)"
+      end
+      if cpu_time == 0
+        resp << "."
+      else
+        resp << " and used "
+        resp << ApplicationController.helpers.render_time(cpu_time, false)
+        resp << " of node allocation time ("
+        resp << (cpu_time/runningtime).round(1).to_s
+        resp << "&Cross; scaling)."
+      end
+    end
+    resp << "</p>"
+
+    resp
+  end
+
+  protected
+
+  def get key
+    if @proxied.respond_to? key
+      @proxied.send(key)
+    elsif @proxied.is_a?(Hash)
+      @proxied[key]
+    end
+  end
+end
diff --git a/apps/workbench/app/models/work_unit.rb b/apps/workbench/app/models/work_unit.rb
new file mode 100644 (file)
index 0000000..8da6d92
--- /dev/null
@@ -0,0 +1,154 @@
+class WorkUnit
+  # This is an abstract class that documents the WorkUnit interface
+
+  def label
+    # returns the label that was assigned when creating the work unit
+  end
+
+  def uuid
+    # returns the arvados UUID of the underlying object
+  end
+
+  def children
+    # returns an array of child work units
+  end
+
+  def modified_by_user_uuid
+    # returns uuid of the user who modified this work unit most recently
+  end
+
+  def created_at
+    # returns created_at timestamp
+  end
+
+  def started_at
+    # returns started_at timestamp for this work unit
+  end
+
+  def finished_at
+    # returns finished_at timestamp
+  end
+
+  def state_label
+    # returns a string representing state of the work unit
+  end
+
+  def state_bootstrap_class
+    # returns a class like "danger", "success", or "warning" that a view can use directly to make a display class
+  end
+
+  def success?
+    # returns true if the work unit finished successfully,
+    # false if it has a permanent failure,
+    # and nil if the final state is not determined.
+  end
+
+  def progress
+    # returns a number between 0 and 1
+  end
+
+  def log_collection
+    # returns uuid or pdh with saved log data, if any
+  end
+
+  def parameters
+    # returns work unit parameters, if any
+  end
+
+  def script
+    # returns script for this work unit, if any
+  end
+
+  def repository
+    # returns this work unit's script repository, if any
+  end
+
+  def script_version
+    # returns this work unit's script_version, if any
+  end
+
+  def supplied_script_version
+    # returns this work unit's supplied_script_version, if any
+  end
+
+  def docker_image
+    # returns this work unit's docker_image, if any
+  end
+
+  def runtime_constraints
+    # returns this work unit's runtime_constraints, if any
+  end
+
+  def priority
+    # returns this work unit's priority, if any
+  end
+
+  def nondeterministic
+    # returns if this is nondeterministic
+  end
+
+  def output
+    # returns uuid or pdh of output data, if any
+  end
+
+  def child_summary
+    # summary status of any children of this work unit
+  end
+
+  def child_summary_str
+    # textual representation of child summary
+  end
+
+  def can_cancel?
+    # returns true if this work unit can be canceled
+  end
+
+  def readable?
+    # is the proxied object readable by current user?
+  end
+
+  def uri
+    # returns the uri for this work unit
+  end
+
+  def title
+    # title for the work unit
+  end
+
+  def has_unreadable_children
+    # returns true if this work unit has children that are not readable by the current user
+  end
+
+  # view helper methods
+  def link_to_log
+    # display a link to log if present
+  end
+
+  def walltime
+    # return walltime for a running or completed work unit
+  end
+
+  def cputime
+    # return cputime for a running or completed work unit
+  end
+
+  def queuedtime
+    # return queued time if the work unit is queued
+  end
+
+  def is_running?
+    # is the work unit in running state?
+  end
+
+  def is_paused?
+    # is the work unit in paused state?
+  end
+
+  def is_finished?
+    # is the work unit in finished state?
+  end
+
+  def is_failed?
+    # is this work unit in failed state?
+  end
+end
index 807520940c9218473f01935f732556afa6cd373a..8d54b20cfd341398f9cc43377721f73f486fe606 100644 (file)
@@ -8,8 +8,7 @@
    pj[:progress_bar] = render(partial: "job_progress",
                               locals: {:j => @object })
    tasks = JobTask.filter([['job_uuid', '=', @object.uuid]]).results
-   render(partial: 'pipeline_instances/running_component',
-          locals: { tasks: tasks, pj: pj, i: 0, expanded: true})
+   render(partial: 'work_unit/show_component', locals: {wu: @object.work_unit(@object[:name] || "this job")})
 %>
 
 <div class="panel panel-default">
index d4e0944b5c510e7bcd4216a4f9808090fea834c8..ded535ef3ad5109e81a33ea1fd9815cde8ac6905 100644 (file)
             <div class="col-md-6">
               <% queuetime = Time.now - Time.parse(current_job[:created_at].to_s) %>
               Queued for <%= render_runtime(queuetime, false) %>.
-              <% begin %>
-                <% if current_job[:queue_position] == 0 %>
-                  This job is next in the queue to run.
-                <% elsif current_job[:queue_position] == 1 %>
-                  There is 1 job in the queue ahead of this one.
-                <% elsif current_job[:queue_position] %>
-                  There are <%= current_job[:queue_position] %> jobs in the queue ahead of this one.
-                <% end %>
-              <% rescue %>
-              <% end %>
             </div>
           <% elsif current_job[:state] == "Running" %>
             <%# column offset 8 %>
index dae57aa0e85ebbe3ccf7ce183a185dcc860b3f0b..4196558b3c07570c2d0e84b059cba20145b4fa41 100644 (file)
@@ -9,7 +9,7 @@
        data-object-uuids="<%= @object.uuid %> <%= job_uuids.join(' ') %>"
        ></div>
 
-  <%= render_pipeline_components("running", :json) %>
+  <%= render partial: 'work_unit/show_component', locals: {wu: @object.work_unit(@object.name)} %>
 
 <% else %>
   <%# state is either New or Ready %>
index 9d1edbf264ff34a7d0c139aa2d7df8c3f3a9486e..6486f73730aba2649c17cbb58626e617b1baa188 100644 (file)
@@ -15,6 +15,7 @@
           <pre><%= Oj.dump(@object.components, indent: 2) %></pre>
         </div>
       </div>
+      <% if backtrace %>
       <div class="panel-heading">
         <h4 class="panel-title">
           <a data-toggle="collapse" data-parent="#components-accordion" href="#components-backtrace">
@@ -27,4 +28,5 @@
           <pre><%= backtrace %></pre>
         </div>
       </div>
+      <% end %>
     </div>
diff --git a/apps/workbench/app/views/work_unit/_component_detail.html.erb b/apps/workbench/app/views/work_unit/_component_detail.html.erb
new file mode 100644 (file)
index 0000000..38e1b5b
--- /dev/null
@@ -0,0 +1,97 @@
+      <div class="container">
+        <div class="row">
+          <div class="col-md-5">
+            <% if current_obj.uuid.nil? %>
+              No <%= current_obj.title %> has been submitted yet.
+            <% else %>
+            <table>
+              <% [:uuid, :modified_by_user_uuid, :created_at, :started_at, :finished_at, :output, :priority].each do |k| %>
+                <% val = current_obj.send(k) if current_obj.respond_to?(k) %>
+                <% if val %>
+                <tr>
+                  <td style="padding-right: 1em">
+                    <%= k.to_s %>:
+                  </td>
+                  <td>
+                    <% if k == :uuid %>
+                      <%= link_to_arvados_object_if_readable(val, val, link_text: val) %>
+                    <% elsif k.to_s.end_with? 'uuid' %>
+                      <%= link_to_arvados_object_if_readable(val, val, friendly_name: true) %>
+                    <% elsif k.to_s.end_with? '_at' %>
+                      <%= render_localized_date(val) %>
+                    <% elsif k == :output %>
+                      <%= link_to_arvados_object_if_readable(val, 'Output data not available', friendly_name: true) %>
+                    <% else %>
+                      <%= val %>
+                    <% end %>
+                  </td>
+                </tr>
+                <% end %>
+              <% end %>
+            </table>
+            <% end %>
+          </div>
+          <div class="col-md-6">
+            <table>
+              <% # link to repo tree/file only if the repo is readable
+                 # and the commit is a sha1...
+                 repo =
+                 (/^[0-9a-f]{40}$/ =~ current_obj.script_version and
+                 Repository.where(name: current_obj.repository).first)
+
+                 # ...and the api server provides an http:// or https:// url
+                 repo = nil unless repo.andand.http_fetch_url
+                 %>
+              <% [:script, :repository, :script_version, :supplied_script_version, :nondeterministic].each do |k| %>
+                <% val = current_obj.send(k) if current_obj.respond_to?(k) %>
+                <% if val %>
+                <tr>
+                  <td style="padding-right: 1em">
+                    <%= k.to_s %>:
+                  </td>
+                  <td>
+                    <% if repo and k == :repository %>
+                      <%= link_to val, show_repository_tree_path(id: repo.uuid, commit: current_obj.script_version, path: '/') %>
+                    <% elsif repo and k == :script %>
+                      <%= link_to val, show_repository_blob_path(id: repo.uuid, commit: current_obj.script_version, path: 'crunch_scripts/'+current_obj.script) %>
+                    <% elsif repo and k == :script_version %>
+                      <%= link_to val, show_repository_commit_path(id: repo.uuid, commit: current_obj.script_version) %>
+                    <% else %>
+                      <%= val %>
+                    <% end %>
+                  </td>
+                </tr>
+                <% end %>
+              <% end %>
+              <% if current_obj.runtime_constraints.andand[:docker_image] and current_obj.docker_image %>
+                <tr>
+                  <td style="padding-right: 1em">
+                    docker_image:
+                  </td>
+                  <td>
+                    <%= current_obj.runtime_constraints[:docker_image] %>
+                  </td>
+                </tr>
+                <tr>
+                  <td style="padding-right: 1em">
+                    docker_image_locator:
+                  </td>
+                  <td>
+                    <%= link_to_arvados_object_if_readable(current_obj.docker_image,
+                      current_obj.docker_image, friendly_name: true) %>
+                  </td>
+                </tr>
+              <% end %>
+            </table>
+          </div>
+        </div>
+
+        <% if current_obj.parameters and !current_obj.parameters.empty? %>
+        <div class="row">
+          <div class="col-md-6">
+            <p>script_parameters:</p>
+            <pre><%= JSON.pretty_generate(current_obj.parameters) rescue nil %></pre>
+          </div>
+        </div>
+        <% end %>
+      </div>
diff --git a/apps/workbench/app/views/work_unit/_progress.html.erb b/apps/workbench/app/views/work_unit/_progress.html.erb
new file mode 100644 (file)
index 0000000..e06bad6
--- /dev/null
@@ -0,0 +1,12 @@
+<% if wu.is_running? %>
+  <% if @object.uuid == wu.uuid and wu.progress == 0.0 %>
+    <span class="label label-<%= wu.state_bootstrap_class %>"> Active </span>
+  <% else%>
+    <div class="progress" style="margin-bottom: 0px">
+      <span class="progress-bar progress-bar-<%= wu.state_bootstrap_class %>" style="width: <%= wu.progress*100 %>%;">
+      </span>
+    </div>
+  <% end %>
+<% else %>
+  <span class="label label-<%= wu.state_bootstrap_class %>"><%= wu.state_label %></span>
+<% end %>
diff --git a/apps/workbench/app/views/work_unit/_show_child.html.erb b/apps/workbench/app/views/work_unit/_show_child.html.erb
new file mode 100644 (file)
index 0000000..b55d888
--- /dev/null
@@ -0,0 +1,72 @@
+<div class="panel panel-default">
+  <div class="panel-heading">
+    <div class="container-fluid">
+      <div class="row-fluid">
+        <div class="col-md-2" style="word-break:break-all;">
+          <h4 class="panel-title">
+            <a data-toggle="collapse" href="#collapse<%= i %>">
+              <%= current_obj.label %> <span class="caret"></span>
+            </a>
+          </h4>
+        </div>
+
+        <div class="col-md-2 pipeline-instance-spacing">
+          <%= render partial: 'work_unit/progress', locals: {wu: current_obj} %>
+        </div>
+
+        <% if not current_obj %>
+          <div class="col-md-8"></div>
+        <% else %>
+          <div class="col-md-1">
+            <%= current_obj.link_to_log %>
+          </div>
+
+          <% walltime = current_obj.walltime %>
+          <% cputime = current_obj.cputime %>
+          <div class="col-md-3">
+          <% if walltime and cputime %>
+            <%= render_runtime(walltime, false) %>
+            <% if cputime > 0 %> / <%= render_runtime(cputime, false) %> (<%= (cputime/walltime).round(1) %>&Cross;)<% end %>
+          <% end %>
+          </div>
+
+          <% queuetime = current_obj.queuedtime %>
+          <% if queuetime %>
+            <div class="col-md-3">
+              Queued for <%= render_runtime(queuetime, false) %>.
+            </div>
+          <% elsif current_obj.is_running? %>
+            <div class="col-md-3">
+              <span class="task-summary-status">
+                <%= current_obj.child_summary_str %>
+              </span>
+            </div>
+          <% elsif current_obj.is_finished? %>
+            <div class="col-md-3 text-overflow-ellipsis">
+              <% if current_obj.output %>
+                <%= link_to_arvados_object_if_readable(current_obj.output, 'Output data not available', link_text: "Output of #{current_obj.label}") %>
+              <% else %>
+                No output.
+              <% end %>
+            </div>
+          <% end %>
+
+          <div class="col-md-1 pipeline-instance-spacing">
+          <% if current_obj.can_cancel? and @object.editable? %>
+              <%= form_tag "#{current_obj.uri}/cancel", remote: true, style: "display:inline; padding-left: 1em" do |f| %>
+                <%= hidden_field_tag :return_to, url_for(@object) %>
+                <%= button_tag "Cancel", {class: 'btn btn-xs btn-danger', id: "cancel-child-button"} %>
+              <% end %>
+          <% end %>
+          </div>
+        <% end %>
+      </div>
+    </div>
+  </div>
+
+  <div id="collapse<%= i %>" class="panel-collapse collapse <%= if expanded then 'in' end %>">
+    <div class="panel-body">
+      <%= render partial: 'work_unit/show_component', locals: {wu: current_obj} %>
+    </div>
+  </div>
+</div>
diff --git a/apps/workbench/app/views/work_unit/_show_component.html.erb b/apps/workbench/app/views/work_unit/_show_component.html.erb
new file mode 100644 (file)
index 0000000..58b8aa8
--- /dev/null
@@ -0,0 +1,75 @@
+<%# Work unit status %>
+
+<div class="container-fluid>
+  <div class="row-fluid">
+    <%# Need additional handling for main object display  %>
+    <% if @object.uuid == wu.uuid %>
+    <div class="container-fluid">
+      <div class="pull-right">
+        <div class="container-fluid">
+          <div class="row-fulid pipeline-instance-spacing">
+            <div class="col-md-8">
+            <% if wu.is_running? and wu.child_summary_str %>
+                <%= wu.child_summary_str %>
+            <% end %>
+            </div>
+            <div class="col-md-3">
+              <%= render partial: 'work_unit/progress', locals: {wu: wu} %>
+            </div>
+            <div class="col-md-1">
+              <% if wu.can_cancel? and @object.editable? %>
+                  <%= form_tag "#{wu.uri}/cancel", remote: true, style: "display:inline; padding-left: 1em" do |f| %>
+                    <%= hidden_field_tag :return_to, url_for(@object) %>
+                    <%= button_tag "Cancel", {class: 'btn btn-xs btn-danger', id: "cancel-obj-button"} %>
+                  <% end %>
+              <% end %>
+            </div>
+          </div>
+        </div>
+      </div>
+    </div>
+    <% end %>
+
+    <div class="col-md-10" >
+      <% if wu.is_paused? %>
+        <p>
+          This <%= wu.title %> is paused. Children that are already running
+          will continue to run, but no new processes will be submitted.
+        </p>
+      <% end %>
+
+      <%= raw(wu.show_runtime) %>
+    </div>
+  </div>
+
+<p>
+  <%= render(partial: 'work_unit/component_detail', locals: {current_obj: wu}) %>
+</p>
+
+<%# Work unit children %>
+
+<%
+  uuids = wu.children.collect {|c| c.uuid}.compact
+  if uuids.any?
+    resource_class = resource_class_for_uuid(uuids.first, friendly_name: true)
+    preload_objects_for_dataclass resource_class, uuids
+  end
+
+  collections = wu.children.collect {|j| j.output}.compact
+  collections.concat wu.children.collect {|j| j.docker_image}.uniq.compact
+  collections_pdhs = collections.select {|x| !(m = CollectionsHelper.match(x)).nil?}.uniq.compact
+  collections_uuids = collections - collections_pdhs
+  preload_collections_for_objects collections_uuids if collections_uuids.any?
+  preload_for_pdhs collections_pdhs if collections_pdhs.any?
+%>
+
+<% if wu.has_unreadable_children %>
+  <%= render(partial: "pipeline_instances/show_components_json",
+             locals: {error_name: "Unreadable components", backtrace: nil, wu: wu}) %>
+<% else %>
+  <% @descendent_count = 0 if !@descendent_count %>
+  <% wu.children.each do |c| %>
+    <% @descendent_count += 1 %>
+    <%= render(partial: 'work_unit/show_child', locals: {current_obj: c, i: @descendent_count, expanded: false}) %>
+  <% end %>
+<% end %>
index ac36f197f4306e5d0ac82f77a47cf8d7c0b1d977..f55c3ac8720fab7e481a10b3f90946f7616256c3 100644 (file)
@@ -85,8 +85,12 @@ class PipelineInstancesControllerTest < ActionController::TestCase
     assert_response :success
     assert_not_nil assigns(:object)
     assert_not_nil assigns(:object).components[:foo][:job]
-    assert assigns(:object).components[:foo][:job][:started_at].is_a? Time
-    assert assigns(:object).components[:foo][:job][:finished_at].is_a? Time
+    start_at = assigns(:object).components[:foo][:job][:started_at]
+    start_at = Time.parse(start_at) if (start_at.andand.class == String)
+    assert start_at.is_a? Time
+    finished_at = assigns(:object).components[:foo][:job][:finished_at]
+    finished_at = Time.parse(finished_at) if (finished_at.andand.class == String)
+    assert finished_at.is_a? Time
   end
 
   # The next two tests ensure that a pipeline instance can be copied
index 0c407b3827cb51d4ba25765d1f84d55a24f0b77d..e39d6f4dbf66576df928df93b7cf43030240d3f2 100644 (file)
@@ -101,7 +101,7 @@ class JobsTest < ActionDispatch::IntegrationTest
       if expect_options
         assert_text 'supplied_script_version: master'
       else
-        assert_text 'supplied_script_version: (none)'
+        assert_no_text 'supplied_script_version'
       end
 
       assert_triggers_dom_event 'shown.bs.modal' do
@@ -126,4 +126,67 @@ class JobsTest < ActionDispatch::IntegrationTest
       end
     end
   end
+
+  [
+    ['active', true],
+    ['job_reader2', false],
+  ].each do |user, readable|
+    test "view job with components as #{user} user" do
+      job = api_fixture('jobs')['running_job_with_components']
+      component1 = api_fixture('jobs')['completed_job_in_publicly_accessible_project']
+      component2 = api_fixture('pipeline_instances')['running_pipeline_with_complete_job']
+      component2_child1 = api_fixture('jobs')['previous_job_run']
+      component2_child2 = api_fixture('jobs')['running']
+
+      visit page_with_token(user, "/jobs/#{job['uuid']}")
+      assert page.has_text? job['script_version']
+      assert page.has_no_text? 'script_parameters'
+
+      # The job_reader2 is allowed to read job, component2, and component2_child1,
+      # and component2_child2 only as a component of the pipeline component2
+      if readable
+        assert page.has_link? 'component1'
+        assert page.has_link? 'component2'
+      else
+        assert page.has_no_link? 'component1'
+        assert page.has_link? 'component2'
+      end
+
+      if readable
+        click_link('component1')
+        within('#collapse1') do
+          assert(has_text? component1['uuid'])
+          assert(has_text? component1['script_version'])
+          assert(has_text? 'script_parameters')
+        end
+        click_link('component1')
+      end
+
+      click_link('component2')
+      within('.panel-collapse') do
+        assert(has_text? component2['uuid'])
+        assert(has_text? component2['script_version'])
+        assert(has_no_text? 'script_parameters')
+        assert(has_link? 'previous')
+        assert(has_link? 'running')
+
+        click_link('previous')
+        within('.panel-collapse') do
+          assert(has_text? component2_child1['uuid'])
+          assert(has_text? component2_child1['script_version'])
+        end
+        click_link('previous')
+
+        click_link('running')
+        within('.panel-collapse') do
+          assert(has_text? component2_child2['uuid'])
+          if readable
+            assert(has_text? component2_child2['script_version'])
+          else
+            assert(has_no_text? component2_child2['script_version'])
+          end
+        end
+      end
+    end
+  end
 end
index 41592af993261b6affd55fa6cba0f05780d475ec..78ef2d21f1a15a106c000710c440984b5a210b16 100644 (file)
@@ -233,12 +233,12 @@ end
 
 class ActionController::TestCase
   setup do
-    @counter = 0
+    @test_counter = 0
   end
 
   def check_counter action
-    @counter += 1
-    if @counter == 2
+    @test_counter += 1
+    if @test_counter == 2
       assert_equal 1, 2, "Multiple actions in controller test"
     end
   end
diff --git a/apps/workbench/test/unit/work_unit_test.rb b/apps/workbench/test/unit/work_unit_test.rb
new file mode 100644 (file)
index 0000000..e8af04c
--- /dev/null
@@ -0,0 +1,92 @@
+require 'test_helper'
+
+class WorkUnitTest < ActiveSupport::TestCase
+  setup do
+    Rails.configuration.anonymous_user_token = api_fixture('api_client_authorizations')['anonymous']['api_token']
+  end
+
+  [
+    [Job, 'running_job_with_components', "jwu", 2, "Running", nil, 0.5],
+    [PipelineInstance, 'pipeline_in_running_state', nil, 1, "Running", nil, 0.0],
+    [PipelineInstance, 'has_component_with_completed_jobs', nil, 3, "Complete", true, 1.0],
+    [PipelineInstance, 'pipeline_with_tagged_collection_input', "pwu", 1, "Ready", nil, 0.0],
+  ].each do |type, fixture, label, num_children, state, success, progress|
+    test "children of #{fixture}" do
+      use_token 'active'
+      obj = find_fixture(type, fixture)
+      wu = obj.work_unit(label)
+
+      if label != nil
+        assert_equal(label, wu.label)
+      else
+        assert_equal(obj.name, wu.label)
+      end
+      assert_equal(obj['uuid'], wu.uuid)
+      assert_equal(state, wu.state_label)
+      assert_equal(success, wu.success?)
+      assert_equal(progress, wu.progress)
+
+      assert_equal(num_children, wu.children.size)
+      wu.children.each do |child|
+        assert_equal(true, child.respond_to?(:script))
+      end
+    end
+  end
+
+  [
+    [Job, 'running_job_with_components', 1, 1, nil],
+    [Job, 'queued', nil, nil, 1],
+    [PipelineInstance, 'pipeline_in_running_state', 1, 1, nil],
+    [PipelineInstance, 'has_component_with_completed_jobs', 60, 60, nil],
+  ].each do |type, fixture, walltime, cputime, queuedtime|
+    test "times for #{fixture}" do
+      use_token 'active'
+      obj = find_fixture(type, fixture)
+      wu = obj.work_unit
+
+      if walltime
+        assert_equal true, (wu.walltime >= walltime)
+      else
+        assert_equal walltime, wu.walltime
+      end
+
+      if cputime
+        assert_equal true, (wu.cputime >= cputime)
+      else
+        assert_equal cputime, wu.cputime
+      end
+
+      if queuedtime
+        assert_equal true, (wu.queuedtime >= queuedtime)
+      else
+        assert_equal queuedtime, wu.queuedtime
+      end
+    end
+  end
+
+  [
+    [Job, 'active', 'running_job_with_components', true],
+    [Job, 'active', 'queued', false],
+    [Job, nil, 'completed_job_in_publicly_accessible_project', true],
+    [Job, 'active', 'completed_job_in_publicly_accessible_project', true],
+    [PipelineInstance, 'active', 'pipeline_in_running_state', true],  # no log, but while running the log link points to pi Log tab
+    [PipelineInstance, nil, 'pipeline_in_publicly_accessible_project_but_other_objects_elsewhere', false],
+    [PipelineInstance, 'active', 'pipeline_in_publicly_accessible_project_but_other_objects_elsewhere', false], #no log for completed pi
+    [Job, nil, 'job_in_publicly_accessible_project_but_other_objects_elsewhere', false, "Log unavailable"],
+  ].each do |type, token, fixture, has_log, log_link|
+    test "link_to_log for #{fixture} for #{token}" do
+      use_token token if token
+      obj = find_fixture(type, fixture)
+      wu = obj.work_unit
+
+      link = "#{wu.uri}#Log" if has_log
+      link_to_log = wu.link_to_log
+
+      if has_log
+        assert_includes link_to_log, link
+      else
+        assert_equal log_link, link_to_log
+      end
+    end
+  end
+end
diff --git a/backports/python-apache-libcloud/fpm-info.sh b/backports/python-apache-libcloud/fpm-info.sh
new file mode 100644 (file)
index 0000000..0333bdf
--- /dev/null
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+case $TARGET in
+     centos7)
+         # fpm incorrectly transforms the dependency name in this case.
+         fpm_depends+=(python-backports-ssl_match_hostname)
+         fpm_args+=(--python-disable-dependency backports.ssl-match-hostname)
+     ;;
+esac
index 42f31e365ca58f3add83cbba0bd73855d1190e30..c9665be0967d205d16e28f453df94bb22cb4f223 100644 (file)
@@ -8,4 +8,4 @@ case "$TARGET" in
 esac
 
 # FIXME: Remove this line after #6885 is done.
-fpm_args+=(--iteration 2)
+fpm_args+=(--iteration 3)
diff --git a/backports/python-gflags/fpm-info.sh b/backports/python-gflags/fpm-info.sh
deleted file mode 100644 (file)
index 67a989e..0000000
+++ /dev/null
@@ -1 +0,0 @@
-fpm_args+=(-v 2.0)
index a7d9398701b7bc4c405027c26f5133fe7b23d383..9fc00987e6f9561f37528368264adf15718b3afd 100644 (file)
@@ -10,4 +10,4 @@ case "$TARGET" in
 esac
 
 # FIXME: Remove this line after #6885 is done.
-fpm_args+=(--iteration 2)
+fpm_args+=(--iteration 3)
index 52df46573da5fae9d1a87d14589f598bde198f0e..65a83a06c7496c5da09315807965272b8d0ff460 100644 (file)
@@ -11,4 +11,4 @@ case "$TARGET" in
 esac
 
 # FIXME: Remove this line after #6885 is done.
-fpm_args+=(--iteration 2)
+fpm_args+=(--iteration 3)
index e4fbf59344ec041875f016b8e7bf227420ce4fef..5bfb010f0e56a3391146482fa5b9d5b51658ee02 100644 (file)
@@ -1,5 +1,5 @@
 # FIXME: Remove this line after #6885 is done.
-fpm_args+=(--iteration 2)
+fpm_args+=(--iteration 3)
 
 case "$TARGET" in
     centos6)
index 9216f8264bc7eef940fe6a3a1a3dcf147def239f..2180b871f7700d0a918f6557269177b2d2ff8217 100644 (file)
@@ -1,9 +1,13 @@
-all: centos6/generated debian7/generated debian8/generated ubuntu1204/generated ubuntu1404/generated
+all: centos6/generated centos7/generated debian7/generated debian8/generated ubuntu1204/generated ubuntu1404/generated
 
 centos6/generated: common-generated-all
        test -d centos6/generated || mkdir centos6/generated
        cp -rlt centos6/generated common-generated/*
 
+centos7/generated: common-generated-all
+       test -d centos7/generated || mkdir centos7/generated
+       cp -rlt centos7/generated common-generated/*
+
 debian7/generated: common-generated-all
        test -d debian7/generated || mkdir debian7/generated
        cp -rlt debian7/generated common-generated/*
@@ -20,10 +24,12 @@ ubuntu1404/generated: common-generated-all
        test -d ubuntu1404/generated || mkdir ubuntu1404/generated
        cp -rlt ubuntu1404/generated common-generated/*
 
-common-generated-all: common-generated/golang-amd64.tar.gz
+GOTARBALL=go1.6.2.linux-amd64.tar.gz
+
+common-generated-all: common-generated/$(GOTARBALL)
 
-common-generated/golang-amd64.tar.gz: common-generated
-       wget -cqO common-generated/golang-amd64.tar.gz http://storage.googleapis.com/golang/go1.4.2.linux-amd64.tar.gz
+common-generated/$(GOTARBALL): common-generated
+       wget -cqO common-generated/$(GOTARBALL) http://storage.googleapis.com/golang/$(GOTARBALL)
 
 common-generated:
        mkdir common-generated
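With the Go tarball name factored out into a Make variable, it can be overridden per invocation; note the Dockerfiles still hard-code go1.6.2 in their ADD lines, so a real version bump has to touch both. Hypothetical example:

    # Fetch a different Go release for one target without editing the Makefile
    # (the matching Dockerfile ADD line would need the same name).
    make GOTARBALL=go1.6.3.linux-amd64.tar.gz centos7/generated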
index cfd94c85d3f6943b60458ac6e452c85861919b24..570dde162c466d3dbdf105316af242e0f00f3710 100644 (file)
@@ -2,10 +2,10 @@ FROM centos:6
 MAINTAINER Brett Smith <brett@curoverse.com>
 
 # Install build dependencies provided in base distribution
-RUN yum -q -y install make automake gcc gcc-c++ libyaml-devel patch readline-devel zlib-devel libffi-devel openssl-devel bzip2 libtool bison sqlite-devel rpm-build git perl-ExtUtils-MakeMaker libattr-devel nss-devel libcurl-devel which tar scl-utils centos-release-SCL postgresql-devel
+RUN yum -q -y install make automake gcc gcc-c++ libyaml-devel patch readline-devel zlib-devel libffi-devel openssl-devel bzip2 libtool bison sqlite-devel rpm-build git perl-ExtUtils-MakeMaker libattr-devel nss-devel libcurl-devel which tar unzip scl-utils centos-release-scl postgresql-devel
 
 # Install golang binary
-ADD generated/golang-amd64.tar.gz /usr/local/
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
 RUN ln -s /usr/local/go/bin/go /usr/local/bin/
 
 # Install RVM
@@ -13,13 +13,18 @@ RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.1 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler fpm
+    /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+    /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
 
 # Need to "touch" RPM database to workaround bug in interaction between
 # overlayfs and yum (https://bugzilla.redhat.com/show_bug.cgi?id=1213602)
 RUN touch /var/lib/rpm/* && yum -q -y install python27 python33
 RUN scl enable python33 "easy_install-3.3 pip" && scl enable python27 "easy_install-2.7 pip"
 
+# fpm requires ffi, which now wants xz-libs-5; that isn't packaged for centos6,
+# but the library from xz-libs-4.999 appears to be good enough.
+RUN ln -s /usr/lib64/liblzma.so.0 /usr/lib64/liblzma.so.5
+
 RUN cd /tmp && \
     curl -OL 'http://pkgs.repoforge.org/rpmforge-release/rpmforge-release-0.5.3-1.el6.rf.x86_64.rpm' && \
     rpm -ivh rpmforge-release-0.5.3-1.el6.rf.x86_64.rpm && \
diff --git a/build/package-build-dockerfiles/centos7/Dockerfile b/build/package-build-dockerfiles/centos7/Dockerfile
new file mode 100644 (file)
index 0000000..311aaa2
--- /dev/null
@@ -0,0 +1,25 @@
+FROM centos:7
+MAINTAINER Brett Smith <brett@curoverse.com>
+
+# Install build dependencies provided in base distribution
+RUN yum -q -y install make automake gcc gcc-c++ libyaml-devel patch readline-devel zlib-devel libffi-devel openssl-devel bzip2 libtool bison sqlite-devel rpm-build git perl-ExtUtils-MakeMaker libattr-devel nss-devel libcurl-devel which tar unzip scl-utils centos-release-scl postgresql-devel python-devel python-setuptools fuse-devel xz-libs git
+
+# Install golang binary
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
+RUN ln -s /usr/local/go/bin/go /usr/local/bin/
+
+# Install RVM
+RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
+    curl -L https://get.rvm.io | bash -s stable && \
+    /usr/local/rvm/bin/rvm install 2.1 && \
+    /usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
+    /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+    /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
+
+# Need to "touch" RPM database to workaround bug in interaction between
+# overlayfs and yum (https://bugzilla.redhat.com/show_bug.cgi?id=1213602)
+RUN touch /var/lib/rpm/* && yum -q -y install python33
+RUN scl enable python33 "easy_install-3.3 pip" && easy_install-2.7 pip
+
+ENV WORKSPACE /arvados
+CMD ["scl", "enable", "python33", "/usr/local/rvm/bin/rvm-exec default bash /jenkins/run-build-packages.sh --target centos7"]
index 0d0459032a0303131cd90b1bff0399c609074886..ddad5426046f19de1e1c7821b8eb8031184df7e5 100644 (file)
@@ -2,17 +2,18 @@ FROM debian:wheezy
 MAINTAINER Ward Vandewege <ward@curoverse.com>
 
 # Install dependencies and set up system.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libpq-dev python-pip
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libpq-dev python-pip unzip
 
 # Install RVM
 RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.1 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler fpm
+    /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+    /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
 
 # Install golang binary
-ADD generated/golang-amd64.tar.gz /usr/local/
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
 RUN ln -s /usr/local/go/bin/go /usr/local/bin/
 
 ENV WORKSPACE /arvados
index fcd390fa279591b537196fab46b9ddbc002b8418..80f06a224bb4516ce483b39a39829916985014f7 100644 (file)
@@ -2,17 +2,18 @@ FROM debian:jessie
 MAINTAINER Ward Vandewege <ward@curoverse.com>
 
 # Install dependencies and set up system.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev python-pip
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev python-pip unzip
 
 # Install RVM
 RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.1 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler fpm
+    /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+    /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
 
 # Install golang binary
-ADD generated/golang-amd64.tar.gz /usr/local/
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
 RUN ln -s /usr/local/go/bin/go /usr/local/bin/
 
 ENV WORKSPACE /arvados
index 158053c570ad8d48bfe3ccad15db3b3e883003ae..2f628b0d1f91db8a14fa15bc99e6f7cb9eb30756 100644 (file)
@@ -2,17 +2,18 @@ FROM ubuntu:precise
 MAINTAINER Ward Vandewege <ward@curoverse.com>
 
 # Install dependencies and set up system.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip build-essential
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip build-essential unzip
 
 # Install RVM
 RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.1 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler fpm
+    /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+    /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
 
 # Install golang binary
-ADD generated/golang-amd64.tar.gz /usr/local/
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
 RUN ln -s /usr/local/go/bin/go /usr/local/bin/
 
 ENV WORKSPACE /arvados
index 0b8ee7ade697cf42c2cd68211a3cef53739b995a..b9c003ac796613c631dcb070a6fc84a6257e7228 100644 (file)
@@ -2,17 +2,18 @@ FROM ubuntu:trusty
 MAINTAINER Brett Smith <brett@curoverse.com>
 
 # Install dependencies and set up system.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip unzip
 
 # Install RVM
 RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.1 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler fpm
+    /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+    /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
 
 # Install golang binary
-ADD generated/golang-amd64.tar.gz /usr/local/
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
 RUN ln -s /usr/local/go/bin/go /usr/local/bin/
 
 ENV WORKSPACE /arvados
index 69927a17373e6fd5c5ab9b1198e517dfe7793b2b..874820a941793d28f60cb026c93840b39607863c 100644 (file)
@@ -1,7 +1,7 @@
 FROM centos:6
 MAINTAINER Peter Amstutz <peter.amstutz@curoverse.com>
 
-RUN yum -q install --assumeyes scl-utils centos-release-SCL \
+RUN yum -q install --assumeyes scl-utils centos-release-scl \
     which tar
 
 # Install RVM
@@ -10,11 +10,12 @@ RUN touch /var/lib/rpm/* && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.1 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundle fpm
+    /usr/local/rvm/bin/rvm-exec default gem install bundle && \
+    /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
 
 RUN cd /tmp && \
     curl -OL 'http://pkgs.repoforge.org/rpmforge-release/rpmforge-release-0.5.3-1.el6.rf.x86_64.rpm' && \
     rpm -ivh rpmforge-release-0.5.3-1.el6.rf.x86_64.rpm && \
     sed -i 's/enabled = 0/enabled = 1/' /etc/yum.repos.d/rpmforge.repo
 
-COPY localrepo.repo /etc/yum.repos.d/localrepo.repo
\ No newline at end of file
+COPY localrepo.repo /etc/yum.repos.d/localrepo.repo
diff --git a/build/package-test-dockerfiles/centos7/Dockerfile b/build/package-test-dockerfiles/centos7/Dockerfile
new file mode 100644 (file)
index 0000000..6bc40bf
--- /dev/null
@@ -0,0 +1,15 @@
+FROM centos:7
+MAINTAINER Brett Smith <brett@curoverse.com>
+
+RUN yum -q -y install scl-utils centos-release-scl which tar
+
+# Install RVM
+RUN touch /var/lib/rpm/* && \
+    gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
+    curl -L https://get.rvm.io | bash -s stable && \
+    /usr/local/rvm/bin/rvm install 2.1 && \
+    /usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
+    /usr/local/rvm/bin/rvm-exec default gem install bundle && \
+    /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
+
+COPY localrepo.repo /etc/yum.repos.d/localrepo.repo
diff --git a/build/package-test-dockerfiles/centos7/localrepo.repo b/build/package-test-dockerfiles/centos7/localrepo.repo
new file mode 100644 (file)
index 0000000..ebb8765
--- /dev/null
@@ -0,0 +1,5 @@
+[localrepo]
+name=Arvados Test
+baseurl=file:///arvados/packages/centos7
+gpgcheck=0
+enabled=1
diff --git a/build/package-testing/rpm-common-test-packages.sh b/build/package-testing/rpm-common-test-packages.sh
new file mode 100755 (executable)
index 0000000..4d0c32b
--- /dev/null
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+set -eu
+
+target=$(basename "$0" | grep -Eo '\bcentos[[:digit:]]+\b')
+
+yum -q clean all
+touch /var/lib/rpm/*
+
+export ARV_PACKAGES_DIR="/arvados/packages/$target"
+
+rpm -qa | sort > "$ARV_PACKAGES_DIR/$1.before"
+
+yum install --assumeyes $1
+
+rpm -qa | sort > "$ARV_PACKAGES_DIR/$1.after"
+
+diff "$ARV_PACKAGES_DIR/$1".{before,after} >"$ARV_PACKAGES_DIR/$1.diff" || true
+
+# Enable any Software Collections that the package depended on.
+if [[ -d /opt/rh ]]; then
+    # We have to stage the list to a file, because `ls | while read` would
+    # make a subshell, causing the `source` lines to have no effect.
+    scl_list=$(mktemp)
+    ls /opt/rh >"$scl_list"
+
+    # SCL scripts aren't designed to run with -eu.
+    set +eu
+    while read scl; do
+        source scl_source enable "$scl"
+    done <"$scl_list"
+    set -eu
+    rm "$scl_list"
+fi
+
+mkdir -p /tmp/opts
+cd /tmp/opts
+
+rpm2cpio $(ls -t "$ARV_PACKAGES_DIR/$1"-*.rpm | head -n1) | cpio -idm 2>/dev/null
+
+find -name '*.so' | while read so; do
+    echo -e "\n== Packages dependencies for $so =="
+    ldd "$so" \
+        | awk '($3 ~ /^\//){print $3}' | sort -u | xargs rpm -qf | sort -u
+done
+
+exec /jenkins/package-testing/common-test-packages.sh "$1"
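The comment above about staging the SCL list to a file deserves a concrete illustration: in bash, each side of a pipeline runs in a subshell, so variables (or environment changes made by `source`) set inside `ls | while read ...` vanish when the pipeline ends. Minimal sketch:

    x=0
    echo hi | while read line; do x=1; done
    echo "$x"                                # prints 0: the loop ran in a subshell
    while read line; do x=1; done < <(echo hi)
    echo "$x"                                # prints 1: the loop ran in this shell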
index e97544828c95ee792285c31dc51bc00fda2027df..66d5969428e65093ed0a3c002e8d77c353f00910 100755 (executable)
@@ -7,7 +7,7 @@ case "$TARGET" in
         apt-get install -y nginx
         dpkg-reconfigure arvados-api-server
         ;;
-    centos6)
+    centos*)
         yum install --assumeyes httpd
         yum reinstall --assumeyes arvados-api-server
         ;;
index c1a377e419998057d7187f8574126c66969abaa9..ab0b225ed53b9d8d68b1f3272575b63610fb42de 100755 (executable)
@@ -18,7 +18,7 @@ case "$TARGET" in
     debian*|ubuntu*)
         FORMAT=deb
         ;;
-    centos6)
+    centos*)
         FORMAT=rpm
         ;;
     *)
@@ -75,12 +75,9 @@ fi
 if [[ ! -e "/etc/arvados/sso/database.yml" ]]; then
   # We need to set up our database configuration now.
   if [[ "$FORMAT" == "rpm" ]]; then
-    # postgres packaging on CentOS6 is kind of primitive, needs an initdb
     service postgresql initdb
-    if [ "$TARGET" = "centos6" ]; then
-      sed -i -e "s/127.0.0.1\/32          ident/127.0.0.1\/32          md5/" /var/lib/pgsql/data/pg_hba.conf
-      sed -i -e "s/::1\/128               ident/::1\/128               md5/" /var/lib/pgsql/data/pg_hba.conf
-    fi
+    sed -i -e "s/127.0.0.1\/32          ident/127.0.0.1\/32          md5/" /var/lib/pgsql/data/pg_hba.conf
+    sed -i -e "s/::1\/128               ident/::1\/128               md5/" /var/lib/pgsql/data/pg_hba.conf
   fi
   service postgresql start
 
index 1be4dea3edf3129545ee087026db490aa6bdddbe..5deb1a0aa1a9082c2a5834a2116caf3037328ad1 100755 (executable)
@@ -7,7 +7,7 @@ case "$TARGET" in
         apt-get install -y nginx
         dpkg-reconfigure arvados-workbench
         ;;
-    centos6)
+    centos*)
         yum install --assumeyes httpd
         yum reinstall --assumeyes arvados-workbench
         ;;
deleted file mode 100755 (executable)
index 4e05364f01519973894f2a8b6329723b481da8ad..0000000000000000000000000000000000000000
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-yum -q clean all
-touch /var/lib/rpm/*
-
-export ARV_PACKAGES_DIR=/arvados/packages/centos6
-
-rpm -qa | sort > "$ARV_PACKAGES_DIR/$1.before"
-
-yum install --assumeyes $1
-
-rpm -qa | sort > "$ARV_PACKAGES_DIR/$1.after"
-
-set +e
-diff "$ARV_PACKAGES_DIR/$1.before" "$ARV_PACKAGES_DIR/$1.after" >"$ARV_PACKAGES_DIR/$1.diff"
-set -e
-
-SCL=""
-if scl enable python27 true 2>/dev/null ; then
-    SCL="scl enable python27"
-fi
-
-mkdir -p /tmp/opts
-cd /tmp/opts
-
-rpm2cpio $(ls -t "$ARV_PACKAGES_DIR/$1"-*.rpm | head -n1) | cpio -idm 2>/dev/null
-
-shared=$(find -name '*.so')
-if test -n "$shared" ; then
-    for so in $shared ; do
-        echo
-        echo "== Packages dependencies for $so =="
-        $SCL ldd "$so" \
-            | awk '($3 ~ /^\//){print $3}' | sort -u | xargs rpm -qf | sort -u
-    done
-fi
-
-if test -n "$SCL" ; then
-    exec $SCL "/jenkins/package-testing/common-test-packages.sh '$1'"
-else
-    exec /jenkins/package-testing/common-test-packages.sh "$1"
-fi
new file mode 120000 (symlink)
index 0000000000000000000000000000000000000000..64ef6040c1f64adbc5ab69de241414965faf7e57
--- /dev/null
@@ -0,0 +1 @@
+rpm-common-test-packages.sh
\ No newline at end of file
diff --git a/build/package-testing/test-packages-centos7.sh b/build/package-testing/test-packages-centos7.sh
new file mode 120000 (symlink)
index 0000000..64ef604
--- /dev/null
@@ -0,0 +1 @@
+rpm-common-test-packages.sh
\ No newline at end of file
index fcf849bc4df9fe7e991046b150305830a711cce9..15f788163ed604ea4592daae668c3dbe9e888923 100755 (executable)
@@ -54,21 +54,23 @@ do
     esac
 done
 
-
 EXITCODE=0
 
-COLUMNS=80
-
-title () {
-    printf "\n%*s\n\n" $(((${#title}+$COLUMNS)/2)) "********** $1 **********"
+exit_cleanly() {
+    trap - INT
+    report_outcomes
+    exit $EXITCODE
 }
 
+COLUMNS=80
+. $WORKSPACE/build/run-library.sh
+
 docker_push () {
     if [[ ! -z "$tags" ]]
     then
         for tag in $( echo $tags|tr "," " " )
         do
-             $DOCKER tag -f $1 $1:$tag
+             $DOCKER tag $1 $1:$tag
         done
     fi
 
@@ -82,17 +84,9 @@ docker_push () {
     done
 
     if [[ "$ECODE" != "0" ]]; then
-        title "!!!!!! docker push $* failed !!!!!!"
         EXITCODE=$(($EXITCODE + $ECODE))
     fi
-}
-
-timer_reset() {
-    t0=$SECONDS
-}
-
-timer() {
-    echo -n "$(($SECONDS - $t0))s"
+    checkexit $ECODE "docker push $*"
 }
 
 # Sanity check
@@ -131,15 +125,19 @@ rm -f config.yml
 # Get test config.yml file
 cp $HOME/docker/config.yml .
 
-./build.sh jobs-image
+if [[ ! -z "$tags" ]]; then
+  COMMIT=${tags/,*/} ./build.sh jobs-image
+else
+  ./build.sh jobs-image
+fi
 
 ECODE=$?
 
 if [[ "$ECODE" != "0" ]]; then
-    title "!!!!!! docker BUILD FAILED !!!!!!"
     EXITCODE=$(($EXITCODE + $ECODE))
 fi
 
+checkexit $ECODE "docker build"
 title "docker build complete (`timer`)"
 
 title "uploading images"
@@ -155,10 +153,10 @@ else
         docker login -u arvados
 
         docker_push arvados/jobs
-        title "upload arvados images complete (`timer`)"
+        title "upload arvados images finished (`timer`)"
     else
-        title "upload arvados images SKIPPED because no --upload option set"
+        title "upload arvados images SKIPPED because no --upload option set (`timer`)"
     fi
 fi
 
-exit $EXITCODE
+exit_cleanly
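The COMMIT=${tags/,*/} expansion above picks the first tag out of the comma-separated --tags value; a quick illustration of that parameter expansion:

    tags="1.2.3,latest"
    echo "${tags/,*/}"     # 1.2.3 -- deletes the first comma and everything after it
    tags="1.2.3"
    echo "${tags/,*/}"     # 1.2.3 -- unchanged when there is no comma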
index c5e0a89e3827dd266974045d0ebe16718db7f344..6fdffd09da49a36ad608dd533174f133fb61424e 100755 (executable)
@@ -86,11 +86,11 @@ done
 set -e
 
 if [[ -n "$test_packages" ]]; then
-    if [[ -n "$(find $WORKSPACE/packages/$TARGET -name *.rpm)" ]] ; then
+    if [[ -n "$(find $WORKSPACE/packages/$TARGET -name '*.rpm')" ]] ; then
         createrepo $WORKSPACE/packages/$TARGET
     fi
 
-    if [[ -n "$(find $WORKSPACE/packages/$TARGET -name *.deb)" ]] ; then
+    if [[ -n "$(find $WORKSPACE/packages/$TARGET -name '*.deb')" ]] ; then
         (cd $WORKSPACE/packages/$TARGET
          dpkg-scanpackages .  2> >(grep -v 'warning' 1>&2) | gzip -c > Packages.gz
         )
@@ -128,6 +128,8 @@ if test -z "$packages" ; then
         arvados-src
         arvados-workbench
         crunchstat
+        keep-balance
+        keep-block-check
         keepproxy
         keep-rsync
         keepstore
@@ -137,11 +139,11 @@ if test -z "$packages" ; then
     case "$TARGET" in
         centos6)
             packages="$packages python27-python-arvados-fuse
-                  python27-python-arvados-python-client"
+                  python27-python-arvados-python-client python27-python-arvados-cwl-runner"
             ;;
         *)
             packages="$packages python-arvados-fuse
-                  python-arvados-python-client"
+                  python-arvados-python-client python-arvados-cwl-runner"
             ;;
     esac
 fi
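Quoting the -name patterns matters because an unquoted glob is expanded by the shell before find ever sees it; if the working directory happens to contain matching files, find receives several path arguments where it expects a single pattern:

    touch a.rpm b.rpm
    find . -name *.rpm      # expands to: find . -name a.rpm b.rpm  -> "paths must precede expression"
    find . -name '*.rpm'    # the pattern reaches find intact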
diff --git a/build/run-build-packages-python-and-ruby.sh b/build/run-build-packages-python-and-ruby.sh
new file mode 100755 (executable)
index 0000000..13aa687
--- /dev/null
@@ -0,0 +1,209 @@
+#!/bin/bash
+
+COLUMNS=80
+
+. `dirname "$(readlink -f "$0")"`/run-library.sh
+#. `dirname "$(readlink -f "$0")"`/libcloud-pin
+
+read -rd "\000" helpmessage <<EOF
+$(basename $0): Build Arvados Python packages and Ruby gems
+
+Syntax:
+        WORKSPACE=/path/to/arvados $(basename $0) [options]
+
+Options:
+
+--debug
+    Output debug information (default: false)
+--upload
+    If the build and test steps are successful, upload the python
+    packages to pypi and the gems to rubygems (default: false)
+
+WORKSPACE=path         Path to the Arvados source tree to build packages from
+
+EOF
+
+exit_cleanly() {
+    trap - INT
+    report_outcomes
+    exit ${#failures[@]}
+}
+
+gem_wrapper() {
+  local gem_name="$1"; shift
+  local gem_directory="$1"; shift
+
+  title "Start $gem_name gem build"
+  timer_reset
+
+  cd "$gem_directory"
+  handle_ruby_gem $gem_name
+
+  checkexit $? "$gem_name gem build"
+  title "End of $gem_name gem build (`timer`)"
+}
+
+python_wrapper() {
+  local package_name="$1"; shift
+  local package_directory="$1"; shift
+
+  title "Start $package_name python package build"
+  timer_reset
+
+  cd "$package_directory"
+  if [[ $DEBUG > 0 ]]; then
+    echo `pwd`
+  fi
+  handle_python_package
+
+  checkexit $? "$package_name python package build"
+  title "End of $package_name python package build (`timer`)"
+}
+
+TARGET=
+UPLOAD=0
+DEBUG=${ARVADOS_DEBUG:-0}
+
+PARSEDOPTS=$(getopt --name "$0" --longoptions \
+    help,debug,upload,target: \
+    -- "" "$@")
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+
+eval set -- "$PARSEDOPTS"
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --help)
+            echo >&2 "$helpmessage"
+            echo >&2
+            exit 1
+            ;;
+        --target)
+            TARGET="$2"; shift
+            ;;
+        --upload)
+            UPLOAD=1
+            ;;
+        --debug)
+            DEBUG=1
+            ;;
+        --)
+            if [ $# -gt 1 ]; then
+                echo >&2 "$0: unrecognized argument '$2'. Try: $0 --help"
+                exit 1
+            fi
+            ;;
+    esac
+    shift
+done
+
+if ! [[ -n "$WORKSPACE" ]]; then
+  echo >&2 "$helpmessage"
+  echo >&2
+  echo >&2 "Error: WORKSPACE environment variable not set"
+  echo >&2
+  exit 1
+fi
+
+STDOUT_IF_DEBUG=/dev/null
+STDERR_IF_DEBUG=/dev/null
+DASHQ_UNLESS_DEBUG=-q
+if [[ "$DEBUG" != 0 ]]; then
+    STDOUT_IF_DEBUG=/dev/stdout
+    STDERR_IF_DEBUG=/dev/stderr
+    DASHQ_UNLESS_DEBUG=
+fi
+
+EASY_INSTALL2=$(find_easy_install -$PYTHON2_VERSION "")
+EASY_INSTALL3=$(find_easy_install -$PYTHON3_VERSION 3)
+
+RUN_BUILD_PACKAGES_PATH="`dirname \"$0\"`"
+RUN_BUILD_PACKAGES_PATH="`( cd \"$RUN_BUILD_PACKAGES_PATH\" && pwd )`"  # absolutized and normalized
+if [ -z "$RUN_BUILD_PACKAGES_PATH" ] ; then
+  # error; for some reason, the path is not accessible
+  # to the script (e.g. permissions re-evaled after suid)
+  exit 1  # fail
+fi
+
+debug_echo "$0 is running from $RUN_BUILD_PACKAGES_PATH"
+debug_echo "Workspace is $WORKSPACE"
+
+if [[ -f /etc/profile.d/rvm.sh ]]; then
+    source /etc/profile.d/rvm.sh
+    GEM="rvm-exec default gem"
+else
+    GEM=gem
+fi
+
+# Make all files world-readable -- jenkins runs with umask 027, and has checked
+# out our git tree here
+chmod o+r "$WORKSPACE" -R
+
+# More cleanup - make sure all executables that we'll package are 755
+cd "$WORKSPACE"
+find -type d -name 'bin' |xargs -I {} find {} -type f |xargs -I {} chmod 755 {}
+
+# Now fix our umask to something better suited to building and publishing
+# gems and packages
+umask 0022
+
+debug_echo "umask is" `umask`
+
+gem_wrapper arvados "$WORKSPACE/sdk/ruby"
+gem_wrapper arvados-cli "$WORKSPACE/sdk/cli"
+gem_wrapper arvados-login-sync "$WORKSPACE/services/login-sync"
+
+GEM_BUILD_FAILURES=0
+if [ ${#failures[@]} -ne 0 ]; then
+  GEM_BUILD_FAILURES=${#failures[@]}
+fi
+
+python_wrapper arvados-pam "$WORKSPACE/sdk/pam"
+python_wrapper arvados-python-client "$WORKSPACE/sdk/python"
+python_wrapper arvados-cwl-runner "$WORKSPACE/sdk/cwl"
+python_wrapper arvados_fuse "$WORKSPACE/services/fuse"
+python_wrapper arvados-node-manager "$WORKSPACE/services/nodemanager"
+
+PYTHON_BUILD_FAILURES=0
+if [ $((${#failures[@]} - $GEM_BUILD_FAILURES)) -ne 0 ]; then
+  PYTHON_BUILD_FAILURES=$((${#failures[@]} - $GEM_BUILD_FAILURES))
+fi
+
+if [[ "$UPLOAD" != 0 ]]; then
+
+  if [[ $DEBUG > 0 ]]; then
+    EXTRA_UPLOAD_FLAGS=" --verbose"
+  else
+    EXTRA_UPLOAD_FLAGS=""
+  fi
+
+  if [[ ! -e "$WORKSPACE/packages" ]]; then
+    mkdir -p "$WORKSPACE/packages"
+  fi
+
+  title "Start upload python packages"
+  timer_reset
+
+  if [ "$PYTHON_BUILD_FAILURES" -eq 0 ]; then
+    /usr/local/arvados-dev/jenkins/run_upload_packages.py $EXTRA_UPLOAD_FLAGS --workspace $WORKSPACE python
+  else
+    echo "Skipping python packages upload, there were errors building the packages"
+  fi
+  checkexit $? "upload python packages"
+  title "End of upload python packages (`timer`)"
+
+  title "Start upload ruby gems"
+  timer_reset
+
+  if [ "$GEM_BUILD_FAILURES" -eq 0 ]; then
+    /usr/local/arvados-dev/jenkins/run_upload_packages.py $EXTRA_UPLOAD_FLAGS --workspace $WORKSPACE gems
+  else
+    echo "Skipping ruby gem upload, there were errors building the packages"
+  fi
+  checkexit $? "upload ruby gems"
+  title "End of upload ruby gems (`timer`)"
+
+fi
+
+exit_cleanly
index cc673a6d3353c56a22c416b4139dbb957045da05..b5dcdfce53b9e41518fe18d981af80815c47cde5 100755 (executable)
@@ -80,7 +80,7 @@ case "$TARGET" in
     ubuntu1404)
         FORMAT=deb
         ;;
-    centos6)
+    centos6|centos7)
         FORMAT=rpm
         ;;
     *)
index 697092966bda6575d0ddaaafac635841555de650..e7934a648444ac9f65ce053fd103a6a4918a11d5 100755 (executable)
@@ -82,70 +82,97 @@ declare -a PYTHON_BACKPORTS PYTHON3_BACKPORTS
 PYTHON2_VERSION=2.7
 PYTHON3_VERSION=$(python3 -c 'import sys; print("{v.major}.{v.minor}".format(v=sys.version_info))')
 
+## These defaults are suitable for any Debian-based distribution.
+# You can customize them as needed in distro sections below.
+PYTHON2_PACKAGE=python$PYTHON2_VERSION
+PYTHON2_PKG_PREFIX=python
+PYTHON2_PREFIX=/usr
+PYTHON2_INSTALL_LIB=lib/python$PYTHON2_VERSION/dist-packages
+
+PYTHON3_PACKAGE=python$PYTHON3_VERSION
+PYTHON3_PKG_PREFIX=python3
+PYTHON3_PREFIX=/usr
+PYTHON3_INSTALL_LIB=lib/python$PYTHON3_VERSION/dist-packages
+## End Debian Python defaults.
+
 case "$TARGET" in
     debian7)
         FORMAT=deb
-        PYTHON2_PACKAGE=python$PYTHON2_VERSION
-        PYTHON2_PKG_PREFIX=python
-        PYTHON3_PACKAGE=python$PYTHON3_VERSION
-        PYTHON3_PKG_PREFIX=python3
-        PYTHON_BACKPORTS=(python-gflags pyvcf google-api-python-client==1.4.2 \
+        PYTHON_BACKPORTS=(python-gflags==2.0 google-api-python-client==1.4.2 \
             oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
             rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
             ciso8601 pycrypto backports.ssl_match_hostname llfuse==0.41.1 \
-            'pycurl<7.21.5' contextlib2)
-        PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
+            'pycurl<7.21.5' contextlib2 pyyaml 'rdflib>=4.2.0' \
+            shellescape mistune typing avro ruamel.ordereddict)
+        PYTHON3_BACKPORTS=(docker-py==1.7.2 six requests websocket-client)
         ;;
     debian8)
         FORMAT=deb
-        PYTHON2_PACKAGE=python$PYTHON2_VERSION
-        PYTHON2_PKG_PREFIX=python
-        PYTHON3_PACKAGE=python$PYTHON3_VERSION
-        PYTHON3_PKG_PREFIX=python3
-        PYTHON_BACKPORTS=(python-gflags pyvcf google-api-python-client==1.4.2 \
+        PYTHON_BACKPORTS=(python-gflags==2.0 google-api-python-client==1.4.2 \
             oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
             rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
             ciso8601 pycrypto backports.ssl_match_hostname llfuse==0.41.1 \
-            'pycurl<7.21.5')
-        PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
+            'pycurl<7.21.5' pyyaml 'rdflib>=4.2.0' \
+            shellescape mistune typing avro ruamel.ordereddict)
+        PYTHON3_BACKPORTS=(docker-py==1.7.2 six requests websocket-client)
         ;;
     ubuntu1204)
         FORMAT=deb
-        PYTHON2_PACKAGE=python$PYTHON2_VERSION
-        PYTHON2_PKG_PREFIX=python
-        PYTHON3_PACKAGE=python$PYTHON3_VERSION
-        PYTHON3_PKG_PREFIX=python3
-        PYTHON_BACKPORTS=(python-gflags pyvcf google-api-python-client==1.4.2 \
+        PYTHON_BACKPORTS=(python-gflags==2.0 google-api-python-client==1.4.2 \
             oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
             rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
             ciso8601 pycrypto backports.ssl_match_hostname llfuse==0.41.1 \
-            contextlib2 \
-            'pycurl<7.21.5')
-        PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
+            contextlib2 'pycurl<7.21.5' pyyaml 'rdflib>=4.2.0' \
+            shellescape mistune typing avro isodate ruamel.ordereddict)
+        PYTHON3_BACKPORTS=(docker-py==1.7.2 six requests websocket-client)
         ;;
     ubuntu1404)
         FORMAT=deb
-        PYTHON2_PACKAGE=python$PYTHON2_VERSION
-        PYTHON2_PKG_PREFIX=python
-        PYTHON3_PACKAGE=python$PYTHON3_VERSION
-        PYTHON3_PKG_PREFIX=python3
-        PYTHON_BACKPORTS=(pyasn1==0.1.7 pyvcf pyasn1-modules==0.0.5 llfuse==0.41.1 ciso8601 \
+        PYTHON_BACKPORTS=(pyasn1==0.1.7 pyasn1-modules==0.0.5 llfuse==0.41.1 ciso8601 \
             google-api-python-client==1.4.2 six uritemplate oauth2client==1.5.2 httplib2 \
-            rsa 'pycurl<7.21.5' backports.ssl_match_hostname)
-        PYTHON3_BACKPORTS=(docker-py requests websocket-client)
+            rsa 'pycurl<7.21.5' backports.ssl_match_hostname pyyaml 'rdflib>=4.2.0' \
+            shellescape mistune typing avro ruamel.ordereddict)
+        PYTHON3_BACKPORTS=(docker-py==1.7.2 requests websocket-client)
         ;;
     centos6)
         FORMAT=rpm
         PYTHON2_PACKAGE=$(rpm -qf "$(which python$PYTHON2_VERSION)" --queryformat '%{NAME}\n')
         PYTHON2_PKG_PREFIX=$PYTHON2_PACKAGE
+        PYTHON2_PREFIX=/opt/rh/python27/root/usr
+        PYTHON2_INSTALL_LIB=lib/python$PYTHON2_VERSION/site-packages
         PYTHON3_PACKAGE=$(rpm -qf "$(which python$PYTHON3_VERSION)" --queryformat '%{NAME}\n')
         PYTHON3_PKG_PREFIX=$PYTHON3_PACKAGE
-        PYTHON_BACKPORTS=(python-gflags pyvcf google-api-python-client==1.4.2 \
+        PYTHON3_PREFIX=/opt/rh/python33/root/usr
+        PYTHON3_INSTALL_LIB=lib/python$PYTHON3_VERSION/site-packages
+        PYTHON_BACKPORTS=(python-gflags==2.0 google-api-python-client==1.4.2 \
             oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
             rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
             ciso8601 pycrypto backports.ssl_match_hostname 'pycurl<7.21.5' \
-            python-daemon lockfile llfuse==0.41.1 'pbr<1.0')
-        PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
+            python-daemon lockfile llfuse==0.41.1 'pbr<1.0' pyyaml \
+            'rdflib>=4.2.0' shellescape mistune typing avro requests \
+            isodate pyparsing sparqlwrapper html5lib keepalive \
+            ruamel.ordereddict)
+        PYTHON3_BACKPORTS=(docker-py==1.7.2 six requests websocket-client)
+        export PYCURL_SSL_LIBRARY=nss
+        ;;
+    centos7)
+        FORMAT=rpm
+        PYTHON2_PACKAGE=$(rpm -qf "$(which python$PYTHON2_VERSION)" --queryformat '%{NAME}\n')
+        PYTHON2_PKG_PREFIX=$PYTHON2_PACKAGE
+        PYTHON2_INSTALL_LIB=lib/python$PYTHON2_VERSION/site-packages
+        PYTHON3_PACKAGE=$(rpm -qf "$(which python$PYTHON3_VERSION)" --queryformat '%{NAME}\n')
+        PYTHON3_PKG_PREFIX=$PYTHON3_PACKAGE
+        PYTHON3_PREFIX=/opt/rh/python33/root/usr
+        PYTHON3_INSTALL_LIB=lib/python$PYTHON3_VERSION/site-packages
+        PYTHON_BACKPORTS=(python-gflags==2.0 google-api-python-client==1.4.2 \
+            oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
+            rsa uritemplate httplib2 ws4py pykka pyexecjs jsonschema \
+            ciso8601 pycrypto 'pycurl<7.21.5' \
+            python-daemon llfuse==0.41.1 'pbr<1.0' pyyaml \
+            'rdflib>=4.2.0' shellescape mistune typing avro \
+            isodate pyparsing sparqlwrapper html5lib keepalive \
+            ruamel.ordereddict)
+        PYTHON3_BACKPORTS=(docker-py==1.7.2 six requests websocket-client)
         export PYCURL_SSL_LIBRARY=nss
         ;;
     *)
@@ -200,6 +227,7 @@ fi
 chmod o+r "$WORKSPACE" -R
 
 # More cleanup - make sure all executables that we'll package are 755
+cd "$WORKSPACE"
 find -type d -name 'bin' |xargs -I {} find {} -type f |xargs -I {} chmod 755 {}
 
 # Now fix our umask to something better suited to building and publishing
@@ -351,7 +379,7 @@ elif [[ $TARGET =~ centos6 ]]; then
             rpm2cpio ${LIBFUSE_DIR}/fuse-2.9.2-6.el7.src.rpm | cpio -i
             perl -pi -e 's/Conflicts:\s*filesystem.*//g' fuse.spec
         )
-        # build rpms from source 
+        # build rpms from source
         rpmbuild -bb /root/rpmbuild/SOURCES/fuse.spec
         rm -f fuse-2.9.2-6.el7.src.rpm
         # move built RPMs to LIBFUSE_DIR
@@ -376,6 +404,8 @@ package_go_binary services/keepstore keepstore \
     "Keep storage daemon, accessible to clients on the LAN"
 package_go_binary services/keepproxy keepproxy \
     "Make a Keep cluster accessible to clients that are not on the LAN"
+package_go_binary services/keep-balance keep-balance \
+    "Rebalance and garbage-collect data blocks stored in Arvados Keep"
 package_go_binary services/keep-web keep-web \
     "Static web hosting service for user data stored in Arvados Keep"
 package_go_binary services/datamanager arvados-data-manager \
@@ -386,6 +416,10 @@ package_go_binary services/crunchstat crunchstat \
     "Gather cpu/memory/network statistics of running Crunch jobs"
 package_go_binary tools/keep-rsync keep-rsync \
     "Copy all data from one set of Keep servers to another"
+package_go_binary tools/keep-block-check keep-block-check \
+    "Verify that all data from one set of Keep servers to another was copied"
+package_go_binary sdk/go/crunchrunner crunchrunner \
+    "Crunchrunner executes a command inside a container and uploads the output"
 
 # The Python SDK
 # Please resist the temptation to add --no-python-fix-name to the fpm call here
@@ -401,7 +435,34 @@ fpm_build $WORKSPACE/sdk/python "${PYTHON2_PKG_PREFIX}-arvados-python-client" 'C
 # cwl-runner
 cd $WORKSPACE/packages/$TARGET
 rm -rf "$WORKSPACE/sdk/cwl/build"
-fpm_build $WORKSPACE/sdk/cwl "${PYTHON2_PKG_PREFIX}-arvados-cwl-runner" 'Curoverse, Inc.' 'python' "$(awk '($1 == "Version:"){print $2}' $WORKSPACE/sdk/cwl/arvados_cwl_runner.egg-info/PKG-INFO)" "--url=https://arvados.org" "--description=The Arvados CWL runner"
+fpm_build $WORKSPACE/sdk/cwl "${PYTHON2_PKG_PREFIX}-arvados-cwl-runner" 'Curoverse, Inc.' 'python' "$(awk '($1 == "Version:"){print $2}' $WORKSPACE/sdk/cwl/arvados_cwl_runner.egg-info/PKG-INFO)" "--url=https://arvados.org" "--description=The Arvados CWL runner" --iteration 3
+
+# schema_salad. This is a python dependency of arvados-cwl-runner,
+# but we can't use the usual PYTHONPACKAGES way to build this package due to the
+# intricacies of how version numbers get generated in setup.py: we need version
+# 1.7.20160316203940. If we don't explicitly list that version with the -v
+# argument to fpm, and instead specify it as schema_salad==1.7.20160316203940, we get
+# a package with version 1.7. That's because our gittagger hack is not being
+# picked up by self.distribution.get_version(), which is called from
+# https://github.com/jordansissel/fpm/blob/master/lib/fpm/package/pyfpm/get_metadata.py
+# by means of this command:
+#
+# python2.7 setup.py --command-packages=pyfpm get_metadata --output=metadata.json
+#
+# So we build this thing separately.
+#
+# Ward, 2016-03-17
+fpm_build schema_salad "" "" python 1.11.20160506154702
+
+# And schema_salad now depends on ruamel-yaml, which apparently has a braindead setup.py that requires special arguments to build (otherwise, it aborts with 'error: you have to install with "pip install ."'). Sigh.
+# Ward, 2016-05-26
+fpm_build ruamel.yaml "" "" python "" --python-setup-py-arguments "--single-version-externally-managed"
+
+# And for cwltool we have the same problem as for schema_salad. Ward, 2016-03-17
+fpm_build cwltool "" "" python 1.0.20160519182434
+
+# FPM eats the trailing .0 in the python-rdflib-jsonld package when built with 'rdflib-jsonld>=0.3.0'. Force the version. Ward, 2016-03-25
+fpm_build rdflib-jsonld "" "" python 0.3.0
 
 # The PAM module
 if [[ $TARGET =~ debian|ubuntu ]]; then
@@ -427,6 +488,11 @@ cd $WORKSPACE/packages/$TARGET
 rm -rf "$WORKSPACE/services/dockercleaner/build"
 fpm_build $WORKSPACE/services/dockercleaner arvados-docker-cleaner 'Curoverse, Inc.' 'python3' "$(awk '($1 == "Version:"){print $2}' $WORKSPACE/services/dockercleaner/arvados_docker_cleaner.egg-info/PKG-INFO)" "--url=https://arvados.org" "--description=The Arvados Docker image cleaner"
 
+# The Arvados crunchstat-summary tool
+cd $WORKSPACE/packages/$TARGET
+rm -rf "$WORKSPACE/tools/crunchstat-summary/build"
+fpm_build $WORKSPACE/tools/crunchstat-summary ${PYTHON2_PKG_PREFIX}-crunchstat-summary 'Curoverse, Inc.' 'python' "$(awk '($1 == "Version:"){print $2}' $WORKSPACE/tools/crunchstat-summary/crunchstat_summary.egg-info/PKG-INFO)" "--url=https://arvados.org" "--description=Crunchstat-summary reads Arvados Crunch log files and summarizes resource usage"
+
 # Forked libcloud
 LIBCLOUD_DIR=$(mktemp -d)
 (
@@ -462,14 +528,15 @@ for deppkg in "${PYTHON_BACKPORTS[@]}"; do
                 set -e
                 cd "$pyfpm_workdir"
                 pip install "${PIP_DOWNLOAD_SWITCHES[@]}" --download . "$deppkg"
-                tar -xf "$deppkg"-*.tar*
+                # Sometimes pip gives us a tarball, sometimes a zip file...
+                DOWNLOADED=`ls $deppkg-*`
+                [[ "$DOWNLOADED" =~ ".tar" ]] && tar -xf $DOWNLOADED
+                [[ "$DOWNLOADED" =~ ".zip" ]] && unzip $DOWNLOADED
                 cd "$deppkg"-*/
                 "python$PYTHON2_VERSION" setup.py $DASHQ_UNLESS_DEBUG egg_info build
                 chmod -R go+rX .
                 set +e
-                # --iteration 2 provides an upgrade for previously built
-                # buggy packages.
-                fpm_build . "$outname" "" python "" --iteration 2
+                fpm_build . "$outname" "" python "" --iteration 3
                 # The upload step uses the package timestamp to determine
                 # whether it's new.  --no-clobber plays nice with that.
                 mv --no-clobber "$outname"*.$FORMAT "$WORKSPACE/packages/$TARGET"
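One note on the tarball/zip detection added above: with bash's =~ operator, quoted parts of the pattern match literally, so these tests are effectively substring checks (".tar" also matches .tar.gz, .tar.bz2, and so on):

    DOWNLOADED=llfuse-0.41.1.tar.gz
    [[ "$DOWNLOADED" =~ ".tar" ]] && echo tarball   # prints tarball
    DOWNLOADED=mistune-0.7.zip
    [[ "$DOWNLOADED" =~ ".zip" ]] && echo zipfile   # prints zipfile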
diff --git a/build/run-build-test-packages-one-target.sh b/build/run-build-test-packages-one-target.sh
new file mode 100755 (executable)
index 0000000..ff6bad4
--- /dev/null
@@ -0,0 +1,111 @@
+#!/bin/bash
+
+read -rd "\000" helpmessage <<EOF
+$(basename $0): Build, test and (optionally) upload packages for one target
+
+Syntax:
+        WORKSPACE=/path/to/arvados $(basename $0) [options]
+
+--target <target>
+    Distribution to build packages for (default: debian7)
+--upload
+    If the build and test steps are successful, upload the packages
+    to a remote apt repository (default: false)
+
+WORKSPACE=path         Path to the Arvados source tree to build packages from
+
+EOF
+
+if ! [[ -n "$WORKSPACE" ]]; then
+  echo >&2 "$helpmessage"
+  echo >&2
+  echo >&2 "Error: WORKSPACE environment variable not set"
+  echo >&2
+  exit 1
+fi
+
+if ! [[ -d "$WORKSPACE" ]]; then
+  echo >&2 "$helpmessage"
+  echo >&2
+  echo >&2 "Error: $WORKSPACE is not a directory"
+  echo >&2
+  exit 1
+fi
+
+PARSEDOPTS=$(getopt --name "$0" --longoptions \
+    help,upload,target: \
+    -- "" "$@")
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+
+TARGET=debian7
+UPLOAD=0
+
+eval set -- "$PARSEDOPTS"
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --help)
+            echo >&2 "$helpmessage"
+            echo >&2
+            exit 1
+            ;;
+        --target)
+            TARGET="$2"; shift
+            ;;
+        --upload)
+            UPLOAD=1
+            ;;
+        --)
+            if [ $# -gt 1 ]; then
+                echo >&2 "$0: unrecognized argument '$2'. Try: $0 --help"
+                exit 1
+            fi
+            ;;
+    esac
+    shift
+done
+
+exit_cleanly() {
+    trap - INT
+    report_outcomes
+    exit ${#failures[@]}
+}
+
+COLUMNS=80
+. $WORKSPACE/build/run-library.sh
+
+title "Start build packages"
+timer_reset
+
+$WORKSPACE/build/run-build-packages-one-target.sh --target $TARGET
+
+checkexit $? "build packages"
+title "End of build packages (`timer`)"
+
+title "Start test packages"
+timer_reset
+
+if [ ${#failures[@]} -eq 0 ]; then
+  $WORKSPACE/build/run-build-packages-one-target.sh --target $TARGET --test-packages
+else
+  echo "Skipping package upload, there were errors building the packages"
+fi
+
+checkexit $? "test packages"
+title "End of test packages (`timer`)"
+
+if [[ "$UPLOAD" != 0 ]]; then
+  title "Start upload packages"
+  timer_reset
+
+  if [ ${#failures[@]} -eq 0 ]; then
+    /usr/local/arvados-dev/jenkins/run_upload_packages.py -H jenkinsapt@apt.arvados.org -o Port=2222 --workspace $WORKSPACE $TARGET
+  else
+    echo "Skipping package upload, there were errors building and/or testing the packages"
+  fi
+  checkexit $? "upload packages"
+  title "End of upload packages (`timer`)"
+fi
+
+exit_cleanly
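A hypothetical invocation of this wrapper, following its own help text:

    WORKSPACE=/path/to/arvados ./run-build-test-packages-one-target.sh --target centos7 --upload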
index bc76dd6ff30d513138e79483a79aedba0bf8b61e..58a177ec60b0bad3f6e7e36ecbadd7e4b4bcf5ce 100755 (executable)
@@ -178,8 +178,8 @@ fpm_build () {
   # pip).
   PACKAGE=$1
   shift
-  # The name of the package to build.  Defaults to $PACKAGE.
-  PACKAGE_NAME=${1:-$PACKAGE}
+  # The name of the package to build.
+  PACKAGE_NAME=$1
   shift
   # Optional: the vendor of the package.  Should be "Curoverse, Inc." for
   # packages of our own software.  Passed to fpm --vendor.
@@ -192,6 +192,8 @@ fpm_build () {
   VERSION=$1
   shift
 
+  local default_iteration_value="$(default_iteration "$PACKAGE" "$VERSION")"
+
   case "$PACKAGE_TYPE" in
       python)
           # All Arvados Python2 packages depend on Python 2.7.
@@ -199,7 +201,12 @@ fpm_build () {
           set -- "$@" --python-bin python2.7 \
               --python-easyinstall "$EASY_INSTALL2" \
               --python-package-name-prefix "$PYTHON2_PKG_PREFIX" \
+              --prefix "$PYTHON2_PREFIX" \
+              --python-install-lib "$PYTHON2_INSTALL_LIB" \
+              --exclude "${PYTHON2_INSTALL_LIB#/}/tests" \
               --depends "$PYTHON2_PACKAGE"
+          # Fix --iteration for #9242.
+          default_iteration_value=$(($default_iteration_value + 1))
           ;;
       python3)
           # fpm does not actually support a python3 package type.  Instead
@@ -210,26 +217,28 @@ fpm_build () {
           set -- "$@" --python-bin python3 \
               --python-easyinstall "$EASY_INSTALL3" \
               --python-package-name-prefix "$PYTHON3_PKG_PREFIX" \
+              --prefix "$PYTHON3_PREFIX" \
+              --python-install-lib "$PYTHON3_INSTALL_LIB" \
+              --exclude "${PYTHON3_INSTALL_LIB#/}/tests" \
               --depends "$PYTHON3_PACKAGE"
+          # Fix --iteration for #9242.
+          default_iteration_value=$(($default_iteration_value + 1))
           ;;
   esac
 
   declare -a COMMAND_ARR=("fpm" "--maintainer=Ward Vandewege <ward@curoverse.com>" "-s" "$PACKAGE_TYPE" "-t" "$FORMAT")
-  if [ python = "$PACKAGE_TYPE" ]; then
-    COMMAND_ARR+=(--exclude=\*/{dist,site}-packages/tests/\*)
-    if [ deb = "$FORMAT" ]; then
-        # Dependencies are built from setup.py.  Since setup.py will never
-        # refer to Debian package iterations, it doesn't make sense to
-        # enforce those in the .deb dependencies.
-        COMMAND_ARR+=(--deb-ignore-iteration-in-dependencies)
-    fi
+  if [ python = "$PACKAGE_TYPE" ] && [ deb = "$FORMAT" ]; then
+      # Dependencies are built from setup.py.  Since setup.py will never
+      # refer to Debian package iterations, it doesn't make sense to
+      # enforce those in the .deb dependencies.
+      COMMAND_ARR+=(--deb-ignore-iteration-in-dependencies)
   fi
 
   if [[ "${DEBUG:-0}" != "0" ]]; then
     COMMAND_ARR+=('--verbose' '--log' 'info')
   fi
 
-  if [[ "$PACKAGE_NAME" != "$PACKAGE" ]]; then
+  if [[ -n "$PACKAGE_NAME" ]]; then
     COMMAND_ARR+=('-n' "$PACKAGE_NAME")
   fi
 
@@ -242,7 +251,7 @@ fpm_build () {
   fi
   # We can always add an --iteration here.  If another one is specified in $@,
   # that will take precedence, as desired.
-  COMMAND_ARR+=(--iteration "$(default_iteration "$PACKAGE" "$VERSION")")
+  COMMAND_ARR+=(--iteration "$default_iteration_value")
 
   # Append --depends X and other arguments specified by fpm-info.sh in
   # the package source dir. These are added last so they can override
@@ -256,6 +265,9 @@ fpm_build () {
       "${PACKAGE%%=/*}"
       # backports ("llfuse==0.41.1" => "backports/python-llfuse")
       "${WORKSPACE}/backports/${PACKAGE_TYPE}-${PACKAGE%%[<=>]*}")
+  if [[ -n "$PACKAGE_NAME" ]]; then
+      fpm_dirs+=("${WORKSPACE}/backports/${PACKAGE_NAME}")
+  fi
   for pkgdir in "${fpm_dirs[@]}"; do
       fpminfo="$pkgdir/fpm-info.sh"
       if [[ -e "$fpminfo" ]]; then
@@ -342,3 +354,43 @@ install_package() {
     $SUDO yum -q -y install $PACKAGES
   fi
 }
+
+title () {
+    txt="********** $1 **********"
+    printf "\n%*s%s\n\n" $((($COLUMNS-${#txt})/2)) "" "$txt"
+}
+
+checkexit() {
+    if [[ "$1" != "0" ]]; then
+        title "!!!!!! $2 FAILED !!!!!!"
+        failures+=("$2 (`timer`)")
+    else
+        successes+=("$2 (`timer`)")
+    fi
+}
+
+timer_reset() {
+    t0=$SECONDS
+}
+
+timer() {
+    echo -n "$(($SECONDS - $t0))s"
+}
+
+report_outcomes() {
+    for x in "${successes[@]}"
+    do
+        echo "Pass: $x"
+    done
+
+    if [[ ${#failures[@]} == 0 ]]
+    then
+        echo "All test suites passed."
+    else
+        echo "Failures (${#failures[@]}):"
+        for x in "${failures[@]}"
+        do
+            echo "Fail: $x"
+        done
+    fi
+}
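These helpers (title, checkexit, timer_reset, timer, report_outcomes) are now shared via run-library.sh; the consuming scripts above all follow roughly this pattern (some_build_step is a placeholder, not a real command):

    COLUMNS=80
    . "$WORKSPACE/build/run-library.sh"
    title "Start some build step"
    timer_reset
    some_build_step                          # placeholder for the real command
    checkexit $? "some build step"
    title "End of some build step (`timer`)"
    report_outcomes
    exit ${#failures[@]}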
index f56e46ab8b7b75049539c2daaef963aeffb00243..30a80f527afabcee038350a963a077268b02aaa2 100755 (executable)
@@ -2,6 +2,9 @@
 
 . `dirname "$(readlink -f "$0")"`/libcloud-pin
 
+COLUMNS=80
+. `dirname "$(readlink -f "$0")"`/run-library.sh
+
 read -rd "\000" helpmessage <<EOF
 $(basename $0): Install and test Arvados components.
 
@@ -23,6 +26,7 @@ Options:
                You should provide GOPATH, GEMHOME, and VENVDIR options
                from a previous invocation if you use this option.
 --only-install Run specific install step
+--short        Skip (or scale down) some slow tests.
 WORKSPACE=path Arvados source tree to test.
 CONFIGSRC=path Dir with api server config files to copy into source tree.
                (If none given, leave config files alone in source tree.)
@@ -66,6 +70,7 @@ services/fuse
 services/keep-web
 services/keepproxy
 services/keepstore
+services/keep-balance
 services/login-sync
 services/nodemanager
 services/crunch-run
@@ -75,8 +80,10 @@ sdk/cli
 sdk/pam
 sdk/python
 sdk/ruby
+sdk/go/arvados
 sdk/go/arvadosclient
 sdk/go/keepclient
+sdk/go/httpserver
 sdk/go/manifest
 sdk/go/blockdigest
 sdk/go/streamer
@@ -84,6 +91,7 @@ sdk/go/crunchrunner
 sdk/cwl
 tools/crunchstat-summary
 tools/keep-rsync
+tools/keep-block-check
 
 EOF
 
@@ -101,8 +109,7 @@ PYTHONPATH=
 GEMHOME=
 PERLINSTALLBASE=
 
-COLUMNS=80
-
+short=
 skip_install=
 temp=
 temp_preserve=
@@ -124,24 +131,6 @@ fatal() {
     exit 1
 }
 
-report_outcomes() {
-    for x in "${successes[@]}"
-    do
-        echo "Pass: $x"
-    done
-
-    if [[ ${#failures[@]} == 0 ]]
-    then
-        echo "All test suites passed."
-    else
-        echo "Failures (${#failures[@]}):"
-        for x in "${failures[@]}"
-        do
-            echo "Fail: $x"
-        done
-    fi
-}
-
 exit_cleanly() {
     trap - INT
     create-plot-data-from-log.sh $BUILD_NUMBER "$WORKSPACE/apps/workbench/log/test.log" "$WORKSPACE/apps/workbench/log/"
@@ -163,6 +152,8 @@ sanity_checks() {
     echo -n 'go: '
     go version \
         || fatal "No go binary. See http://golang.org/doc/install"
+    [[ $(go version) =~ go1.([0-9]+) ]] && [[ ${BASH_REMATCH[1]} -ge 6 ]] \
+        || fatal "Go >= 1.6 required. See http://golang.org/doc/install"
     echo -n 'gcc: '
     gcc --version | egrep ^gcc \
         || fatal "No gcc. Try: apt-get install build-essential"
@@ -219,6 +210,9 @@ do
         --only)
             only="$1"; skip[$1]=""; shift
             ;;
+        --short)
+            short=1
+            ;;
         --skip-install)
             skip_install=1
             ;;
@@ -401,7 +395,12 @@ setup_virtualenv() {
     if ! [[ -e "$venvdest/bin/activate" ]] || ! [[ -e "$venvdest/bin/pip" ]]; then
         virtualenv --setuptools "$@" "$venvdest" || fatal "virtualenv $venvdest failed"
     fi
-    "$venvdest/bin/pip" install 'setuptools>=18' 'pip>=7'
+    if [[ $("$venvdest/bin/python" --version 2>&1) =~ \ 3\.[012]\. ]]; then
+        # pip 8.0.0 dropped support for python 3.2, e.g., debian wheezy
+        "$venvdest/bin/pip" install 'setuptools>=18' 'pip>=7,<8'
+    else
+        "$venvdest/bin/pip" install 'setuptools>=18' 'pip>=7'
+    fi
     # ubuntu1404 can't seem to install mock via tests_require, but it can do this.
     "$venvdest/bin/pip" install 'mock>=1.0' 'pbr<1.7.0'
 }
@@ -470,23 +469,6 @@ then
     gem install --user-install bundler || fatal 'Could not install bundler'
 fi
 
-checkexit() {
-    if [[ "$1" != "0" ]]; then
-        title "!!!!!! $2 FAILED !!!!!!"
-        failures+=("$2 (`timer`)")
-    else
-        successes+=("$2 (`timer`)")
-    fi
-}
-
-timer_reset() {
-    t0=$SECONDS
-}
-
-timer() {
-    echo -n "$(($SECONDS - $t0))s"
-}
-
 retry() {
     while ! ${@} && [[ "$retry" == 1 ]]
     do
@@ -516,28 +498,29 @@ do_test_once() {
             # before trying "go test". Otherwise, coverage-reporting
             # mode makes Go show the wrong line numbers when reporting
             # compilation errors.
+            go get -t "git.curoverse.com/arvados.git/$1" || return 1
             if [[ -n "${testargs[$1]}" ]]
             then
                 # "go test -check.vv giturl" doesn't work, but this
                 # does:
-                cd "$WORKSPACE/$1" && \
-                    go get -t "git.curoverse.com/arvados.git/$1" && \
-                    go test ${coverflags[@]} ${testargs[$1]}
+                cd "$WORKSPACE/$1" && go test ${short:+-short} ${testargs[$1]}
             else
                 # The above form gets verbose even when testargs is
                 # empty, so use this form in such cases:
-                go get -t "git.curoverse.com/arvados.git/$1" && \
-                    go test ${coverflags[@]} "git.curoverse.com/arvados.git/$1"
+                go test ${short:+-short} ${coverflags[@]} "git.curoverse.com/arvados.git/$1"
             fi
             result="$?"
-            go tool cover -html="$WORKSPACE/tmp/.$covername.tmp" -o "$WORKSPACE/tmp/$covername.html"
-            rm "$WORKSPACE/tmp/.$covername.tmp"
+            if [[ -f "$WORKSPACE/tmp/.$covername.tmp" ]]
+            then
+                go tool cover -html="$WORKSPACE/tmp/.$covername.tmp" -o "$WORKSPACE/tmp/$covername.html"
+                rm "$WORKSPACE/tmp/.$covername.tmp"
+            fi
         elif [[ "$2" == "pip" ]]
         then
             # $3 can name a path directory for us to use, including trailing
             # slash; e.g., the bin/ subdirectory of a virtualenv.
             cd "$WORKSPACE/$1" \
-                && "${3}python" setup.py test ${testargs[$1]}
+                && "${3}python" setup.py ${short:+--short-tests-only} test ${testargs[$1]}
         elif [[ "$2" != "" ]]
         then
             "test_$2"
@@ -599,11 +582,6 @@ do_install_once() {
     fi
 }
 
-title () {
-    txt="********** $1 **********"
-    printf "\n%*s%s\n\n" $((($COLUMNS-${#txt})/2)) "" "$txt"
-}
-
 bundle_install_trylocal() {
     (
         set -e
@@ -729,8 +707,10 @@ do_install services/api apiserver
 
 declare -a gostuff
 gostuff=(
+    sdk/go/arvados
     sdk/go/arvadosclient
     sdk/go/blockdigest
+    sdk/go/httpserver
     sdk/go/manifest
     sdk/go/streamer
     sdk/go/crunchrunner
@@ -739,6 +719,7 @@ gostuff=(
     services/keep-web
     services/keepstore
     sdk/go/keepclient
+    services/keep-balance
     services/keepproxy
     services/datamanager/summary
     services/datamanager/collection
@@ -748,6 +729,7 @@ gostuff=(
     services/crunch-dispatch-slurm
     services/crunch-run
     tools/keep-rsync
+    tools/keep-block-check
     )
 for g in "${gostuff[@]}"
 do
@@ -778,7 +760,7 @@ stop_services
 test_apiserver() {
     rm -f "$WORKSPACE/services/api/git-commit.version"
     cd "$WORKSPACE/services/api" \
-        && RAILS_ENV=test bundle exec rake test TESTOPTS=-v ${testargs[services/api]}
+        && env RAILS_ENV=test ${short:+RAILS_TEST_SHORT=1} bundle exec rake test TESTOPTS=-v ${testargs[services/api]}
 }
 do_test services/api apiserver
 
@@ -824,21 +806,21 @@ done
 test_workbench() {
     start_nginx_proxy_services \
         && cd "$WORKSPACE/apps/workbench" \
-        && RAILS_ENV=test bundle exec rake test TESTOPTS=-v ${testargs[apps/workbench]}
+        && env RAILS_ENV=test ${short:+RAILS_TEST_SHORT=1} bundle exec rake test TESTOPTS=-v ${testargs[apps/workbench]}
 }
 do_test apps/workbench workbench
 
 test_workbench_benchmark() {
     start_nginx_proxy_services \
         && cd "$WORKSPACE/apps/workbench" \
-        && RAILS_ENV=test bundle exec rake test:benchmark ${testargs[apps/workbench_benchmark]}
+        && env RAILS_ENV=test ${short:+RAILS_TEST_SHORT=1} bundle exec rake test:benchmark ${testargs[apps/workbench_benchmark]}
 }
 do_test apps/workbench_benchmark workbench_benchmark
 
 test_workbench_profile() {
     start_nginx_proxy_services \
         && cd "$WORKSPACE/apps/workbench" \
-        && RAILS_ENV=test bundle exec rake test:profile ${testargs[apps/workbench_profile]}
+        && env RAILS_ENV=test ${short:+RAILS_TEST_SHORT=1} bundle exec rake test:profile ${testargs[apps/workbench_profile]}
 }
 do_test apps/workbench_profile workbench_profile
 
index 71c10c9b3613bae0c98540c631bd575d7324a0f2..9e8832bb5553a033fd0847573624203c21d9f5dd 100755 (executable)
@@ -1,2 +1,7 @@
 #!/bin/sh
-exec $TASK_KEEPMOUNT/$JOB_PARAMETER_CRUNCHRUNNER
+
+if test -n "$JOB_PARAMETER_CRUNCHRUNNER" ; then
+    exec $TASK_KEEPMOUNT/$JOB_PARAMETER_CRUNCHRUNNER
+else
+    exec /usr/local/bin/crunchrunner
+fi
diff --git a/crunch_scripts/cwl-runner b/crunch_scripts/cwl-runner
new file mode 100755 (executable)
index 0000000..c786fc1
--- /dev/null
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+
+# Crunch script integration for running arvados-cwl-runner (importing
+# arvados_cwl module) inside a crunch job.
+#
+# This gets the job record, transforms the script parameters into a valid CWL
+# input object, then executes the CWL runner to run the underlying workflow or
+# tool.  When the workflow completes, the output object is recorded in an
+# output collection for this runner job.
+
+import arvados
+import arvados_cwl
+import arvados.collection
+import arvados.util
+from cwltool.process import shortname
+import cwltool.main
+import logging
+import os
+import json
+import argparse
+from arvados.api import OrderedJsonModel
+from cwltool.process import adjustFiles
+from cwltool.load_tool import load_tool
+
+# Print package versions
+logging.info(cwltool.main.versionstring())
+
+api = arvados.api("v1")
+
+try:
+    job_order_object = arvados.current_job()['script_parameters']
+
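+    # Rewrite Keep locators in the script parameters as file:// paths under
+    # the task's Keep mount so the CWL machinery can read them directly.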
+    def keeppath(v):
+        if arvados.util.keep_locator_pattern.match(v):
+            return "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], v)
+        else:
+            return v
+
+    job_order_object["cwl:tool"] = keeppath(job_order_object["cwl:tool"])
+
+    for k,v in job_order_object.items():
+        if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
+            job_order_object[k] = {
+                "class": "File",
+                "path": keeppath(v)
+            }
+
+    adjustFiles(job_order_object, keeppath)
+
+    runner = arvados_cwl.ArvCwlRunner(api_client=arvados.api('v1', model=OrderedJsonModel()))
+
+    t = load_tool(job_order_object, runner.arvMakeTool)
+
+    args = argparse.Namespace()
+    args.project_uuid = arvados.current_job()["owner_uuid"]
+    args.enable_reuse = True
+    args.submit = False
+    args.debug = True
+    args.quiet = False
+    args.ignore_docker_for_reuse = False
+    args.basedir = os.getcwd()
+    args.cwl_runner_job={"uuid": arvados.current_job()["uuid"], "state": arvados.current_job()["state"]}
+    outputObj = runner.arvExecutor(t, job_order_object, **vars(args))
+
+    files = {}
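+    # Group output files by source collection: paths look like
+    # "keep:<collection>/<path>", so strip the "keep:" prefix to recover the
+    # collection identifier.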
+    def capture(path):
+        sp = path.split("/")
+        col = sp[0][5:]
+        if col not in files:
+            files[col] = set()
+        files[col].add("/".join(sp[1:]))
+        return path
+
+    adjustFiles(outputObj, capture)
+
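+    # Merge the referenced files from each source collection into a single
+    # output collection for this runner job.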
+    final = arvados.collection.Collection()
+
+    for k,v in files.iteritems():
+        with arvados.collection.Collection(k) as c:
+            for f in c:
+                final.copy(f, f, c, True)
+
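+    # Rewrite output paths relative to the merged collection by dropping the
+    # leading collection component.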
+    def makeRelative(path):
+        return "/".join(path.split("/")[1:])
+
+    adjustFiles(outputObj, makeRelative)
+
+    with final.open("cwl.output.json", "w") as f:
+        json.dump(outputObj, f, indent=4)
+
+    api.job_tasks().update(uuid=arvados.current_task()['uuid'],
+                                         body={
+                                             'output': final.save_new(create_collection_record=False),
+                                             'success': True,
+                                             'progress':1.0
+                                         }).execute()
+except Exception as e:
+    logging.exception("Unhandled exception")
+    api.job_tasks().update(uuid=arvados.current_task()['uuid'],
+                                         body={
+                                             'output': None,
+                                             'success': False,
+                                             'progress':1.0
+                                         }).execute()
index 08e71f595b2e07bd880e45fae4eab1efdbfa12e8..b5ad5d29e3e164fae8ab3b68cd254621eb860487 100644 (file)
@@ -5,7 +5,7 @@ On older Red Hat-based systems, these packages require the python27 Software Col
 To "enable Software Collections on CentOS":https://wiki.centos.org/AdditionalResources/Repositories/SCL, run:
 
 <notextile>
-<pre><code>~$ <span class="userinput">sudo yum install centos-release-SCL scl-utils</span>
+<pre><code>~$ <span class="userinput">sudo yum install centos-release-scl scl-utils</span>
 </code></pre>
 </notextile>
 
index 5bc7611d0deedede7db25a38af830ffb877dc693..58b6a51cb444c2bc02cc9bf4472c3a474ad55b1e 100644 (file)
@@ -47,6 +47,7 @@ See "Specifying Git versions":#script_version below for more detail about accept
 |arvados_sdk_version|string|Git commit hash that specifies the SDK version to use from the Arvados repository|This is set by searching the Arvados repository for a match for the arvados_sdk_version runtime constraint.|
 |docker_image_locator|string|Portable data hash of the collection that contains the Docker image to use|This is set by searching readable collections for a match for the docker_image runtime constraint.|
 |runtime_constraints|hash|Constraints that must be satisfied by the job/task scheduler in order to run the job.|See below.|
+|components|hash|Name and uuid pairs representing the child work units of this job. The uuids may refer to objects of different types.|Example components hash: @{"name1": "zzzzz-8i9sb-xyz...", "name2": "zzzzz-d1hrv-xyz..."}@|
 
 h3(#script_version). Specifying Git versions
 
index 3ddc7c825819e2799d75ea8e9b94eb7f2bb6a41d..9c8cd49f7e8228f7a7a2ff84c9a6bfdadb984caf 100644 (file)
@@ -26,9 +26,8 @@ h2. Usage
 $ arvbox
 Arvados-in-a-box                      http://arvados.org
 
-arvbox (build|start|run|open|shell|ip|stop|rebuild|reset|destroy|log|svrestart)
-
-build <config>      build arvbox Docker image
+build   <config>      build arvbox Docker image
+rebuild <config>      build arvbox Docker image, no layer cache
 start|run <config>  start arvbox container
 open       open arvbox workbench in a web browser
 shell      enter arvbox shell
@@ -37,7 +36,7 @@ host       print arvbox published host
 status     print some information about current arvbox
 stop       stop arvbox container
 restart <config>  stop, then run again
-rebuild  <config>  stop, build arvbox Docker image, run
+reboot  <config>  stop, build arvbox Docker image, run
 reset      delete arvbox arvados data (be careful!)
 destroy    delete all arvbox code and data (be careful!)
 log <service> tail log of specified service
@@ -122,14 +121,14 @@ h2. Making Arvbox accessible from other hosts
 In "dev" and "localdemo" mode, Arvbox can only be accessed on the same host it is running.  To publish Arvbox service ports to the host's service ports and advertise the host's IP address for services, use @publicdev@ or @publicdemo@:
 
 <pre>
-$ arvbox rebuild publicdemo
+$ arvbox start publicdemo
 </pre>
 
 This attempts to auto-detect the correct IP address to use by taking the IP address of the default route device.  If the auto-detection is wrong, if you want to publish a hostname instead of a raw address, or if you need to access it through a different device (such as a router or firewall), set @ARVBOX_PUBLISH_IP@ to the desired hostname or IP address.
 
 <pre>
 $ export ARVBOX_PUBLISH_IP=example.com
-$ arvbox rebuild publicdemo
+$ arvbox start publicdemo
 </pre>
 
 Note: this expects to bind the host's port 80 (http) for workbench, so you cannot have a conflicting web server already running on the host.  It does not attempt to bind the host's port 22 (ssh); as a result, the arvbox ssh port is not published.
index 9a64ac76d79532d643a895a7df5d2f9971dfd2fc..a2e3e01ecb117aa0170e1349914dfe9b3a07d7ca 100644 (file)
@@ -13,14 +13,14 @@ First, "add the appropriate package repository for your distribution":{{ site.ba
 On Debian-based systems:
 
 <notextile>
-<pre><code>~$ <span class="userinput">sudo apt-get install perl python-virtualenv fuse python-arvados-python-client python-arvados-fuse crunchstat arvados-docker-cleaner iptables ca-certificates</span>
+<pre><code>~$ <span class="userinput">sudo apt-get install perl python-virtualenv fuse python-arvados-python-client python-arvados-fuse crunchrunner crunchstat arvados-docker-cleaner iptables ca-certificates</span>
 </code></pre>
 </notextile>
 
 On Red Hat-based systems:
 
 <notextile>
-<pre><code>~$ <span class="userinput">sudo yum install perl python27-python-virtualenv fuse python27-python-arvados-python-client python27-python-arvados-fuse crunchstat arvados-docker-cleaner iptables ca-certificates</span>
+<pre><code>~$ <span class="userinput">sudo yum install perl python27-python-virtualenv fuse python27-python-arvados-python-client python27-python-arvados-fuse crunchrunner crunchstat arvados-docker-cleaner iptables ca-certificates</span>
 </code></pre>
 </notextile>
 
index 13dfaf6725d40e089759ec62058fcd63ff83c33b..6548422f4f8d0492cfac61a25257c365f238bcde 100644 (file)
@@ -41,12 +41,14 @@ Usage of ./keepstore:
   -azure-storage-account-name="": Azure storage account name used for subsequent --azure-storage-container-volume arguments.
   -azure-storage-container-volume=[]: Use the given container as a storage volume. Can be given multiple times.
   -azure-storage-replication=3: Replication level to report to clients when data is stored in an Azure container.
-  -blob-signature-ttl=1209600: Lifetime of blob permission signatures. See services/api/config/application.default.yml.
+  -blob-signature-ttl=1209600: Lifetime of blob permission signatures. Modifying the ttl will invalidate all existing signatures. See services/api/config/application.default.yml.
   -blob-signing-key-file="": File containing the secret key for generating and verifying blob permission signatures.
   -data-manager-token-file="": File with the API token used by the Data Manager. All DELETE requests or GET /index requests must carry this token.
   -enforce-permissions=false: Enforce permission signatures on requests.
   -listen=":25107": Listening address, in the form "host:port". e.g., 10.0.1.24:8000. Omit the host part to listen on all interfaces.
   -max-buffers=128: Maximum RAM to use for data buffers, given in multiples of block size (64 MiB). When this limit is reached, HTTP requests requiring buffers (like GET and PUT) will wait for buffer space to be released.
+  -max-requests int
+   Maximum concurrent requests. When this limit is reached, new requests will receive 503 responses. Note: this limit does not include idle connections from clients using HTTP keepalive, so it does not strictly limit the number of concurrent connections. (default 2 * max-buffers)
   -never-delete=false: If set, nothing will be deleted. HTTP 405 will be returned for valid DELETE requests.
   -permission-key-file="": Synonym for -blob-signing-key-file.
   -permission-ttl=0: Synonym for -blob-signature-ttl.
index dd5995ffdde442c85f665cb5feae14fd9b0fe879..3d3f4b474d4fa53d00633e63ff2bab97c1066bbd 100644 (file)
@@ -47,14 +47,14 @@ h2. Install the Python SDK and utilities
 On Debian-based systems:
 
 <notextile>
-<pre><code>~$ <span class="userinput">sudo apt-get install python-arvados-python-client python-arvados-fuse</span>
+<pre><code>~$ <span class="userinput">sudo apt-get install python-arvados-python-client python-arvados-fuse crunchrunner</span>
 </code></pre>
 </notextile>
 
 On Red Hat-based systems:
 
 <notextile>
-<pre><code>~$ <span class="userinput">sudo yum install python27-python-arvados-python-client python27-python-arvados-fuse</span>
+<pre><code>~$ <span class="userinput">sudo yum install python27-python-arvados-python-client python27-python-arvados-fuse crunchrunner</span>
 </code></pre>
 </notextile>
 
index 75702960133b3be2555b851babb9d5bf92e4c9a3..8a757d014e6f73919f3f74e8c5eef40faea20585 100644 (file)
@@ -217,7 +217,7 @@ keep-proxy-image: debian-arvados-image $(BUILD) $(KEEP_PROXY_DEPS)
        date >keep-proxy-image
 
 jobs-image: debian-arvados-image $(BUILD) $(JOBS_DEPS)
-       $(DOCKER_BUILD) -t arvados/jobs jobs
+       $(DOCKER_BUILD) --build-arg COMMIT=$(COMMIT) -t arvados/jobs jobs
        date >jobs-image
 
 java-bwa-samtools-image: jobs-image $(BUILD) $(JAVA_BWA_SAMTOOLS_DEPS)
index 948f6c73d4e486973e733c3ea004de3726b4e063..60b5fa4bb6279cf80605d3cfc63343f9ce1e8139 100644 (file)
@@ -6,7 +6,7 @@ MAINTAINER Ward Vandewege <ward@curoverse.com>
 RUN apt-get update -q
 ## 20150915 nico -- fuse.postinst has sporadic failures, splitting this up to see if it helps
 RUN apt-get install -qy fuse
-RUN apt-get install -qy supervisor python-pip python-pyvcf python-gflags python-google-api-python-client python-virtualenv libattr1-dev libfuse-dev python-dev python-llfuse crunchstat python-arvados-fuse cron dnsmasq
+RUN apt-get install -qy supervisor python-pip python-gflags python-google-api-python-client python-virtualenv libattr1-dev libfuse-dev python-dev python-llfuse crunchstat python-arvados-fuse cron dnsmasq
 
 ADD fuse.conf /etc/fuse.conf
 RUN chmod 644 /etc/fuse.conf
index 0d7295873f723e637cf76413e01c16c6a2be5d95..d80c3a882defe43676476df144401eee64d97728 100644 (file)
@@ -1,19 +1,19 @@
-# Based on Debian Wheezy
-FROM arvados/debian:wheezy
+# Based on Debian Jessie
+FROM debian:jessie
 MAINTAINER Ward Vandewege <ward@curoverse.com>
 
 ENV DEBIAN_FRONTEND noninteractive
 
 ADD apt.arvados.org.list /etc/apt/sources.list.d/
 RUN apt-key adv --keyserver pool.sks-keyservers.net --recv 1078ECD7
-RUN apt-get update -q
+RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3
 
-RUN apt-get install -qy git python-pip python-virtualenv python-arvados-python-client python-dev libcurl4-gnutls-dev
+ARG COMMIT=latest
+RUN echo $COMMIT && apt-get update -q
 
-RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3
+RUN apt-get install -qy git python-pip python-virtualenv python-arvados-python-client python-dev libcurl4-gnutls-dev nodejs python-arvados-cwl-runner
 
 # Install dependencies and set up system.
-# The FUSE packages help ensure that we can install the Python SDK (arv-mount).
 RUN /usr/sbin/adduser --disabled-password \
       --gecos 'Crunch execution user' crunch && \
     /usr/bin/install --directory --owner=crunch --group=crunch --mode=0700 /keep /tmp/crunch-src /tmp/crunch-job
index 7eb8716071ac41adeaba2ece2428098002c73ed4..3ae6df42160b2c66025f832fa159e739aa975cb5 100644 (file)
@@ -1,2 +1,2 @@
 # apt.arvados.org
-deb http://apt.arvados.org/ wheezy main
+deb http://apt.arvados.org/ jessie main
index 8f0ed41afaefc9ae4aa86daf86097fbcdcad1712..3e6e3e4c6b34e25bdb304f3fbfc3a1780ed69ef3 100644 (file)
@@ -5,7 +5,7 @@ MAINTAINER Ward Vandewege <ward@curoverse.com>
 
 RUN apt-get update -q
 RUN apt-get install -qy \
-    python-pip python-pyvcf python-gflags python-google-api-python-client \
+    python-pip python-gflags python-google-api-python-client \
     python-virtualenv libattr1-dev libfuse-dev python-dev python-llfuse fuse \
     crunchstat python-arvados-fuse cron vim supervisor openssh-server
 
index 526de4a8cff31b35ed647fcc91b5fa3b7d349f3f..0e11f9639d1aa2b233bffe08356986b1a0e5fe0b 100644 (file)
@@ -25,14 +25,15 @@ Gem::Specification.new do |s|
   s.executables << "arv-tag"
   s.required_ruby_version = '>= 2.1.0'
   s.add_runtime_dependency 'arvados', '~> 0.1', '>= 0.1.20150128223554'
-  s.add_runtime_dependency 'google-api-client', '~> 0.6.3', '>= 0.6.3'
+  # Our google-api-client dependency used to be < 0.9, but that could be
+  # satisfied by the buggy 0.9.pre*.  https://dev.arvados.org/issues/9213
+  s.add_runtime_dependency 'google-api-client', '~> 0.6', '>= 0.6.3', '<0.8.9'
   s.add_runtime_dependency 'activesupport', '~> 3.2', '>= 3.2.13'
   s.add_runtime_dependency 'json', '~> 1.7', '>= 1.7.7'
   s.add_runtime_dependency 'trollop', '~> 2.0'
   s.add_runtime_dependency 'andand', '~> 1.3', '>= 1.3.3'
   s.add_runtime_dependency 'oj', '~> 2.0', '>= 2.0.3'
   s.add_runtime_dependency 'curb', '~> 0.8'
-  s.add_runtime_dependency('jwt', '>= 0.1.5', '< 1.0.0')
   s.homepage    =
     'https://arvados.org'
 end
index 10a9d43961b0e6c630c2320df857650e08c064ac..aa038ac54aec89349ef999ef4cfaa852869e079f 100755 (executable)
@@ -14,24 +14,33 @@ if RUBY_VERSION < '1.9.3' then
 end
 
 begin
-  require 'curb'
-  require 'rubygems'
-  require 'arvados/google_api_client'
   require 'json'
+  require 'net/http'
   require 'pp'
-  require 'trollop'
+  require 'tempfile'
+  require 'yaml'
+rescue LoadError => error
+  abort "Error loading libraries: #{error}\n"
+end
+
+begin
+  require 'rubygems'
+  # Load the gems with more requirements first, so we respect any version
+  # constraints they put on gems loaded later.
+  require 'arvados/google_api_client'
+  require 'active_support/inflector'
   require 'andand'
+  require 'curb'
   require 'oj'
-  require 'active_support/inflector'
-  require 'yaml'
-  require 'tempfile'
-  require 'net/http'
-rescue LoadError
+  require 'trollop'
+rescue LoadError => error
   abort <<-EOS
 
+Error loading gems: #{error}
+
 Please install all required gems:
 
-  gem install activesupport andand curb google-api-client json oj trollop yaml
+  gem install arvados activesupport andand curb json oj trollop
 
   EOS
 end
@@ -570,7 +579,7 @@ def parse_arguments(discovery_document, subcommands)
       end
     end
 
-    discovered_params.each do |k,v|
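+    # Include the resource body itself as an object-typed parameter so it is
+    # parsed the same way as the other object/array parameters below.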
+    discovered_params.merge({resource => {'type' => 'object'}}).each do |k,v|
       k = k.to_sym
       if ['object', 'array'].index(v["type"]) and method_opts.has_key? k
         if method_opts[k].andand.match /^\//
index 70e2f42ede56ab9949f7ee9105c730b83a545966..6dc82c5a20b841b1aeb1400ecdaf7dd6c21d4ed5 100755 (executable)
@@ -17,7 +17,7 @@ begin
   require 'trollop'
   require 'google/api_client'
 rescue LoadError => l
-  puts $:
+  $stderr.puts $:
   abort <<-EOS
 #{$0}: fatal: #{l.message}
 Some runtime dependencies may be missing.
@@ -132,7 +132,7 @@ if $options[:instance]
     abort "#{$0}: syntax error: --instance cannot be combined with --template or --submit."
   end
 elsif not $options[:template]
-  puts "error: you must supply a --template or --instance."
+  $stderr.puts "error: you must supply a --template or --instance."
   p.educate
   abort
 end
index 7c2855c5ea72829d0e8a8cd73a849b6feaa44c56..39238b0fc649d400a380e397c8f0520fbc75b476 100755 (executable)
@@ -416,8 +416,17 @@ if ($docker_locator = $Job->{docker_image_locator}) {
   Log (undef, "docker image hash is $docker_hash");
   $docker_stream =~ s/^\.//;
   my $docker_install_script = qq{
-if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then
-    arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
+if $docker_bin images -q --no-trunc --all | grep -xF \Q$docker_hash\E >/dev/null; then
+    exit 0
+fi
+declare -a exit_codes=("\${PIPESTATUS[@]}")
+if [ 0 != "\${exit_codes[0]}" ]; then
+   exit "\${exit_codes[0]}"  # `docker images` failed
+elif [ 1 != "\${exit_codes[1]}" ]; then
+   exit "\${exit_codes[1]}"  # `grep` encountered an error
+else
+   # Everything worked fine, but grep didn't find the image on this host.
+   arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
 fi
 };
 
@@ -852,7 +861,11 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
         .q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
         .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
         ."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
-        ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP ";
+        ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP "
+        .q{&& declare -a VOLUMES=() }
+        .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner") ; fi }
+        .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt") ; }
+        .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt") ; fi };
 
     $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
     $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
@@ -917,6 +930,10 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
       # For now, use the same approach as TASK_WORK above.
       $ENV{"JOB_WORK"} = "/tmp/crunch-job-work";
 
+      # Bind mount the crunchrunner binary and host TLS certificates file into
+      # the container.
+      $command .= '"${VOLUMES[@]}" ';
+
       while (my ($env_key, $env_val) = each %ENV)
       {
         if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) {
@@ -1145,13 +1162,6 @@ sub reapchildren
                     . $slot[$proc{$pid}->{slot}]->{cpu});
     my $jobstepidx = $proc{$pid}->{jobstepidx};
 
-    if (!WIFEXITED($childstatus))
-    {
-      # child did not exit (may be temporarily stopped)
-      Log ($jobstepidx, "child $pid did not actually exit in reapchildren, ignoring for now.");
-      next;
-    }
-
     $children_reaped++;
     my $elapsed = time - $proc{$pid}->{time};
     my $Jobstep = $jobstep[$jobstepidx];
@@ -1459,6 +1469,9 @@ sub readfrompipes
 sub preprocess_stderr
 {
   my $jobstepidx = shift;
+  # slotindex is only defined for children running Arvados job tasks.
+  # Be prepared to handle the undef case (for setup srun calls, etc.).
+  my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
 
   while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) {
     my $line = $1;
@@ -1468,19 +1481,16 @@ sub preprocess_stderr
       # whoa.
       $main::please_freeze = 1;
     }
-    elsif (!exists $jobstep[$jobstepidx]->{slotindex}) {
-      # Skip the following tempfail checks if this srun proc isn't
-      # attached to a particular worker slot.
-    }
     elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
-      my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
-      $slot[$job_slot_index]->{node}->{fail_count}++;
       $jobstep[$jobstepidx]->{tempfail} = 1;
-      ban_node_by_slot($job_slot_index);
+      if (defined($job_slot_index)) {
+        $slot[$job_slot_index]->{node}->{fail_count}++;
+        ban_node_by_slot($job_slot_index);
+      }
     }
     elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
       $jobstep[$jobstepidx]->{tempfail} = 1;
-      ban_node_by_slot($jobstep[$jobstepidx]->{slotindex});
+      ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
     }
     elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
       $jobstep[$jobstepidx]->{tempfail} = 1;
@@ -1970,6 +1980,11 @@ sub srun_sync
   delete $reader{$jobstepidx};
 
   my $j = pop @jobstep;
+  # If the srun showed signs of tempfail, ensure the caller treats that as a
+  # failure case.
+  if ($main::please_freeze || $j->{tempfail}) {
+    $exited ||= 255;
+  }
   return ($exited, $j->{stdout_captured}, $j->{stderr_captured});
 }
 
index 3dc4bdd434a101507fee3ebd8f2e5004e66cd49c..f7a9dbe41a853412d84a1752ccae0637ef2250d4 100644 (file)
@@ -1,6 +1,7 @@
 require 'minitest/autorun'
 require 'digest/md5'
 require 'active_support/core_ext'
+require 'tempfile'
 
 class TestCollectionCreate < Minitest::Test
   def setup
@@ -16,7 +17,22 @@ class TestCollectionCreate < Minitest::Test
     end
     assert /^([0-9a-z]{5}-4zz18-[0-9a-z]{15})?$/.match(out)
     assert_equal '', err
-    $stderr.puts err
+  end
+
+  def test_read_resource_object_from_file
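+    # Pass the resource object to the CLI as a file path rather than inline
+    # JSON, exercising the read-request-body-from-file code path.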
+    tempfile = Tempfile.new('collection')
+    begin
+      tempfile.write({manifest_text: foo_manifest}.to_json)
+      tempfile.close
+      out, err = capture_subprocess_io do
+        assert_arv('--format', 'uuid',
+                   'collection', 'create', '--collection', tempfile.path)
+      end
+      assert /^([0-9a-z]{5}-4zz18-[0-9a-z]{15})?$/.match(out)
+      assert_equal '', err
+    ensure
+      tempfile.unlink
+    end
   end
 
   protected
diff --git a/sdk/cwl/LICENSE-2.0.txt b/sdk/cwl/LICENSE-2.0.txt
new file mode 100644 (file)
index 0000000..d645695
--- /dev/null
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/sdk/cwl/MANIFEST.in b/sdk/cwl/MANIFEST.in
new file mode 100644 (file)
index 0000000..71dc536
--- /dev/null
@@ -0,0 +1,2 @@
+include LICENSE-2.0.txt
+include README.rst
index 533d24a6981177e2a5d6ba916b367fd0bb1a2e73..371dd4fb8c5252ac7f37bddb240176dd389405d4 100644 (file)
@@ -1,40 +1,47 @@
 #!/usr/bin/env python
 
+# Implement cwl-runner interface for submitting and running jobs on Arvados.
+
 import argparse
 import arvados
-import arvados.events
+import arvados.collection
 import arvados.commands.keepdocker
 import arvados.commands.run
-import arvados.collection
+import arvados.events
 import arvados.util
-import cwltool.draft2tool
-import cwltool.workflow
-import cwltool.main
-from cwltool.process import shortname
-import threading
+import copy
 import cwltool.docker
+from cwltool.draft2tool import revmap_file, remove_hostfs, CommandLineTool
+from cwltool.errors import WorkflowException
+import cwltool.main
+import cwltool.workflow
 import fnmatch
+from functools import partial
+import json
 import logging
-import re
 import os
+import pkg_resources  # part of setuptools
+import re
 import sys
+import threading
+from cwltool.load_tool import fetch_document
+from cwltool.builder import Builder
+import urlparse
 
-from cwltool.process import get_feature
+from cwltool.process import shortname, get_feature, adjustFiles, adjustFileObjs, scandeps
 from arvados.api import OrderedJsonModel
 
 logger = logging.getLogger('arvados.cwl-runner')
 logger.setLevel(logging.INFO)
 
-crunchrunner_pdh = "83db29f08544e1c319572a6bd971088a+140"
-crunchrunner_download = "https://cloud.curoverse.com/collections/download/qr1hi-4zz18-n3m1yxd0vx78jic/1i1u2qtq66k1atziv4ocfgsg5nu5tj11n4r6e0bhvjg03rix4m/crunchrunner"
-certs_download = "https://cloud.curoverse.com/collections/download/qr1hi-4zz18-n3m1yxd0vx78jic/1i1u2qtq66k1atziv4ocfgsg5nu5tj11n4r6e0bhvjg03rix4m/ca-certificates.crt"
-
 tmpdirre = re.compile(r"^\S+ \S+ \d+ \d+ stderr \S+ \S+ crunchrunner: \$\(task\.tmpdir\)=(.*)")
 outdirre = re.compile(r"^\S+ \S+ \d+ \d+ stderr \S+ \S+ crunchrunner: \$\(task\.outdir\)=(.*)")
 keepre = re.compile(r"^\S+ \S+ \d+ \d+ stderr \S+ \S+ crunchrunner: \$\(task\.keep\)=(.*)")
 
 
 def arv_docker_get_image(api_client, dockerRequirement, pull_image, project_uuid):
+    """Check if a Docker image is available in Keep, if not, upload it using arv-keepdocker."""
+
     if "dockerImageId" not in dockerRequirement and "dockerPull" in dockerRequirement:
         dockerRequirement["dockerImageId"] = dockerRequirement["dockerPull"]
 
@@ -52,15 +59,17 @@ def arv_docker_get_image(api_client, dockerRequirement, pull_image, project_uuid
         if image_tag:
             args.append(image_tag)
         logger.info("Uploading Docker image %s", ":".join(args[1:]))
-        arvados.commands.keepdocker.main(args)
+        arvados.commands.keepdocker.main(args, stdout=sys.stderr)
 
     return dockerRequirement["dockerImageId"]
 
 
 class CollectionFsAccess(cwltool.process.StdFsAccess):
+    """Implement the cwltool FsAccess interface for Arvados Collections."""
+
     def __init__(self, basedir):
+        super(CollectionFsAccess, self).__init__(basedir)
         self.collections = {}
-        self.basedir = basedir
 
     def get_collection(self, path):
         p = path.split("/")
@@ -114,6 +123,8 @@ class CollectionFsAccess(cwltool.process.StdFsAccess):
             return os.path.exists(self._abs(fn))
 
 class ArvadosJob(object):
+    """Submit and manage a Crunch job for executing a CWL CommandLineTool."""
+
     def __init__(self, runner):
         self.arvrunner = runner
         self.running = False
@@ -151,6 +162,8 @@ class ArvadosJob(object):
         (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
         if docker_req and kwargs.get("use_container") is not False:
             runtime_constraints["docker_image"] = arv_docker_get_image(self.arvrunner.api, docker_req, pull_image, self.arvrunner.project_uuid)
+        else:
+            runtime_constraints["docker_image"] = "arvados/jobs"
 
         resources = self.builder.resources
         if resources is not None:
@@ -158,24 +171,30 @@ class ArvadosJob(object):
             runtime_constraints["min_ram_mb_per_node"] = resources.get("ram")
             runtime_constraints["min_scratch_mb_per_node"] = resources.get("tmpdirSize", 0) + resources.get("outdirSize", 0)
 
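+        # Only reuse past jobs that ran the expected crunchrunner script
+        # version (and, unless ignore_docker_for_reuse is set, the same
+        # Docker image).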
+        filters = [["repository", "=", "arvados"],
+                   ["script", "=", "crunchrunner"],
+                   ["script_version", "in git", "9e5b98e8f5f4727856b53447191f9c06e3da2ba6"]]
+        if not self.arvrunner.ignore_docker_for_reuse:
+            filters.append(["docker_image_locator", "in docker", runtime_constraints["docker_image"]])
+
         try:
-            response = self.arvrunner.api.jobs().create(body={
-                "owner_uuid": self.arvrunner.project_uuid,
-                "script": "crunchrunner",
-                "repository": "arvados",
-                "script_version": "master",
-                "minimum_script_version": "9e5b98e8f5f4727856b53447191f9c06e3da2ba6",
-                "script_parameters": {"tasks": [script_parameters], "crunchrunner": crunchrunner_pdh+"/crunchrunner"},
-                "runtime_constraints": runtime_constraints
-            }, find_or_create=kwargs.get("enable_reuse", True)).execute(num_retries=self.arvrunner.num_retries)
+            response = self.arvrunner.api.jobs().create(
+                body={
+                    "owner_uuid": self.arvrunner.project_uuid,
+                    "script": "crunchrunner",
+                    "repository": "arvados",
+                    "script_version": "master",
+                    "minimum_script_version": "9e5b98e8f5f4727856b53447191f9c06e3da2ba6",
+                    "script_parameters": {"tasks": [script_parameters]},
+                    "runtime_constraints": runtime_constraints
+                },
+                filters=filters,
+                find_or_create=kwargs.get("enable_reuse", True)
+            ).execute(num_retries=self.arvrunner.num_retries)
 
             self.arvrunner.jobs[response["uuid"]] = self
 
-            self.arvrunner.pipeline["components"][self.name] = {"job": response}
-            self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances().update(uuid=self.arvrunner.pipeline["uuid"],
-                                                                                     body={
-                                                                                         "components": self.arvrunner.pipeline["components"]
-                                                                                     }).execute(num_retries=self.arvrunner.num_retries)
+            self.update_pipeline_component(response)
 
             logger.info("Job %s (%s) is %s", self.name, response["uuid"], response["state"])
 
@@ -186,11 +205,24 @@ class ArvadosJob(object):
             self.output_callback({}, "permanentFail")
 
     def update_pipeline_component(self, record):
-        self.arvrunner.pipeline["components"][self.name] = {"job": record}
-        self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances().update(uuid=self.arvrunner.pipeline["uuid"],
+        if self.arvrunner.pipeline:
+            self.arvrunner.pipeline["components"][self.name] = {"job": record}
+            self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances().update(uuid=self.arvrunner.pipeline["uuid"],
                                                                                  body={
                                                                                     "components": self.arvrunner.pipeline["components"]
                                                                                  }).execute(num_retries=self.arvrunner.num_retries)
+        if self.arvrunner.uuid:
+            try:
+                job = self.arvrunner.api.jobs().get(uuid=self.arvrunner.uuid).execute()
+                if job:
+                    components = job["components"]
+                    components[self.name] = record["uuid"]
+                    self.arvrunner.api.jobs().update(uuid=self.arvrunner.uuid,
+                        body={
+                            "components": components
+                        }).execute(num_retries=self.arvrunner.num_retries)
+            except Exception as e:
+                logger.info("Error adding to components: %s", e)
 
     def done(self, record):
         try:
@@ -212,7 +244,15 @@ class ArvadosJob(object):
                     tmpdir = None
                     outdir = None
                     keepdir = None
-                    for l in log.readlines():
+                    for l in log:
+                        # Determine the tmpdir, outdir and keepdir paths from
+                        # the job run.  Unfortunately, we can't take the first
+                        # values we find (which are expected to be near the
+                        # top) and stop scanning because if the node fails and
+                        # the job restarts on a different node these values
+                        # will differ between runs, and we need to know about
+                        # the final run that actually produced output.
+
                         g = tmpdirre.match(l)
                         if g:
                             tmpdir = g.group(1)
@@ -222,14 +262,49 @@ class ArvadosJob(object):
                         g = keepre.match(l)
                         if g:
                             keepdir = g.group(1)
-                        if tmpdir and outdir and keepdir:
-                            break
+
+                    colname = "Output %s of %s" % (record["output"][0:7], self.name)
+
+                    # check if collection already exists with same owner, name and content
+                    collection_exists = self.arvrunner.api.collections().list(
+                        filters=[["owner_uuid", "=", self.arvrunner.project_uuid],
+                                 ['portable_data_hash', '=', record["output"]],
+                                 ["name", "=", colname]]
+                    ).execute(num_retries=self.arvrunner.num_retries)
+
+                    if not collection_exists["items"]:
+                        # Create a collection located in the same project as the
+                        # pipeline with the contents of the output.
+                        # First, get output record.
+                        collections = self.arvrunner.api.collections().list(
+                            limit=1,
+                            filters=[['portable_data_hash', '=', record["output"]]],
+                            select=["manifest_text"]
+                        ).execute(num_retries=self.arvrunner.num_retries)
+
+                        if not collections["items"]:
+                            raise WorkflowException(
+                                "Job output '%s' cannot be found on API server" % (
+                                    record["output"]))
+
+                        # Create new collection in the parent project
+                        # with the output contents.
+                        self.arvrunner.api.collections().create(body={
+                            "owner_uuid": self.arvrunner.project_uuid,
+                            "name": colname,
+                            "portable_data_hash": record["output"],
+                            "manifest_text": collections["items"][0]["manifest_text"]
+                        }, ensure_unique_name=True).execute(
+                            num_retries=self.arvrunner.num_retries)
 
                     self.builder.outdir = outdir
                     self.builder.pathmapper.keepdir = keepdir
                     outputs = self.collect_outputs("keep:" + record["output"])
+            except WorkflowException as e:
+                logger.error("Error while collecting job outputs:\n%s", e, exc_info=(e if self.arvrunner.debug else False))
+                processStatus = "permanentFail"
             except Exception as e:
-                logger.exception("Got exception while collecting job outputs:")
+                logger.exception("Got unknown exception while collecting job outputs:")
                 processStatus = "permanentFail"
 
             self.output_callback(outputs, processStatus)
@@ -237,23 +312,248 @@ class ArvadosJob(object):
             del self.arvrunner.jobs[record["uuid"]]
 
 
+class RunnerJob(object):
+    """Submit and manage a Crunch job that runs crunch_scripts/cwl-runner."""
+
+    def __init__(self, runner, tool, job_order, enable_reuse):
+        self.arvrunner = runner
+        self.tool = tool
+        self.job_order = job_order
+        self.running = False
+        self.enable_reuse = enable_reuse
+
+    def update_pipeline_component(self, record):
+        pass
+
+    def upload_docker(self, tool):
+        if isinstance(tool, CommandLineTool):
+            (docker_req, docker_is_req) = get_feature(tool, "DockerRequirement")
+            if docker_req:
+                arv_docker_get_image(self.arvrunner.api, docker_req, True, self.arvrunner.project_uuid)
+        elif isinstance(tool, cwltool.workflow.Workflow):
+            for s in tool.steps:
+                self.upload_docker(s.embedded_tool)
+
+    def arvados_job_spec(self, dry_run=False, pull_image=True, **kwargs):
+        """Create an Arvados job specification for this workflow.
+
+        The returned dict can be used to create a job (i.e., passed as
+        the +body+ argument to jobs().create()), or as a component in
+        a pipeline template or pipeline instance.
+        """
+        self.upload_docker(self.tool)
+
+        workflowfiles = set()
+        jobfiles = set()
+        workflowfiles.add(self.tool.tool["id"])
+
+        self.name = os.path.basename(self.tool.tool["id"])
+
+        def visitFiles(files, path):
+            files.add(path)
+            return path
+
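+        # Find every file referenced by the workflow ($import, $include,
+        # $schemas, "run", "path") so it can be uploaded to Keep along with
+        # the workflow document itself.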
+        document_loader, workflowobj, uri = fetch_document(self.tool.tool["id"])
+        def loadref(b, u):
+            return document_loader.fetch(urlparse.urljoin(b, u))
+
+        sc = scandeps(uri, workflowobj,
+                      set(("$import", "run")),
+                      set(("$include", "$schemas", "path")),
+                      loadref)
+        adjustFiles(sc, partial(visitFiles, workflowfiles))
+        adjustFiles(self.job_order, partial(visitFiles, jobfiles))
+
+        workflowmapper = ArvPathMapper(self.arvrunner, workflowfiles, "",
+                                       "%s",
+                                       "%s/%s",
+                                       name=self.name,
+                                       **kwargs)
+
+        jobmapper = ArvPathMapper(self.arvrunner, jobfiles, "",
+                                  "%s",
+                                  "%s/%s",
+                                  name=os.path.basename(self.job_order.get("id", "#")),
+                                  **kwargs)
+
+        adjustFiles(self.job_order, lambda p: jobmapper.mapper(p)[1])
+
+        if "id" in self.job_order:
+            del self.job_order["id"]
+
+        self.job_order["cwl:tool"] = workflowmapper.mapper(self.tool.tool["id"])[1]
+        return {
+            "script": "cwl-runner",
+            "script_version": "master",
+            "repository": "arvados",
+            "script_parameters": self.job_order,
+            "runtime_constraints": {
+                "docker_image": "arvados/jobs"
+            }
+        }
+
+    def run(self, *args, **kwargs):
+        job_spec = self.arvados_job_spec(*args, **kwargs)
+        job_spec.setdefault("owner_uuid", self.arvrunner.project_uuid)
+
+        response = self.arvrunner.api.jobs().create(
+            body=job_spec,
+            find_or_create=self.enable_reuse
+        ).execute(num_retries=self.arvrunner.num_retries)
+
+        self.uuid = response["uuid"]
+        self.arvrunner.jobs[self.uuid] = self
+
+        logger.info("Submitted job %s", response["uuid"])
+
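+        # When submitting, also record the runner job as the single component
+        # of a new pipeline instance.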
+        if kwargs.get("submit"):
+            self.pipeline = self.arvrunner.api.pipeline_instances().create(
+                body={
+                    "owner_uuid": self.arvrunner.project_uuid,
+                    "name": shortname(self.tool.tool["id"]),
+                    "components": {"cwl-runner": {"job": {"uuid": self.uuid, "state": response["state"]} } },
+                    "state": "RunningOnClient"}).execute(num_retries=self.arvrunner.num_retries)
+
+        if response["state"] in ("Complete", "Failed", "Cancelled"):
+            self.done(response)
+
+    def done(self, record):
+        if record["state"] == "Complete":
+            processStatus = "success"
+        else:
+            processStatus = "permanentFail"
+
+        outputs = None
+        try:
+            try:
+                outc = arvados.collection.Collection(record["output"])
+                with outc.open("cwl.output.json") as f:
+                    outputs = json.load(f)
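+                # Relative paths in cwl.output.json refer to files in this
+                # job's output collection; rewrite them as keep: references.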
+                def keepify(path):
+                    if not path.startswith("keep:"):
+                        return "keep:%s/%s" % (record["output"], path)
+                adjustFiles(outputs, keepify)
+            except Exception as e:
+                logger.error("While getting final output object: %s", e)
+            self.arvrunner.output_callback(outputs, processStatus)
+        finally:
+            del self.arvrunner.jobs[record["uuid"]]
+
+
+class RunnerTemplate(object):
+    """An Arvados pipeline template that invokes a CWL workflow."""
+
+    type_to_dataclass = {
+        'boolean': 'boolean',
+        'File': 'File',
+        'float': 'number',
+        'int': 'number',
+        'string': 'text',
+    }
+
+    def __init__(self, runner, tool, job_order, enable_reuse):
+        self.runner = runner
+        self.tool = tool
+        self.job = RunnerJob(
+            runner=runner,
+            tool=tool,
+            job_order=job_order,
+            enable_reuse=enable_reuse)
+
+    def pipeline_component_spec(self):
+        """Return a component that Workbench and a-r-p-i will understand.
+
+        Specifically, translate CWL input specs to Arvados pipeline
+        format, like {"dataclass":"File","value":"xyz"}.
+        """
+        spec = self.job.arvados_job_spec()
+
+        # Most of the component spec is exactly the same as the job
+        # spec (script, script_version, etc.).
+        # spec['script_parameters'] isn't right, though. A component
+        # spec's script_parameters hash is a translation of
+        # self.tool.tool['inputs'] with defaults/overrides taken from
+        # the job order. So we move the job parameters out of the way
+        # and build a new spec['script_parameters'].
+        job_params = spec['script_parameters']
+        spec['script_parameters'] = {}
+
+        for param in self.tool.tool['inputs']:
+            param = copy.deepcopy(param)
+
+            # Data type and "required" flag...
+            types = param['type']
+            if not isinstance(types, list):
+                types = [types]
+            param['required'] = 'null' not in types
+            non_null_types = set(types) - set(['null'])
+            if len(non_null_types) == 1:
+                the_type = [c for c in non_null_types][0]
+                dataclass = self.type_to_dataclass.get(the_type)
+                if dataclass:
+                    param['dataclass'] = dataclass
+            # Note: If we don't figure out a single appropriate
+            # dataclass, we just leave that attribute out.  We leave
+            # the "type" attribute there in any case, which might help
+            # downstream.
+
+            # Title and description...
+            title = param.pop('label', '')
+            descr = param.pop('description', '').rstrip('\n')
+            if title:
+                param['title'] = title
+            if descr:
+                param['description'] = descr
+
+            # Fill in the value from the current job order, if any.
+            param_id = shortname(param.pop('id'))
+            value = job_params.get(param_id)
+            if value is None:
+                pass
+            elif not isinstance(value, dict):
+                param['value'] = value
+            elif param.get('dataclass') == 'File' and value.get('path'):
+                param['value'] = value['path']
+
+            spec['script_parameters'][param_id] = param
+        spec['script_parameters']['cwl:tool'] = job_params['cwl:tool']
+        return spec
+
+    def save(self):
+        job_spec = self.pipeline_component_spec()
+        response = self.runner.api.pipeline_templates().create(body={
+            "components": {
+                self.job.name: job_spec,
+            },
+            "name": self.job.name,
+            "owner_uuid": self.runner.project_uuid,
+        }, ensure_unique_name=True).execute(num_retries=self.runner.num_retries)
+        self.uuid = response["uuid"]
+        logger.info("Created template %s", self.uuid)
+
+
 class ArvPathMapper(cwltool.pathmapper.PathMapper):
-    def __init__(self, arvrunner, referenced_files, basedir, **kwargs):
+    """Convert container-local paths to and from Keep collection ids."""
+
+    def __init__(self, arvrunner, referenced_files, input_basedir,
+                 collection_pattern, file_pattern, name=None, **kwargs):
         self._pathmap = arvrunner.get_uploaded()
-        uploadfiles = []
+        uploadfiles = set()
 
         pdh_path = re.compile(r'^keep:[0-9a-f]{32}\+\d+/.+')
 
         for src in referenced_files:
             if isinstance(src, basestring) and pdh_path.match(src):
-                self._pathmap[src] = (src, "$(task.keep)/%s" % src[5:])
+                self._pathmap[src] = (src, collection_pattern % src[5:])
+            if "#" in src:
+                src = src[:src.index("#")]
             if src not in self._pathmap:
-                ab = cwltool.pathmapper.abspath(src, basedir)
-                st = arvados.commands.run.statfile("", ab, fnPattern="$(task.keep)/%s/%s")
+                ab = cwltool.pathmapper.abspath(src, input_basedir)
+                st = arvados.commands.run.statfile("", ab, fnPattern=file_pattern)
                 if kwargs.get("conformance_test"):
                     self._pathmap[src] = (src, ab)
                 elif isinstance(st, arvados.commands.run.UploadFile):
-                    uploadfiles.append((src, ab, st))
+                    uploadfiles.add((src, ab, st))
                 elif isinstance(st, arvados.commands.run.ArvFile):
                     self._pathmap[src] = (ab, st.fn)
                 else:
@@ -264,7 +564,8 @@ class ArvPathMapper(cwltool.pathmapper.PathMapper):
                                              arvrunner.api,
                                              dry_run=kwargs.get("dry_run"),
                                              num_retries=3,
-                                             fnPattern="$(task.keep)/%s/%s",
+                                             fnPattern=file_pattern,
+                                             name=name,
                                              project=arvrunner.project_uuid)
 
         for src, ab, st in uploadfiles:
@@ -282,7 +583,9 @@ class ArvPathMapper(cwltool.pathmapper.PathMapper):
             return super(ArvPathMapper, self).reversemap(target)
 
 
-class ArvadosCommandTool(cwltool.draft2tool.CommandLineTool):
+class ArvadosCommandTool(CommandLineTool):
+    """Wrap cwltool CommandLineTool to override selected methods."""
+
     def __init__(self, arvrunner, toolpath_object, **kwargs):
         super(ArvadosCommandTool, self).__init__(toolpath_object, **kwargs)
         self.arvrunner = arvrunner
@@ -290,11 +593,17 @@ class ArvadosCommandTool(cwltool.draft2tool.CommandLineTool):
     def makeJobRunner(self):
         return ArvadosJob(self.arvrunner)
 
-    def makePathMapper(self, reffiles, input_basedir, **kwargs):
-        return ArvPathMapper(self.arvrunner, reffiles, input_basedir, **kwargs)
+    def makePathMapper(self, reffiles, **kwargs):
+        return ArvPathMapper(self.arvrunner, reffiles, kwargs["basedir"],
+                             "$(task.keep)/%s",
+                             "$(task.keep)/%s/%s",
+                             **kwargs)
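The two patterns passed to ArvPathMapper control how mapped paths appear inside the crunch task environment: a reference that is already in Keep is formatted with the collection pattern, while files that still need uploading are rendered with the file pattern via arv-run's statfile/uploadfiles helpers. A rough sketch (the locator is a placeholder):

    # already in Keep: "keep:99999999999999999999999999999992+99/blorp.txt"
    #   -> "$(task.keep)/99999999999999999999999999999992+99/blorp.txt"
    collection_pattern = "$(task.keep)/%s"
    # uploaded files are rendered with the destination collection and file name
    file_pattern = "$(task.keep)/%s/%s"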
 
 
 class ArvCwlRunner(object):
+    """Execute a CWL tool or workflow, submit crunch jobs, wait for them to
+    complete, and report output."""
+
     def __init__(self, api_client):
         self.api = api_client
         self.jobs = {}
@@ -303,6 +612,7 @@ class ArvCwlRunner(object):
         self.final_output = None
         self.uploaded = {}
         self.num_retries = 4
+        self.uuid = None
 
     def arvMakeTool(self, toolpath_object, **kwargs):
         if "class" in toolpath_object and toolpath_object["class"] == "CommandLineTool":
@@ -313,16 +623,17 @@ class ArvCwlRunner(object):
     def output_callback(self, out, processStatus):
         if processStatus == "success":
             logger.info("Overall job status is %s", processStatus)
-            self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
-                                                 body={"state": "Complete"}).execute(num_retries=self.num_retries)
+            if self.pipeline:
+                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
+                                                     body={"state": "Complete"}).execute(num_retries=self.num_retries)
 
         else:
             logger.warn("Overall job status is %s", processStatus)
-            self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
-                                                 body={"state": "Failed"}).execute(num_retries=self.num_retries)
+            if self.pipeline:
+                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
+                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
         self.final_output = out
 
-
     def on_message(self, event):
         if "object_uuid" in event:
             if event["object_uuid"] in self.jobs and event["event_type"] == "update":
@@ -350,52 +661,64 @@ class ArvCwlRunner(object):
     def add_uploaded(self, src, pair):
         self.uploaded[src] = pair
 
-    def arvExecutor(self, tool, job_order, input_basedir, args, **kwargs):
-        events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message)
+    def arvExecutor(self, tool, job_order, **kwargs):
+        self.debug = kwargs.get("debug")
 
-        try:
-            self.api.collections().get(uuid=crunchrunner_pdh).execute()
-        except arvados.errors.ApiError as e:
-            import httplib2
-            h = httplib2.Http(ca_certs=arvados.util.ca_certs_path())
-            resp, content = h.request(crunchrunner_download, "GET")
-            resp2, content2 = h.request(certs_download, "GET")
-            with arvados.collection.Collection() as col:
-                with col.open("crunchrunner", "w") as f:
-                    f.write(content)
-                with col.open("ca-certificates.crt", "w") as f:
-                    f.write(content2)
-
-                col.save_new("crunchrunner binary", ensure_unique_name=True)
-
-        self.fs_access = CollectionFsAccess(input_basedir)
+        if kwargs.get("quiet"):
+            logger.setLevel(logging.WARN)
+            logging.getLogger('arvados.arv-run').setLevel(logging.WARN)
 
-        kwargs["fs_access"] = self.fs_access
-        kwargs["enable_reuse"] = args.enable_reuse
+        useruuid = self.api.users().current().execute()["uuid"]
+        self.project_uuid = kwargs.get("project_uuid") if kwargs.get("project_uuid") else useruuid
+        self.pipeline = None
 
-        kwargs["outdir"] = "$(task.outdir)"
-        kwargs["tmpdir"] = "$(task.tmpdir)"
+        if kwargs.get("create_template"):
+            tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse"))
+            tmpl.save()
+            # cwltool.main will write our return value to stdout.
+            return tmpl.uuid
 
-        useruuid = self.api.users().current().execute()["uuid"]
-        self.project_uuid = args.project_uuid if args.project_uuid else useruuid
+        if kwargs.get("submit"):
+            runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse"))
 
-        if kwargs.get("conformance_test"):
-            return cwltool.main.single_job_executor(tool, job_order, input_basedir, args, **kwargs)
-        else:
+        if not kwargs.get("submit") and "cwl_runner_job" not in kwargs:
+            # Create pipeline for local run
             self.pipeline = self.api.pipeline_instances().create(
                 body={
                     "owner_uuid": self.project_uuid,
                     "name": shortname(tool.tool["id"]),
                     "components": {},
                     "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
-
             logger.info("Pipeline instance %s", self.pipeline["uuid"])
 
-            jobiter = tool.job(job_order,
-                               input_basedir,
-                               self.output_callback,
-                               docker_outdir="$(task.outdir)",
-                               **kwargs)
+        if kwargs.get("submit") and not kwargs.get("wait"):
+                runnerjob.run()
+                return runnerjob.uuid
+
+        events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message)
+
+        self.debug = kwargs.get("debug")
+        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")
+        self.fs_access = CollectionFsAccess(kwargs["basedir"])
+
+        kwargs["fs_access"] = self.fs_access
+        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
+
+        kwargs["outdir"] = "$(task.outdir)"
+        kwargs["tmpdir"] = "$(task.tmpdir)"
+
+        if kwargs.get("conformance_test"):
+            return cwltool.main.single_job_executor(tool, job_order, **kwargs)
+        else:
+            if kwargs.get("submit"):
+                jobiter = iter((runnerjob,))
+            else:
+                if "cwl_runner_job" in kwargs:
+                    self.uuid = kwargs.get("cwl_runner_job").get('uuid')
+                jobiter = tool.job(job_order,
+                                   self.output_callback,
+                                   docker_outdir="$(task.outdir)",
+                                   **kwargs)
 
             try:
                 self.cond.acquire()
@@ -417,27 +740,55 @@ class ArvCwlRunner(object):
                     self.cond.wait(1)
 
                 events.close()
-
-                if self.final_output is None:
-                    raise cwltool.workflow.WorkflowException("Workflow did not return a result.")
-
-                # create final output collection
             except:
                 if sys.exc_info()[0] is KeyboardInterrupt:
                     logger.error("Interrupted, marking pipeline as failed")
                 else:
-                    logger.exception("Caught unhandled exception, marking pipeline as failed")
-                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
-                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
+                    logger.error("Caught unhandled exception, marking pipeline as failed.  Error was: %s", sys.exc_info()[0], exc_info=(sys.exc_info()[1] if self.debug else False))
+                if self.pipeline:
+                    self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
+                                                         body={"state": "Failed"}).execute(num_retries=self.num_retries)
             finally:
                 self.cond.release()
 
+            if self.final_output is None:
+                raise cwltool.workflow.WorkflowException("Workflow did not return a result.")
+
             return self.final_output
 
+def versionstring():
+    """Print version string of key packages for provenance and debugging."""
+
+    arvcwlpkg = pkg_resources.require("arvados-cwl-runner")
+    arvpkg = pkg_resources.require("arvados-python-client")
+    cwlpkg = pkg_resources.require("cwltool")
+
+    return "%s %s, %s %s, %s %s" % (sys.argv[0], arvcwlpkg[0].version,
+                                    "arvados-python-client", arvpkg[0].version,
+                                    "cwltool", cwlpkg[0].version)
+
+def arg_parser():  # type: () -> argparse.ArgumentParser
+    parser = argparse.ArgumentParser(description='Arvados executor for Common Workflow Language')
+
+    parser.add_argument("--conformance-test", action="store_true")
+    parser.add_argument("--basedir", type=str,
+                        help="Base directory used to resolve relative references in the input, default to directory of input object file or current directory (if inputs piped/provided on command line).")
+    parser.add_argument("--outdir", type=str, default=os.path.abspath('.'),
+                        help="Output directory, default current directory")
+
+    parser.add_argument("--eval-timeout",
+                        help="Time to wait for a Javascript expression to evaluate before giving an error, default 20s.",
+                        type=float,
+                        default=20)
+    parser.add_argument("--version", action="store_true", help="Print version and exit")
+
+    exgroup = parser.add_mutually_exclusive_group()
+    exgroup.add_argument("--verbose", action="store_true", help="Default logging")
+    exgroup.add_argument("--quiet", action="store_true", help="Only print warnings and errors.")
+    exgroup.add_argument("--debug", action="store_true", help="Print even more logging")
+
+    parser.add_argument("--tool-help", action="store_true", help="Print command line help for tool")
 
-def main(args, stdout, stderr, api_client=None):
-    args.insert(0, "--leave-outputs")
-    parser = cwltool.main.arg_parser()
     exgroup = parser.add_mutually_exclusive_group()
     exgroup.add_argument("--enable-reuse", action="store_true",
                         default=True, dest="enable_reuse",
@@ -445,12 +796,50 @@ def main(args, stdout, stderr, api_client=None):
     exgroup.add_argument("--disable-reuse", action="store_false",
                         default=True, dest="enable_reuse",
                         help="")
-    parser.add_argument("--project-uuid", type=str, help="Project that will own the workflow jobs")
+
+    parser.add_argument("--project-uuid", type=str, help="Project that will own the workflow jobs, if not provided, will go to home project.")
+    parser.add_argument("--ignore-docker-for-reuse", action="store_true",
+                        help="Ignore Docker image version when deciding whether to reuse past jobs.",
+                        default=False)
+
+    exgroup = parser.add_mutually_exclusive_group()
+    exgroup.add_argument("--submit", action="store_true", help="Submit workflow to run on Arvados.",
+                        default=True, dest="submit")
+    exgroup.add_argument("--local", action="store_false", help="Run workflow on local host (submits jobs to Arvados).",
+                        default=True, dest="submit")
+    exgroup.add_argument("--create-template", action="store_true", help="Create an Arvados pipeline template.")
+
+    exgroup = parser.add_mutually_exclusive_group()
+    exgroup.add_argument("--wait", action="store_true", help="After submitting workflow runner job, wait for completion.",
+                        default=True, dest="wait")
+    exgroup.add_argument("--no-wait", action="store_false", help="Submit workflow runner job and exit.",
+                        default=True, dest="wait")
+
+    parser.add_argument("workflow", type=str, nargs="?", default=None)
+    parser.add_argument("job_order", nargs=argparse.REMAINDER)
+
+    return parser
+
+def main(args, stdout, stderr, api_client=None):
+    parser = arg_parser()
+
+    job_order_object = None
+    arvargs = parser.parse_args(args)
+    if arvargs.create_template and not arvargs.job_order:
+        job_order_object = ({}, "")
 
     try:
-        runner = ArvCwlRunner(api_client=arvados.api('v1', model=OrderedJsonModel()))
+        if api_client is None:
+            api_client=arvados.api('v1', model=OrderedJsonModel())
+        runner = ArvCwlRunner(api_client)
     except Exception as e:
         logger.error(e)
         return 1
 
-    return cwltool.main.main(args, executor=runner.arvExecutor, makeTool=runner.arvMakeTool, parser=parser)
+    return cwltool.main.main(args=arvargs,
+                             stdout=stdout,
+                             stderr=stderr,
+                             executor=runner.arvExecutor,
+                             makeTool=runner.arvMakeTool,
+                             versionfunc=versionstring,
+                             job_order_object=job_order_object)
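Taken together, the new options support three modes of operation; these invocations mirror the test cases below (workflow.cwl, job.json, and the project UUID are placeholders):

    # Drive the workflow from the local host, submitting individual crunch jobs:
    arvados-cwl-runner --local workflow.cwl job.json

    # Submit a single cwl-runner job that drives the workflow on Arvados, then exit:
    arvados-cwl-runner --submit --no-wait workflow.cwl job.json

    # Create a pipeline template owned by a specific project:
    arvados-cwl-runner --create-template --project-uuid zzzzz-j7d0g-zzzzzzzzzzzzzzz workflow.cwl job.json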
index 3fc7433adfcd054c1e28bcd11acb49807506732e..591bddefeef822b47319c79ddac195530725315d 100644 (file)
@@ -30,8 +30,11 @@ setup(name='arvados-cwl-runner',
           'bin/arvados-cwl-runner'
       ],
       install_requires=[
-          'cwltool>=1.0.20160311170456',
-          'arvados-python-client>=0.1.20160219154918'
+          'cwltool==1.0.20160519182434',
+          'arvados-python-client>=0.1.20160322001610'
+      ],
+      data_files=[
+          ('share/doc/arvados-cwl-runner', ['LICENSE-2.0.txt', 'README.rst']),
       ],
       test_suite='tests',
       tests_require=['mock>=1.0'],
index aef27001e00e1a2b330296a3eafa8decea22f518..bee193858410581801ca2308f8b4045f8dea0179 100755 (executable)
@@ -23,6 +23,10 @@ while test -n "$1" ; do
             config=$2
             shift ; shift
             ;;
+        -h|--help)
+            echo "$0 [--no-reset-container] [--leave-running] [--config dev|localdemo]"
+            exit
+            ;;
         -*)
             break
             ;;
@@ -58,6 +62,9 @@ git pull
 export ARVADOS_API_HOST=localhost:8000
 export ARVADOS_API_HOST_INSECURE=1
 export ARVADOS_API_TOKEN=\$(cat /var/lib/arvados/superuser_token)
+
+arv-keepdocker --pull arvados/jobs
+
 env
 exec ./run_test.sh "$@"
 EOF
diff --git a/sdk/cwl/tests/input/blorp.txt b/sdk/cwl/tests/input/blorp.txt
new file mode 100644 (file)
index 0000000..09fc24d
--- /dev/null
@@ -0,0 +1 @@
+blopper blubber
diff --git a/sdk/cwl/tests/matcher.py b/sdk/cwl/tests/matcher.py
new file mode 100644 (file)
index 0000000..d3c9316
--- /dev/null
@@ -0,0 +1,23 @@
+import difflib
+import json
+
+
+class JsonDiffMatcher(object):
+    """Raise AssertionError with a readable JSON diff when not __eq__().
+
+    Used with assert_called_with() so it's possible for a human to see
+    the differences between expected and actual call arguments that
+    include non-trivial data structures.
+    """
+    def __init__(self, expected):
+        self.expected = expected
+
+    def __eq__(self, actual):
+        expected_json = json.dumps(self.expected, sort_keys=True, indent=2)
+        actual_json = json.dumps(actual, sort_keys=True, indent=2)
+        if expected_json != actual_json:
+            raise AssertionError("".join(difflib.context_diff(
+                expected_json.splitlines(1),
+                actual_json.splitlines(1),
+                fromfile="Expected", tofile="Actual")))
+        return True
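In practice the matcher stands in for the expected argument in assert_called_with, so a mismatch raises with a readable JSON diff instead of one long repr; a minimal sketch (names as used in test_submit.py below):

    api.pipeline_templates().create.assert_called_with(
        body=JsonDiffMatcher(expect_template), ensure_unique_name=True)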
diff --git a/sdk/cwl/tests/order/empty_order.json b/sdk/cwl/tests/order/empty_order.json
new file mode 100644 (file)
index 0000000..0967ef4
--- /dev/null
@@ -0,0 +1 @@
+{}
diff --git a/sdk/cwl/tests/order/inputs_test_order.json b/sdk/cwl/tests/order/inputs_test_order.json
new file mode 100644 (file)
index 0000000..8830523
--- /dev/null
@@ -0,0 +1,9 @@
+{
+    "fileInput": {
+        "class": "File",
+        "path": "../input/blorp.txt"
+    },
+    "boolInput": true,
+    "floatInput": 1.234,
+    "optionalFloatInput": null
+}
diff --git a/sdk/cwl/tests/submit_test_job.json b/sdk/cwl/tests/submit_test_job.json
new file mode 100644 (file)
index 0000000..95ff0ff
--- /dev/null
@@ -0,0 +1,6 @@
+{
+    "x": {
+        "class": "File",
+        "path": "input/blorp.txt"
+    }
+}
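When this job order is submitted, the local path above is uploaded to Keep and rewritten as a collection-relative path in the job's script_parameters, roughly (the hash is the stubbed placeholder used by the tests below):

    "x": {
        "class": "File",
        "path": "99999999999999999999999999999992+99/blorp.txt"
    }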
index 56f311026ee58ddbbe9d2dc40f943b9c1cf6654d..dba65b0f8a884c207c29656d9f5d0107479f0b7a 100644 (file)
@@ -1,6 +1,14 @@
-import unittest
-import mock
 import arvados_cwl
+import logging
+import mock
+import unittest
+import os
+import cwltool.process
+
+if not os.getenv('ARVADOS_DEBUG'):
+    logging.getLogger('arvados.cwl-runner').setLevel(logging.WARN)
+    logging.getLogger('arvados.arv-run').setLevel(logging.WARN)
+
 
 class TestJob(unittest.TestCase):
 
@@ -9,41 +17,54 @@ class TestJob(unittest.TestCase):
     def test_run(self):
         runner = mock.MagicMock()
         runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+        runner.ignore_docker_for_reuse = False
+        document_loader, avsc_names, schema_metadata, metaschema_loader = cwltool.process.get_schema("draft-3")
+
         tool = {
             "inputs": [],
             "outputs": [],
             "baseCommand": "ls"
         }
-        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool)
+        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, avsc_names=avsc_names, basedir="")
         arvtool.formatgraph = None
-        for j in arvtool.job({}, "", mock.MagicMock()):
+        for j in arvtool.job({}, mock.MagicMock(), basedir=""):
             j.run()
-        runner.api.jobs().create.assert_called_with(body={
-            'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
-            'runtime_constraints': {},
-            'script_parameters': {
-                'tasks': [{
-                    'task.env': {'TMPDIR': '$(task.tmpdir)'},
-                    'command': ['ls']
-                }],
-                'crunchrunner': '83db29f08544e1c319572a6bd971088a+140/crunchrunner'
-            },
-            'script_version': 'master',
-            'minimum_script_version': '9e5b98e8f5f4727856b53447191f9c06e3da2ba6',
-            'repository': 'arvados',
-            'script': 'crunchrunner',
-            'runtime_constraints': {
-                'min_cores_per_node': 1,
-                'min_ram_mb_per_node': 1024,
-                'min_scratch_mb_per_node': 2048 # tmpdirSize + outdirSize
-            }
-        }, find_or_create=True)
+            runner.api.jobs().create.assert_called_with(
+                body={
+                    'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
+                    'runtime_constraints': {},
+                    'script_parameters': {
+                        'tasks': [{
+                            'task.env': {'TMPDIR': '$(task.tmpdir)'},
+                            'command': ['ls']
+                        }],
+                    },
+                    'script_version': 'master',
+                    'minimum_script_version': '9e5b98e8f5f4727856b53447191f9c06e3da2ba6',
+                    'repository': 'arvados',
+                    'script': 'crunchrunner',
+                    'runtime_constraints': {
+                        'docker_image': 'arvados/jobs',
+                        'min_cores_per_node': 1,
+                        'min_ram_mb_per_node': 1024,
+                        'min_scratch_mb_per_node': 2048 # tmpdirSize + outdirSize
+                    }
+                },
+                find_or_create=True,
+                filters=[['repository', '=', 'arvados'],
+                         ['script', '=', 'crunchrunner'],
+                         ['script_version', 'in git', '9e5b98e8f5f4727856b53447191f9c06e3da2ba6'],
+                         ['docker_image_locator', 'in docker', 'arvados/jobs']]
+            )
 
     # The test passes some fields in builder.resources
     # For the remaining fields, the defaults will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
     def test_resource_requirements(self):
         runner = mock.MagicMock()
         runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+        runner.ignore_docker_for_reuse = False
+        document_loader, avsc_names, schema_metadata, metaschema_loader = cwltool.process.get_schema("draft-3")
+
         tool = {
             "inputs": [],
             "outputs": [],
@@ -55,27 +76,111 @@ class TestJob(unittest.TestCase):
             }],
             "baseCommand": "ls"
         }
-        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool)
+        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, avsc_names=avsc_names)
         arvtool.formatgraph = None
-        for j in arvtool.job({}, "", mock.MagicMock()):
+        for j in arvtool.job({}, mock.MagicMock(), basedir=""):
             j.run()
-        runner.api.jobs().create.assert_called_with(body={
-            'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
-            'runtime_constraints': {},
-            'script_parameters': {
-                'tasks': [{
-                    'task.env': {'TMPDIR': '$(task.tmpdir)'},
-                    'command': ['ls']
-                }],
-                'crunchrunner': '83db29f08544e1c319572a6bd971088a+140/crunchrunner'
+        runner.api.jobs().create.assert_called_with(
+            body={
+                'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
+                'runtime_constraints': {},
+                'script_parameters': {
+                    'tasks': [{
+                        'task.env': {'TMPDIR': '$(task.tmpdir)'},
+                        'command': ['ls']
+                    }]
             },
             'script_version': 'master',
-            'minimum_script_version': '9e5b98e8f5f4727856b53447191f9c06e3da2ba6',
-            'repository': 'arvados',
-            'script': 'crunchrunner',
-            'runtime_constraints': {
-                'min_cores_per_node': 3,
-                'min_ram_mb_per_node': 3000,
-                'min_scratch_mb_per_node': 5024 # tmpdirSize + outdirSize
-            }
-        }, find_or_create=True)
+                'minimum_script_version': '9e5b98e8f5f4727856b53447191f9c06e3da2ba6',
+                'repository': 'arvados',
+                'script': 'crunchrunner',
+                'runtime_constraints': {
+                    'docker_image': 'arvados/jobs',
+                    'min_cores_per_node': 3,
+                    'min_ram_mb_per_node': 3000,
+                    'min_scratch_mb_per_node': 5024 # tmpdirSize + outdirSize
+                }
+            },
+            find_or_create=True,
+            filters=[['repository', '=', 'arvados'],
+                     ['script', '=', 'crunchrunner'],
+                     ['script_version', 'in git', '9e5b98e8f5f4727856b53447191f9c06e3da2ba6'],
+                     ['docker_image_locator', 'in docker', 'arvados/jobs']])
+
+    @mock.patch("arvados.collection.Collection")
+    def test_done(self, col):
+        api = mock.MagicMock()
+
+        runner = mock.MagicMock()
+        runner.api = api
+        runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+        runner.num_retries = 0
+        runner.ignore_docker_for_reuse = False
+
+        col().open.return_value = []
+        api.collections().list().execute.side_effect = ({"items": []},
+                                                        {"items": [{"manifest_text": "XYZ"}]})
+
+        arvjob = arvados_cwl.ArvadosJob(runner)
+        arvjob.name = "testjob"
+        arvjob.builder = mock.MagicMock()
+        arvjob.output_callback = mock.MagicMock()
+        arvjob.collect_outputs = mock.MagicMock()
+
+        arvjob.done({
+            "state": "Complete",
+            "output": "99999999999999999999999999999993+99",
+            "log": "99999999999999999999999999999994+99",
+            "uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+        })
+
+        api.collections().list.assert_has_calls([
+            mock.call(),
+            mock.call(filters=[['owner_uuid', '=', 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'],
+                          ['portable_data_hash', '=', '99999999999999999999999999999993+99'],
+                          ['name', '=', 'Output 9999999 of testjob']]),
+            mock.call().execute(num_retries=0),
+            mock.call(limit=1, filters=[['portable_data_hash', '=', '99999999999999999999999999999993+99']],
+                 select=['manifest_text']),
+            mock.call().execute(num_retries=0)])
+
+        api.collections().create.assert_called_with(
+            ensure_unique_name=True,
+            body={'portable_data_hash': '99999999999999999999999999999993+99',
+                  'manifest_text': 'XYZ',
+                  'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
+                  'name': 'Output 9999999 of testjob'})
+
+    @mock.patch("arvados.collection.Collection")
+    def test_done_use_existing_collection(self, col):
+        api = mock.MagicMock()
+
+        runner = mock.MagicMock()
+        runner.api = api
+        runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+        runner.num_retries = 0
+
+        col().open.return_value = []
+        api.collections().list().execute.side_effect = ({"items": [{"uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2"}]},)
+
+        arvjob = arvados_cwl.ArvadosJob(runner)
+        arvjob.name = "testjob"
+        arvjob.builder = mock.MagicMock()
+        arvjob.output_callback = mock.MagicMock()
+        arvjob.collect_outputs = mock.MagicMock()
+
+        arvjob.done({
+            "state": "Complete",
+            "output": "99999999999999999999999999999993+99",
+            "log": "99999999999999999999999999999994+99",
+            "uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+        })
+
+        api.collections().list.assert_has_calls([
+            mock.call(),
+            mock.call(filters=[['owner_uuid', '=', 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'],
+                               ['portable_data_hash', '=', '99999999999999999999999999999993+99'],
+                               ['name', '=', 'Output 9999999 of testjob']]),
+            mock.call().execute(num_retries=0)])
+
+        self.assertFalse(api.collections().create.called)
diff --git a/sdk/cwl/tests/test_submit.py b/sdk/cwl/tests/test_submit.py
new file mode 100644 (file)
index 0000000..38741eb
--- /dev/null
@@ -0,0 +1,250 @@
+import arvados
+import arvados.keep
+import arvados.collection
+import arvados_cwl
+import copy
+import cStringIO
+import functools
+import hashlib
+import mock
+import sys
+import unittest
+
+from .matcher import JsonDiffMatcher
+
+
+def stubs(func):
+    @functools.wraps(func)
+    @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
+    @mock.patch("arvados.collection.KeepClient")
+    @mock.patch("arvados.events.subscribe")
+    def wrapped(self, events, KeepClient, keepdocker, *args, **kwargs):
+        class Stubs:
+            pass
+        stubs = Stubs()
+        stubs.events = events
+        stubs.KeepClient = KeepClient
+        stubs.keepdocker = keepdocker
+
+        def putstub(p, **kwargs):
+            return "%s+%i" % (hashlib.md5(p).hexdigest(), len(p))
+        stubs.KeepClient().put.side_effect = putstub
+
+        stubs.keepdocker.return_value = True
+        stubs.fake_user_uuid = "zzzzz-tpzed-zzzzzzzzzzzzzzz"
+
+        stubs.api = mock.MagicMock()
+        stubs.api.users().current().execute.return_value = {
+            "uuid": stubs.fake_user_uuid,
+        }
+        stubs.api.collections().list().execute.return_value = {"items": []}
+        stubs.api.collections().create().execute.side_effect = ({
+            "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz1",
+            "portable_data_hash": "99999999999999999999999999999991+99",
+        }, {
+            "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2",
+            "portable_data_hash": "99999999999999999999999999999992+99",
+        })
+        stubs.expect_job_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+        stubs.api.jobs().create().execute.return_value = {
+            "uuid": stubs.expect_job_uuid,
+            "state": "Queued",
+        }
+        stubs.expect_pipeline_template_uuid = "zzzzz-d1hrv-zzzzzzzzzzzzzzz"
+        stubs.api.pipeline_templates().create().execute.return_value = {
+            "uuid": stubs.expect_pipeline_template_uuid,
+        }
+        stubs.expect_job_spec = {
+            'runtime_constraints': {
+                'docker_image': 'arvados/jobs'
+            },
+            'script_parameters': {
+                'x': {
+                    'path': '99999999999999999999999999999992+99/blorp.txt',
+                    'class': 'File'
+                },
+                'cwl:tool':
+                '99999999999999999999999999999991+99/wf/submit_wf.cwl'
+            },
+            'repository': 'arvados',
+            'script_version': 'master',
+            'script': 'cwl-runner'
+        }
+        return func(self, stubs, *args, **kwargs)
+    return wrapped
+
+
+class TestSubmit(unittest.TestCase):
+    @stubs
+    def test_submit(self, stubs):
+        capture_stdout = cStringIO.StringIO()
+        exited = arvados_cwl.main(
+            ["--submit", "--no-wait",
+             "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
+            capture_stdout, sys.stderr, api_client=stubs.api)
+        self.assertEqual(exited, 0)
+
+        stubs.api.collections().create.assert_has_calls([
+            mock.call(),
+            mock.call(body={
+                'manifest_text':
+                './tool a3954c369b8924d40547ec8cf5f6a7f4+449 '
+                '0:16:blub.txt 16:433:submit_tool.cwl\n./wf '
+                'e046cace0b1a0a6ee645f6ea8688f7e2+364 0:364:submit_wf.cwl\n',
+                'owner_uuid': 'zzzzz-tpzed-zzzzzzzzzzzzzzz',
+                'name': 'submit_wf.cwl',
+            }, ensure_unique_name=True),
+            mock.call().execute(),
+            mock.call(body={
+                'manifest_text':
+                '. 979af1245a12a1fed634d4222473bfdc+16 0:16:blorp.txt\n',
+                'owner_uuid': 'zzzzz-tpzed-zzzzzzzzzzzzzzz',
+                'name': '#',
+            }, ensure_unique_name=True),
+            mock.call().execute()])
+
+        expect_job = copy.deepcopy(stubs.expect_job_spec)
+        expect_job["owner_uuid"] = stubs.fake_user_uuid
+        stubs.api.jobs().create.assert_called_with(
+            body=expect_job,
+            find_or_create=True)
+        self.assertEqual(capture_stdout.getvalue(),
+                         stubs.expect_job_uuid + '\n')
+
+    @stubs
+    def test_submit_with_project_uuid(self, stubs):
+        project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
+
+        exited = arvados_cwl.main(
+            ["--submit", "--no-wait",
+             "--project-uuid", project_uuid,
+             "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
+            sys.stdout, sys.stderr, api_client=stubs.api)
+        self.assertEqual(exited, 0)
+
+        expect_body = copy.deepcopy(stubs.expect_job_spec)
+        expect_body["owner_uuid"] = project_uuid
+        stubs.api.jobs().create.assert_called_with(
+            body=expect_body,
+            find_or_create=True)
+
+
+class TestCreateTemplate(unittest.TestCase):
+    @stubs
+    def test_create(self, stubs):
+        project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
+
+        capture_stdout = cStringIO.StringIO()
+
+        exited = arvados_cwl.main(
+            ["--create-template", "--no-wait",
+             "--project-uuid", project_uuid,
+             "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
+            capture_stdout, sys.stderr, api_client=stubs.api)
+        self.assertEqual(exited, 0)
+
+        self.assertFalse(stubs.api.pipeline_instances().create.called)
+        self.assertFalse(stubs.api.jobs().create.called)
+
+        expect_component = copy.deepcopy(stubs.expect_job_spec)
+        expect_component['script_parameters']['x'] = {
+            'dataclass': 'File',
+            'required': True,
+            'type': 'File',
+            'value': '99999999999999999999999999999992+99/blorp.txt',
+        }
+        expect_template = {
+            "components": {
+                "submit_wf.cwl": expect_component,
+            },
+            "name": "submit_wf.cwl",
+            "owner_uuid": project_uuid,
+        }
+        stubs.api.pipeline_templates().create.assert_called_with(
+            body=JsonDiffMatcher(expect_template), ensure_unique_name=True)
+
+        self.assertEqual(capture_stdout.getvalue(),
+                         stubs.expect_pipeline_template_uuid + '\n')
+
+
+class TestTemplateInputs(unittest.TestCase):
+    expect_template = {
+        "components": {
+            "inputs_test.cwl": {
+                'runtime_constraints': {
+                    'docker_image': 'arvados/jobs',
+                },
+                'script_parameters': {
+                    'cwl:tool':
+                    '99999999999999999999999999999991+99/'
+                    'wf/inputs_test.cwl',
+                    'fileInput': {
+                        'type': 'File',
+                        'dataclass': 'File',
+                        'required': True,
+                        'title': "It's a file; we expect to find some characters in it.",
+                        'description': 'If there were anything further to say, it would be said here,\nor here.'
+                    },
+                    'floatInput': {
+                        'type': 'float',
+                        'dataclass': 'number',
+                        'required': True,
+                        'title': 'Floats like a duck',
+                        'default': 0.1,
+                        'value': 0.1,
+                    },
+                    'optionalFloatInput': {
+                        'type': ['null', 'float'],
+                        'dataclass': 'number',
+                        'required': False,
+                    },
+                    'boolInput': {
+                        'type': 'boolean',
+                        'dataclass': 'boolean',
+                        'required': True,
+                        'title': 'True or false?',
+                    },
+                },
+                'repository': 'arvados',
+                'script_version': 'master',
+                'script': 'cwl-runner',
+            },
+        },
+        "name": "inputs_test.cwl",
+    }
+
+    @stubs
+    def test_inputs_empty(self, stubs):
+        exited = arvados_cwl.main(
+            ["--create-template", "--no-wait",
+             "tests/wf/inputs_test.cwl", "tests/order/empty_order.json"],
+            cStringIO.StringIO(), sys.stderr, api_client=stubs.api)
+        self.assertEqual(exited, 0)
+
+        expect_template = copy.deepcopy(self.expect_template)
+        expect_template["owner_uuid"] = stubs.fake_user_uuid
+
+        stubs.api.pipeline_templates().create.assert_called_with(
+            body=JsonDiffMatcher(expect_template), ensure_unique_name=True)
+
+    @stubs
+    def test_inputs(self, stubs):
+        exited = arvados_cwl.main(
+            ["--create-template", "--no-wait",
+             "tests/wf/inputs_test.cwl", "tests/order/inputs_test_order.json"],
+            cStringIO.StringIO(), sys.stderr, api_client=stubs.api)
+        self.assertEqual(exited, 0)
+
+        self.expect_template["owner_uuid"] = stubs.fake_user_uuid
+
+        expect_template = copy.deepcopy(self.expect_template)
+        expect_template["owner_uuid"] = stubs.fake_user_uuid
+        params = expect_template[
+            "components"]["inputs_test.cwl"]["script_parameters"]
+        params["fileInput"]["value"] = '99999999999999999999999999999992+99/blorp.txt'
+        params["floatInput"]["value"] = 1.234
+        params["boolInput"]["value"] = True
+
+        stubs.api.pipeline_templates().create.assert_called_with(
+            body=JsonDiffMatcher(expect_template), ensure_unique_name=True)
diff --git a/sdk/cwl/tests/tool/blub.txt b/sdk/cwl/tests/tool/blub.txt
new file mode 100644 (file)
index 0000000..f12927b
--- /dev/null
@@ -0,0 +1 @@
+blibber blubber
diff --git a/sdk/cwl/tests/tool/submit_tool.cwl b/sdk/cwl/tests/tool/submit_tool.cwl
new file mode 100644 (file)
index 0000000..e9fa423
--- /dev/null
@@ -0,0 +1,20 @@
+# Test case for arvados-cwl-runner
+#
+# Used to test whether scanning a tool file for dependencies (e.g. default
+# value blub.txt) and uploading to Keep works as intended.
+
+class: CommandLineTool
+cwlVersion: draft-3
+requirements:
+  - class: DockerRequirement
+    dockerPull: debian:8
+inputs:
+  - id: x
+    type: File
+    default:
+      class: File
+      path: blub.txt
+    inputBinding:
+      position: 1
+outputs: []
+baseCommand: cat
diff --git a/sdk/cwl/tests/wf/inputs_test.cwl b/sdk/cwl/tests/wf/inputs_test.cwl
new file mode 100644 (file)
index 0000000..ec43207
--- /dev/null
@@ -0,0 +1,28 @@
+# Test case for arvados-cwl-runner. Used to test propagation of
+# various input types as script_parameters in pipeline templates.
+
+class: Workflow
+cwlVersion: draft-3
+inputs:
+  - id: "#fileInput"
+    type: File
+    label: It's a file; we expect to find some characters in it.
+    description: |
+      If there were anything further to say, it would be said here,
+      or here.
+  - id: "#boolInput"
+    type: boolean
+    label: True or false?
+  - id: "#floatInput"
+    type: float
+    label: Floats like a duck
+    default: 0.1
+  - id: "#optionalFloatInput"
+    type: ["null", float]
+outputs: []
+steps:
+  - id: step1
+    inputs:
+      - { id: x, source: "#x" }
+    outputs: []
+    run: ../tool/submit_tool.cwl
diff --git a/sdk/cwl/tests/wf/submit_wf.cwl b/sdk/cwl/tests/wf/submit_wf.cwl
new file mode 100644 (file)
index 0000000..36db603
--- /dev/null
@@ -0,0 +1,17 @@
+# Test case for arvados-cwl-runner
+#
+# Used to test whether scanning a workflow file for dependencies
+# (e.g. submit_tool.cwl) and uploading to Keep works as intended.
+
+class: Workflow
+cwlVersion: draft-3
+inputs:
+  - id: x
+    type: File
+outputs: []
+steps:
+  - id: step1
+    inputs:
+      - { id: x, source: "#x" }
+    outputs: []
+    run: ../tool/submit_tool.cwl
diff --git a/sdk/go/arvados/client.go b/sdk/go/arvados/client.go
new file mode 100644 (file)
index 0000000..ee830c8
--- /dev/null
@@ -0,0 +1,169 @@
+package arvados
+
+import (
+       "crypto/tls"
+       "encoding/json"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "net/http"
+       "net/url"
+       "os"
+)
+
+// A Client is an HTTP client with an API endpoint and a set of
+// Arvados credentials.
+//
+// It offers methods for accessing individual Arvados APIs, and
+// methods that implement common patterns like fetching multiple pages
+// of results using List APIs.
+type Client struct {
+       // HTTP client used to make requests. If nil,
+       // http.DefaultClient or InsecureHTTPClient will be used.
+       Client *http.Client
+
+       // Hostname (or host:port) of Arvados API server.
+       APIHost string
+
+       // User authentication token.
+       AuthToken string
+
+       // Accept unverified certificates. This works only if the
+       // Client field is nil: otherwise, it has no effect.
+       Insecure bool
+}
+
+// The default http.Client used by a Client with Insecure==true and
+// Client==nil.
+var InsecureHTTPClient = &http.Client{
+       Transport: &http.Transport{
+               TLSClientConfig: &tls.Config{
+                       InsecureSkipVerify: true}}}
+
+// NewClientFromEnv creates a new Client that uses the default HTTP
+// client with the API endpoint and credentials given by the
+// ARVADOS_API_* environment variables.
+func NewClientFromEnv() *Client {
+       return &Client{
+               APIHost:   os.Getenv("ARVADOS_API_HOST"),
+               AuthToken: os.Getenv("ARVADOS_API_TOKEN"),
+               Insecure:  os.Getenv("ARVADOS_API_HOST_INSECURE") != "",
+       }
+}
+
+// Do adds authentication headers and then calls (*http.Client)Do().
+func (c *Client) Do(req *http.Request) (*http.Response, error) {
+       if c.AuthToken != "" {
+               req.Header.Add("Authorization", "OAuth2 "+c.AuthToken)
+       }
+       return c.httpClient().Do(req)
+}
+
+// DoAndDecode performs req and unmarshals the response (which must be
+// JSON) into dst. Use this instead of RequestAndDecode if you need
+// more control of the http.Request object.
+func (c *Client) DoAndDecode(dst interface{}, req *http.Request) error {
+       resp, err := c.Do(req)
+       if err != nil {
+               return err
+       }
+       defer resp.Body.Close()
+       buf, err := ioutil.ReadAll(resp.Body)
+       if err != nil {
+               return err
+       }
+       if resp.StatusCode != 200 {
+               return fmt.Errorf("request failed (%s): %s", req.URL, resp.Status)
+       }
+       if dst == nil {
+               return nil
+       }
+       return json.Unmarshal(buf, dst)
+}
+
+// RequestAndDecode performs an API request and unmarshals the
+// response (which must be JSON) into dst. Method and body arguments
+// are the same as for http.NewRequest(). The given path is added to
+// the server's scheme/host/port to form the request URL. The given
+// params are passed via POST form or query string.
+//
+// path must not contain a query string.
+func (c *Client) RequestAndDecode(dst interface{}, method, path string, body io.Reader, params interface{}) error {
+       urlString := c.apiURL(path)
+       var urlValues url.Values
+       if v, ok := params.(url.Values); ok {
+               urlValues = v
+       } else if params != nil {
+               // Convert an arbitrary struct to url.Values. For
+               // example, Foo{Bar: []int{1,2,3}, Baz: "waz"} becomes
+               // url.Values{`Bar`:`[1,2,3]`,`Baz`:`waz`}
+               //
+               // TODO: Do this more efficiently, possibly using
+               // json.Decode/Encode, so the whole thing doesn't have
+               // to get encoded, decoded, and re-encoded.
+               j, err := json.Marshal(params)
+               if err != nil {
+                       return err
+               }
+               var generic map[string]interface{}
+               err = json.Unmarshal(j, &generic)
+               if err != nil {
+                       return err
+               }
+               urlValues = url.Values{}
+               for k, v := range generic {
+                       if v, ok := v.(string); ok {
+                               urlValues.Set(k, v)
+                               continue
+                       }
+                       j, err := json.Marshal(v)
+                       if err != nil {
+                               return err
+                       }
+                       urlValues.Set(k, string(j))
+               }
+       }
+       if (method == "GET" || body != nil) && urlValues != nil {
+               // FIXME: what if params don't fit in URL
+               u, err := url.Parse(urlString)
+               if err != nil {
+                       return err
+               }
+               u.RawQuery = urlValues.Encode()
+               urlString = u.String()
+       }
+       req, err := http.NewRequest(method, urlString, body)
+       if err != nil {
+               return err
+       }
+       return c.DoAndDecode(dst, req)
+}
+
+func (c *Client) httpClient() *http.Client {
+       switch {
+       case c.Client != nil:
+               return c.Client
+       case c.Insecure:
+               return InsecureHTTPClient
+       default:
+               return http.DefaultClient
+       }
+}
+
+func (c *Client) apiURL(path string) string {
+       return "https://" + c.APIHost + "/" + path
+}
+
+// DiscoveryDocument is the Arvados server's description of itself.
+type DiscoveryDocument struct {
+       DefaultCollectionReplication int   `json:"defaultCollectionReplication"`
+       BlobSignatureTTL             int64 `json:"blobSignatureTtl"`
+}
+
+// DiscoveryDocument returns a *DiscoveryDocument. The returned object
+// should not be modified: the same object may be returned by
+// subsequent calls.
+func (c *Client) DiscoveryDocument() (*DiscoveryDocument, error) {
+       var dd DiscoveryDocument
+       return &dd, c.RequestAndDecode(&dd, "GET", "discovery/v1/apis/arvados/v1/rest", nil, nil)
+}
diff --git a/sdk/go/arvados/client_test.go b/sdk/go/arvados/client_test.go
new file mode 100644 (file)
index 0000000..2db50bf
--- /dev/null
@@ -0,0 +1,83 @@
+package arvados
+
+import (
+       "bytes"
+       "fmt"
+       "io/ioutil"
+       "net/http"
+       "sync"
+       "testing"
+)
+
+type stubTransport struct {
+       Responses map[string]string
+       Requests  []http.Request
+       sync.Mutex
+}
+
+func (stub *stubTransport) RoundTrip(req *http.Request) (*http.Response, error) {
+       stub.Lock()
+       stub.Requests = append(stub.Requests, *req)
+       stub.Unlock()
+
+       resp := &http.Response{
+               Status:     "200 OK",
+               StatusCode: 200,
+               Proto:      "HTTP/1.1",
+               ProtoMajor: 1,
+               ProtoMinor: 1,
+               Request:    req,
+       }
+       str := stub.Responses[req.URL.Path]
+       if str == "" {
+               resp.Status = "404 Not Found"
+               resp.StatusCode = 404
+               str = "{}"
+       }
+       buf := bytes.NewBufferString(str)
+       resp.Body = ioutil.NopCloser(buf)
+       resp.ContentLength = int64(buf.Len())
+       return resp, nil
+}
+
+type errorTransport struct{}
+
+func (stub *errorTransport) RoundTrip(req *http.Request) (*http.Response, error) {
+       return nil, fmt.Errorf("something awful happened")
+}
+
+func TestCurrentUser(t *testing.T) {
+       t.Parallel()
+       stub := &stubTransport{
+               Responses: map[string]string{
+                       "/arvados/v1/users/current": `{"uuid":"zzzzz-abcde-012340123401234"}`,
+               },
+       }
+       c := &Client{
+               Client: &http.Client{
+                       Transport: stub,
+               },
+               APIHost:   "zzzzz.arvadosapi.com",
+               AuthToken: "xyzzy",
+       }
+       u, err := c.CurrentUser()
+       if err != nil {
+               t.Fatal(err)
+       }
+       if x := "zzzzz-abcde-012340123401234"; u.UUID != x {
+               t.Errorf("got uuid %q, expected %q", u.UUID, x)
+       }
+       if len(stub.Requests) < 1 {
+               t.Fatal("empty stub.Requests")
+       }
+       hdr := stub.Requests[len(stub.Requests)-1].Header
+       if hdr.Get("Authorization") != "OAuth2 xyzzy" {
+               t.Errorf("got headers %+q, expected Authorization header", hdr)
+       }
+
+       c.Client.Transport = &errorTransport{}
+       u, err = c.CurrentUser()
+       if err == nil {
+               t.Errorf("got nil error, expected something awful")
+       }
+}
diff --git a/sdk/go/arvados/collection.go b/sdk/go/arvados/collection.go
new file mode 100644 (file)
index 0000000..71f5247
--- /dev/null
@@ -0,0 +1,62 @@
+package arvados
+
+import (
+       "bufio"
+       "fmt"
+       "strings"
+       "time"
+
+       "git.curoverse.com/arvados.git/sdk/go/manifest"
+)
+
+// Collection is an arvados#collection resource.
+type Collection struct {
+       UUID                   string     `json:"uuid,omitempty"`
+       ExpiresAt              *time.Time `json:"expires_at,omitempty"`
+       ManifestText           string     `json:"manifest_text,omitempty"`
+       CreatedAt              *time.Time `json:"created_at,omitempty"`
+       ModifiedAt             *time.Time `json:"modified_at,omitempty"`
+       PortableDataHash       string     `json:"portable_data_hash,omitempty"`
+       ReplicationConfirmed   *int       `json:"replication_confirmed,omitempty"`
+       ReplicationConfirmedAt *time.Time `json:"replication_confirmed_at,omitempty"`
+       ReplicationDesired     *int       `json:"replication_desired,omitempty"`
+}
+
+// SizedDigests returns the hash+size part of each data block
+// referenced by the collection.
+func (c *Collection) SizedDigests() ([]SizedDigest, error) {
+       if c.ManifestText == "" && c.PortableDataHash != "d41d8cd98f00b204e9800998ecf8427e+0" {
+               // TODO: Check more subtle forms of corruption, too
+               return nil, fmt.Errorf("manifest is missing")
+       }
+       var sds []SizedDigest
+       scanner := bufio.NewScanner(strings.NewReader(c.ManifestText))
+       scanner.Buffer(make([]byte, 1048576), len(c.ManifestText))
+       for scanner.Scan() {
+               line := scanner.Text()
+               tokens := strings.Split(line, " ")
+               if len(tokens) < 3 {
+                       return nil, fmt.Errorf("Invalid stream (<3 tokens): %q", line)
+               }
+               for _, token := range tokens[1:] {
+                       if !manifest.LocatorPattern.MatchString(token) {
+                               // FIXME: ensure it's a file token
+                               break
+                       }
+                       // FIXME: shouldn't assume 32 char hash
+                       if i := strings.IndexRune(token[33:], '+'); i >= 0 {
+                               token = token[:33+i]
+                       }
+                       sds = append(sds, SizedDigest(token))
+               }
+       }
+       return sds, scanner.Err()
+}
+
+// CollectionList is an arvados#collectionList resource.
+type CollectionList struct {
+       Items          []Collection `json:"items"`
+       ItemsAvailable int          `json:"items_available"`
+       Offset         int          `json:"offset"`
+       Limit          int          `json:"limit"`
+}
diff --git a/sdk/go/arvados/doc.go b/sdk/go/arvados/doc.go
new file mode 100644 (file)
index 0000000..1e8141e
--- /dev/null
@@ -0,0 +1,12 @@
+// Package arvados is a client library for Arvados.
+//
+// The API is not stable: it should be considered experimental
+// pre-release.
+//
+// The intent is to offer model types and API call functions that can
+// be generated automatically (or at least mostly automatically) from
+// a discovery document. For the time being, there is a manually
+// generated subset of those types and API calls with (approximately)
+// the right signatures, plus client/authentication support and some
+// convenience functions.
+package arvados
diff --git a/sdk/go/arvados/duration.go b/sdk/go/arvados/duration.go
new file mode 100644 (file)
index 0000000..1639c58
--- /dev/null
@@ -0,0 +1,31 @@
+package arvados
+
+import (
+       "encoding/json"
+       "fmt"
+       "time"
+)
+
+// Duration is time.Duration but looks like "12s" in JSON, rather than
+// a number of nanoseconds.
+type Duration time.Duration
+
+// UnmarshalJSON implements json.Unmarshaler
+func (d *Duration) UnmarshalJSON(data []byte) error {
+       if data[0] == '"' {
+               dur, err := time.ParseDuration(string(data[1 : len(data)-1]))
+               *d = Duration(dur)
+               return err
+       }
+       return fmt.Errorf("duration must be given as a string like \"600s\" or \"1h30m\"")
+}
+
+// MarshalJSON implements json.Marshaler
+func (d *Duration) MarshalJSON() ([]byte, error) {
+       return json.Marshal(d.String())
+}
+
+// String implements fmt.Stringer
+func (d Duration) String() string {
+       return time.Duration(d).String()
+}
diff --git a/sdk/go/arvados/keep_block.go b/sdk/go/arvados/keep_block.go
new file mode 100644 (file)
index 0000000..c9a7712
--- /dev/null
@@ -0,0 +1,15 @@
+package arvados
+
+import (
+       "strconv"
+       "strings"
+)
+
+// SizedDigest is a minimal Keep block locator: hash+size
+type SizedDigest string
+
+// Size returns the size of the data block, in bytes.
+func (sd SizedDigest) Size() int64 {
+       n, _ := strconv.ParseInt(strings.Split(string(sd), "+")[1], 10, 64)
+       return n
+}
diff --git a/sdk/go/arvados/keep_service.go b/sdk/go/arvados/keep_service.go
new file mode 100644 (file)
index 0000000..4af1b79
--- /dev/null
@@ -0,0 +1,123 @@
+package arvados
+
+import (
+       "bufio"
+       "fmt"
+       "net/http"
+       "strconv"
+       "strings"
+)
+
+// KeepService is an arvados#keepService record
+type KeepService struct {
+       UUID           string `json:"uuid"`
+       ServiceHost    string `json:"service_host"`
+       ServicePort    int    `json:"service_port"`
+       ServiceSSLFlag bool   `json:"service_ssl_flag"`
+       ServiceType    string `json:"service_type"`
+       ReadOnly       bool   `json:"read_only"`
+}
+
+// KeepServiceList is an arvados#keepServiceList record
+type KeepServiceList struct {
+       Items          []KeepService `json:"items"`
+       ItemsAvailable int           `json:"items_available"`
+       Offset         int           `json:"offset"`
+       Limit          int           `json:"limit"`
+}
+
+// KeepServiceIndexEntry is what a keep service's index response tells
+// us about a stored block.
+type KeepServiceIndexEntry struct {
+       SizedDigest
+       Mtime int64
+}
+
+// EachKeepService calls f once for every readable
+// KeepService. EachKeepService stops if it encounters an
+// error, such as f returning a non-nil error.
+func (c *Client) EachKeepService(f func(KeepService) error) error {
+       params := ResourceListParams{}
+       for {
+               var page KeepServiceList
+               err := c.RequestAndDecode(&page, "GET", "arvados/v1/keep_services", nil, params)
+               if err != nil {
+                       return err
+               }
+               for _, item := range page.Items {
+                       err = f(item)
+                       if err != nil {
+                               return err
+                       }
+               }
+               params.Offset = params.Offset + len(page.Items)
+               if params.Offset >= page.ItemsAvailable {
+                       return nil
+               }
+       }
+}
+
+func (s *KeepService) url(path string) string {
+       var f string
+       if s.ServiceSSLFlag {
+               f = "https://%s:%d/%s"
+       } else {
+               f = "http://%s:%d/%s"
+       }
+       return fmt.Sprintf(f, s.ServiceHost, s.ServicePort, path)
+}
+
+// String implements fmt.Stringer
+func (s *KeepService) String() string {
+       return s.UUID
+}
+
+// Index returns an unsorted list of blocks that can be retrieved from
+// this server.
+func (s *KeepService) Index(c *Client, prefix string) ([]KeepServiceIndexEntry, error) {
+       url := s.url("index/" + prefix)
+       req, err := http.NewRequest("GET", url, nil)
+       if err != nil {
+               return nil, fmt.Errorf("NewRequest(%v): %v", url, err)
+       }
+       resp, err := c.Do(req)
+       if err != nil {
+               return nil, fmt.Errorf("Do(%v): %v", url, err)
+       } else if resp.StatusCode != 200 {
+               return nil, fmt.Errorf("%v: %v", url, resp.Status)
+       }
+       defer resp.Body.Close()
+
+       var entries []KeepServiceIndexEntry
+       scanner := bufio.NewScanner(resp.Body)
+       sawEOF := false
+       for scanner.Scan() {
+               if sawEOF {
+                       return nil, fmt.Errorf("Index response contained non-terminal blank line")
+               }
+               line := scanner.Text()
+               if line == "" {
+                       sawEOF = true
+                       continue
+               }
+               fields := strings.Split(line, " ")
+               if len(fields) != 2 {
+                       return nil, fmt.Errorf("Malformed index line %q: %d fields", line, len(fields))
+               }
+               mtime, err := strconv.ParseInt(fields[1], 10, 64)
+               if err != nil {
+                       return nil, fmt.Errorf("Malformed index line %q: mtime: %v", line, err)
+               }
+               entries = append(entries, KeepServiceIndexEntry{
+                       SizedDigest: SizedDigest(fields[0]),
+                       Mtime:       mtime,
+               })
+       }
+       if err := scanner.Err(); err != nil {
+               return nil, fmt.Errorf("Error scanning index response: %v", err)
+       }
+       if !sawEOF {
+               return nil, fmt.Errorf("Index response had no EOF marker")
+       }
+       return entries, nil
+}
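
A sketch of how EachKeepService and Index are meant to compose: list the services with the paging loop above, then fetch each service's block index. The client constructor is outside this change, so the example takes an already-configured *Client; the import path is assumed to follow the sdk/go layout used elsewhere in this branch.

package example

import (
	"fmt"

	"git.curoverse.com/arvados.git/sdk/go/arvados"
)

// reportBlocks prints every block reported by every keep service.
func reportBlocks(client *arvados.Client) error {
	return client.EachKeepService(func(svc arvados.KeepService) error {
		entries, err := svc.Index(client, "") // empty prefix = all blocks
		if err != nil {
			return fmt.Errorf("%s: %v", svc.UUID, err)
		}
		for _, e := range entries {
			fmt.Printf("%s\tsize=%d\tmtime=%d\n", e.SizedDigest, e.Size(), e.Mtime)
		}
		return nil
	})
}
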
diff --git a/sdk/go/arvados/resource_list.go b/sdk/go/arvados/resource_list.go
new file mode 100644 (file)
index 0000000..e9ea268
--- /dev/null
@@ -0,0 +1,25 @@
+package arvados
+
+import "encoding/json"
+
+// ResourceListParams expresses which results are requested in a
+// list/index API.
+type ResourceListParams struct {
+       Select  []string `json:"select,omitempty"`
+       Filters []Filter `json:"filters,omitempty"`
+       Limit   *int     `json:"limit,omitempty"`
+       Offset  int      `json:"offset,omitempty"`
+       Order   string   `json:"order,omitempty"`
+}
+
+// A Filter restricts the set of records returned by a list/index API.
+type Filter struct {
+       Attr     string
+       Operator string
+       Operand  interface{}
+}
+
+// MarshalJSON encodes a Filter in the form expected by the API.
+func (f *Filter) MarshalJSON() ([]byte, error) {
+       return json.Marshal([]interface{}{f.Attr, f.Operator, f.Operand})
+}
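
As the test below also demonstrates, Filter.MarshalJSON flattens each filter into the three-element array form the API expects, and Limit is a pointer so an unset limit can be distinguished from an explicit zero. A minimal encoding sketch, assuming the sdk/go/arvados import path:

package main

import (
	"encoding/json"
	"fmt"

	"git.curoverse.com/arvados.git/sdk/go/arvados"
)

func main() {
	limit := 50
	params := arvados.ResourceListParams{
		Filters: []arvados.Filter{{Attr: "service_type", Operator: "=", Operand: "disk"}},
		Limit:   &limit,
		Order:   "uuid asc",
	}
	buf, err := json.Marshal(params)
	if err != nil {
		panic(err)
	}
	// {"filters":[["service_type","=","disk"]],"limit":50,"order":"uuid asc"}
	fmt.Println(string(buf))
}
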
diff --git a/sdk/go/arvados/resource_list_test.go b/sdk/go/arvados/resource_list_test.go
new file mode 100644 (file)
index 0000000..b5e6e7d
--- /dev/null
@@ -0,0 +1,21 @@
+package arvados
+
+import (
+       "bytes"
+       "encoding/json"
+       "testing"
+       "time"
+)
+
+func TestMarshalFiltersWithNanoseconds(t *testing.T) {
+       t0 := time.Now()
+       t0str := t0.Format(time.RFC3339Nano)
+       buf, err := json.Marshal([]Filter{
+               {Attr: "modified_at", Operator: "=", Operand: t0}})
+       if err != nil {
+               t.Fatal(err)
+       }
+       if expect := []byte(`[["modified_at","=","` + t0str + `"]]`); 0 != bytes.Compare(buf, expect) {
+               t.Errorf("Encoded as %q, expected %q", buf, expect)
+       }
+}
diff --git a/sdk/go/arvados/user.go b/sdk/go/arvados/user.go
new file mode 100644 (file)
index 0000000..684a3af
--- /dev/null
@@ -0,0 +1,17 @@
+package arvados
+
+// User is an arvados#user record
+type User struct {
+       UUID     string `json:"uuid,omitempty"`
+       IsActive bool   `json:"is_active"`
+       IsAdmin  bool   `json:"is_admin"`
+       Username string `json:"username,omitempty"`
+}
+
+// CurrentUser calls arvados.v1.users.current, and returns the User
+// record corresponding to this client's credentials.
+func (c *Client) CurrentUser() (User, error) {
+       var u User
+       err := c.RequestAndDecode(&u, "GET", "arvados/v1/users/current", nil, nil)
+       return u, err
+}
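
A small usage sketch for CurrentUser, assuming a *Client constructed elsewhere (the client constructor is outside this change):

package example

import (
	"fmt"

	"git.curoverse.com/arvados.git/sdk/go/arvados"
)

// describeCaller reports whose credentials the client is using.
func describeCaller(client *arvados.Client) error {
	user, err := client.CurrentUser()
	if err != nil {
		return err
	}
	fmt.Printf("%s (username %q, admin=%v, active=%v)\n",
		user.UUID, user.Username, user.IsAdmin, user.IsActive)
	return nil
}
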
index b67eaa59a6749fb7e9b1a3da5ad2617344fc799d..8cdfa484bd96df3f479c7a40c8bb6692eddc1da5 100644 (file)
@@ -273,7 +273,7 @@ func newAPIServerError(ServerAddress string, resp *http.Response) APIServerError
 // Returns a non-nil error if an error occurs making the API call, the
 // API responds with a non-successful HTTP status, or an error occurs
 // parsing the response body.
-func (c ArvadosClient) Call(method string, resourceType string, uuid string, action string, parameters Dict, output interface{}) error {
+func (c ArvadosClient) Call(method, resourceType, uuid, action string, parameters Dict, output interface{}) error {
        reader, err := c.CallRaw(method, resourceType, uuid, action, parameters)
        if reader != nil {
                defer reader.Close()
index 47b75b384577e50a244355b8ef3dd35ba92c20ab..84a3bff06c0f09e3d326925d89b30ef4deaf0804 100644 (file)
@@ -13,6 +13,9 @@ const (
        FooBarDirCollection   = "zzzzz-4zz18-foonbarfilesdir"
        FooPdh                = "1f4b0bc7583c2a7f9102c395f4ffc5e3+45"
        HelloWorldPdh         = "55713e6a34081eb03609e7ad5fcad129+62"
+
+       Dispatch1Token    = "kwi8oowusvbutahacwk2geulqewy5oaqmpalczfna4b6bb0hfw"
+       Dispatch1AuthUUID = "zzzzz-gj3su-k9dvestay1plssr"
 )
 
 // A valid manifest designed to test various edge cases and parsing
@@ -33,3 +36,5 @@ var (
        }
        MD5CollisionMD5 = "cee9a457e790cf20d4bdaa6d69f01e41"
 )
+
+const BlobSigningKey = "zfhgfenhffzltr9dixws36j1yhksjoll2grmku38mi7yxd66h5j4q9w4jzanezacp8s6q0ro3hxakfye02152hncy6zml2ed0uc"
index 226cf9122be430d8c08c03c595447a3448a19a22..14c75afff282cbfd6fc389f0d81678cadb502260 100644 (file)
@@ -11,7 +11,6 @@ import (
        "os"
        "os/exec"
        "os/signal"
-       "path"
        "strings"
        "syscall"
 )
@@ -117,6 +116,8 @@ func setupCommand(cmd *exec.Cmd, taskp TaskDef, outdir string, replacements map[
                cmd.Stdout = os.Stdout
        }
 
+       cmd.Stderr = os.Stderr
+
        if taskp.Env != nil {
                // Set up subprocess environment
                cmd.Env = os.Environ()
@@ -325,14 +326,23 @@ func main() {
                log.Fatal(err)
        }
 
-       certpath := path.Join(path.Dir(os.Args[0]), "ca-certificates.crt")
-       certdata, err := ioutil.ReadFile(certpath)
-       if err == nil {
-               log.Printf("Using TLS certificates at %v", certpath)
-               certs := x509.NewCertPool()
-               certs.AppendCertsFromPEM(certdata)
-               api.Client.Transport.(*http.Transport).TLSClientConfig.RootCAs = certs
+       // Container may not have certificates installed, so need to look for
+       // /etc/arvados/ca-certificates.crt in addition to normal system certs.
+       var certFiles = []string{
+               "/etc/ssl/certs/ca-certificates.crt", // Debian
+               "/etc/pki/tls/certs/ca-bundle.crt",   // Red Hat
+               "/etc/arvados/ca-certificates.crt",
+       }
+
+       certs := x509.NewCertPool()
+       for _, file := range certFiles {
+               data, err := ioutil.ReadFile(file)
+               if err == nil {
+                       log.Printf("Using TLS certificates at %v", file)
+                       certs.AppendCertsFromPEM(data)
+               }
        }
+       api.Client.Transport.(*http.Transport).TLSClientConfig.RootCAs = certs
 
        jobUuid := os.Getenv("JOB_UUID")
        taskUuid := os.Getenv("TASK_UUID")
diff --git a/sdk/go/dispatch/dispatch.go b/sdk/go/dispatch/dispatch.go
new file mode 100644 (file)
index 0000000..fb7b5fb
--- /dev/null
@@ -0,0 +1,253 @@
+// Package dispatch provides a framework for monitoring the Arvados container
+// queue, locking container records, and running goroutine callbacks which
+// implement execution and monitoring of the containers.
+package dispatch
+
+import (
+       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "log"
+       "os"
+       "os/signal"
+       "sync"
+       "syscall"
+       "time"
+)
+
+// Constants for container states
+const (
+       Queued    = "Queued"
+       Locked    = "Locked"
+       Running   = "Running"
+       Complete  = "Complete"
+       Cancelled = "Cancelled"
+)
+
+type apiClientAuthorization struct {
+       UUID     string `json:"uuid"`
+       APIToken string `json:"api_token"`
+}
+
+type apiClientAuthorizationList struct {
+       Items []apiClientAuthorization `json:"items"`
+}
+
+// Container represents an Arvados container record
+type Container struct {
+       UUID               string           `json:"uuid"`
+       State              string           `json:"state"`
+       Priority           int              `json:"priority"`
+       RuntimeConstraints map[string]int64 `json:"runtime_constraints"`
+       LockedByUUID       string           `json:"locked_by_uuid"`
+}
+
+// ContainerList is a list of containers returned by the API server
+type ContainerList struct {
+       Items          []Container `json:"items"`
+       ItemsAvailable int         `json:"items_available"`
+}
+
+// Dispatcher holds the state of the dispatcher
+type Dispatcher struct {
+       // The Arvados client
+       Arv arvadosclient.ArvadosClient
+
+       // When a new queued container appears and is either already owned by
+       // this dispatcher or is successfully locked, the dispatcher will call
+       // go RunContainer().  The RunContainer() goroutine gets a channel over
+       // which it will receive updates to the container state.  The
+       // RunContainer() goroutine should only assume status updates come when
+       // the container record changes on the API server; if it needs to
+       // monitor the job submission to the underlying slurm/grid engine/etc
+       // queue it should spin up its own polling goroutines.  When the
+       // channel is closed, that means the container is no longer being
+       // handled by this dispatcher and the goroutine should terminate.  The
+       // goroutine is responsible for draining the 'status' channel; failure
+       // to do so may deadlock the dispatcher.
+       RunContainer func(*Dispatcher, Container, chan Container)
+
+       // Amount of time to wait between polling for updates.
+       PollInterval time.Duration
+
+       // Channel used to signal that RunDispatcher loop should exit.
+       DoneProcessing chan struct{}
+
+       mineMutex  sync.Mutex
+       mineMap    map[string]chan Container
+       Auth       apiClientAuthorization
+       containers chan Container
+}
+
+// setMine adds uuid to the set of "my" containers, i.e., ones for which this
+// process is actively starting/monitoring (goroutine-safe).  It returns the
+// channel to be used to send container status updates.
+func (dispatcher *Dispatcher) setMine(uuid string) chan Container {
+       dispatcher.mineMutex.Lock()
+       defer dispatcher.mineMutex.Unlock()
+       if ch, ok := dispatcher.mineMap[uuid]; ok {
+               return ch
+       }
+
+       ch := make(chan Container)
+       dispatcher.mineMap[uuid] = ch
+       return ch
+}
+
+// Release a container which is no longer being monitored.
+func (dispatcher *Dispatcher) notMine(uuid string) {
+       dispatcher.mineMutex.Lock()
+       defer dispatcher.mineMutex.Unlock()
+       if ch, ok := dispatcher.mineMap[uuid]; ok {
+               close(ch)
+               delete(dispatcher.mineMap, uuid)
+       }
+}
+
+// updateMine checks whether there is a channel for updates associated with
+// this container.  If so, it sends the container record on the channel and
+// returns true; otherwise it returns false.
+func (dispatcher *Dispatcher) updateMine(c Container) bool {
+       dispatcher.mineMutex.Lock()
+       defer dispatcher.mineMutex.Unlock()
+       ch, ok := dispatcher.mineMap[c.UUID]
+       if ok {
+               ch <- c
+               return true
+       }
+       return false
+}
+
+func (dispatcher *Dispatcher) getContainers(params arvadosclient.Dict, touched map[string]bool) {
+       var containers ContainerList
+       err := dispatcher.Arv.List("containers", params, &containers)
+       if err != nil {
+               log.Printf("Error getting list of containers: %q", err)
+               return
+       }
+
+       if containers.ItemsAvailable > len(containers.Items) {
+               // TODO: support paging
+               log.Printf("Warning!  %d containers are available but only %d were received; paged requests are not yet supported, so some containers may be ignored.",
+                       containers.ItemsAvailable,
+                       len(containers.Items))
+       }
+       for _, container := range containers.Items {
+               touched[container.UUID] = true
+               dispatcher.containers <- container
+       }
+}
+
+func (dispatcher *Dispatcher) pollContainers() {
+       ticker := time.NewTicker(dispatcher.PollInterval)
+
+       paramsQ := arvadosclient.Dict{
+               "filters": [][]interface{}{{"state", "=", "Queued"}, {"priority", ">", "0"}},
+               "order":   []string{"priority desc"},
+               "limit":   "1000"}
+       paramsP := arvadosclient.Dict{
+               "filters": [][]interface{}{{"locked_by_uuid", "=", dispatcher.Auth.UUID}},
+               "limit":   "1000"}
+
+       for {
+               select {
+               case <-ticker.C:
+                       touched := make(map[string]bool)
+                       dispatcher.getContainers(paramsQ, touched)
+                       dispatcher.getContainers(paramsP, touched)
+                       dispatcher.mineMutex.Lock()
+                       var monitored []string
+                       for k := range dispatcher.mineMap {
+                               if _, ok := touched[k]; !ok {
+                                       monitored = append(monitored, k)
+                               }
+                       }
+                       dispatcher.mineMutex.Unlock()
+                       if monitored != nil {
+                               dispatcher.getContainers(arvadosclient.Dict{
+                                       "filters": [][]interface{}{{"uuid", "in", monitored}}}, touched)
+                       }
+               case <-dispatcher.DoneProcessing:
+                       close(dispatcher.containers)
+                       ticker.Stop()
+                       return
+               }
+       }
+}
+
+func (dispatcher *Dispatcher) handleUpdate(container Container) {
+       if container.LockedByUUID != dispatcher.Auth.UUID && container.State != Queued {
+               // If the container is Complete, Cancelled, or Queued, LockedByUUID
+               // will be empty.  If the container was formerly Locked, moved
+               // back to Queued and then locked by another dispatcher,
+               // LockedByUUID will be different.  In either case, we want
+               // to stop monitoring it.
+               log.Printf("Container %v now in state %q with locked_by_uuid %q", container.UUID, container.State, container.LockedByUUID)
+               dispatcher.notMine(container.UUID)
+               return
+       }
+
+       if dispatcher.updateMine(container) {
+               // Already monitored, sent status update
+               return
+       }
+
+       if container.State == Queued {
+               // Try to take the lock
+               if err := dispatcher.UpdateState(container.UUID, Locked); err != nil {
+                       return
+               }
+               container.State = Locked
+       }
+
+       if container.State == Locked || container.State == Running {
+               // Not currently monitored but in Locked or Running state and
+               // owned by this dispatcher, so start monitoring.
+               go dispatcher.RunContainer(dispatcher, container, dispatcher.setMine(container.UUID))
+       }
+}
+
+// UpdateState makes an API call to change the state of a container.
+func (dispatcher *Dispatcher) UpdateState(uuid, newState string) error {
+       err := dispatcher.Arv.Update("containers", uuid,
+               arvadosclient.Dict{
+                       "container": arvadosclient.Dict{"state": newState}},
+               nil)
+       if err != nil {
+               log.Printf("Error updating container %s to state %q: %q", uuid, newState, err)
+       }
+       return err
+}
+
+// RunDispatcher runs the main loop of the dispatcher until receiving a message
+// on the dispatcher.DoneProcessing channel.  It also installs a signal handler
+// to terminate gracefully on SIGINT, SIGTERM or SIGQUIT.
+func (dispatcher *Dispatcher) RunDispatcher() (err error) {
+       err = dispatcher.Arv.Call("GET", "api_client_authorizations", "", "current", nil, &dispatcher.Auth)
+       if err != nil {
+               log.Printf("Error getting my token UUID: %v", err)
+               return
+       }
+
+       dispatcher.mineMap = make(map[string]chan Container)
+       dispatcher.containers = make(chan Container)
+
+       // Graceful shutdown on signal
+       sigChan := make(chan os.Signal)
+       signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
+
+       go func(sig <-chan os.Signal) {
+               for sig := range sig {
+                       log.Printf("Caught signal: %v", sig)
+                       dispatcher.DoneProcessing <- struct{}{}
+               }
+       }(sigChan)
+
+       defer close(sigChan)
+       defer signal.Stop(sigChan)
+
+       go dispatcher.pollContainers()
+       for container := range dispatcher.containers {
+               dispatcher.handleUpdate(container)
+       }
+
+       return nil
+}
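
A minimal sketch of a dispatcher built on this framework, following the RunContainer contract described above: the callback moves the container through its states, then keeps draining the status channel until the dispatcher closes it. It assumes an arvadosclient.ArvadosClient configured elsewhere, and the real executor logic is elided.

package example

import (
	"log"
	"time"

	"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
	"git.curoverse.com/arvados.git/sdk/go/dispatch"
)

// run is a RunContainer callback: it starts the work, then drains status
// updates until the dispatcher closes the channel.
func run(d *dispatch.Dispatcher, c dispatch.Container, status chan dispatch.Container) {
	log.Printf("starting container %s", c.UUID)
	d.UpdateState(c.UUID, dispatch.Running)
	// ... start and wait for the real work here ...
	d.UpdateState(c.UUID, dispatch.Complete)
	for latest := range status {
		// Failing to drain this channel can deadlock the dispatcher.
		log.Printf("container %s is now %s", latest.UUID, latest.State)
	}
}

func runDispatcher(arv arvadosclient.ArvadosClient) error {
	d := &dispatch.Dispatcher{
		Arv:            arv,
		RunContainer:   run,
		PollInterval:   10 * time.Second,
		DoneProcessing: make(chan struct{}),
	}
	return d.RunDispatcher()
}
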
diff --git a/sdk/go/httpserver/request_limiter.go b/sdk/go/httpserver/request_limiter.go
new file mode 100644 (file)
index 0000000..178ffb9
--- /dev/null
@@ -0,0 +1,29 @@
+package httpserver
+
+import (
+       "net/http"
+)
+
+type limiterHandler struct {
+       requests chan struct{}
+       handler  http.Handler
+}
+
+// NewRequestLimiter returns an http.Handler that passes at most maxRequests
+// concurrent requests through to handler, responding 503 to the rest.
+func NewRequestLimiter(maxRequests int, handler http.Handler) http.Handler {
+       return &limiterHandler{
+               requests: make(chan struct{}, maxRequests),
+               handler:  handler,
+       }
+}
+
+func (h *limiterHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+       select {
+       case h.requests <- struct{}{}:
+       default:
+               // reached max requests
+               resp.WriteHeader(http.StatusServiceUnavailable)
+               return
+       }
+       h.handler.ServeHTTP(resp, req)
+       <-h.requests
+}
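
Usage sketch: NewRequestLimiter caps the number of requests being handled concurrently and answers the overflow with 503, as the tests below exercise. The sdk/go/httpserver import path is assumed here.

package main

import (
	"net/http"

	"git.curoverse.com/arvados.git/sdk/go/httpserver"
)

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/status", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("ok\n"))
	})
	// Serve at most 100 concurrent requests; further requests get 503 immediately.
	http.ListenAndServe(":8080", httpserver.NewRequestLimiter(100, mux))
}
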
diff --git a/sdk/go/httpserver/request_limiter_test.go b/sdk/go/httpserver/request_limiter_test.go
new file mode 100644 (file)
index 0000000..a8cc806
--- /dev/null
@@ -0,0 +1,106 @@
+package httpserver
+
+import (
+       "net/http"
+       "net/http/httptest"
+       "sync"
+       "testing"
+       "time"
+)
+
+type testHandler struct {
+       inHandler   chan struct{}
+       okToProceed chan struct{}
+}
+
+func (h *testHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+       h.inHandler <- struct{}{}
+       <-h.okToProceed
+}
+
+func newTestHandler(maxReqs int) *testHandler {
+       return &testHandler{
+               inHandler:   make(chan struct{}),
+               okToProceed: make(chan struct{}),
+       }
+}
+
+func TestRequestLimiter1(t *testing.T) {
+       h := newTestHandler(10)
+       l := NewRequestLimiter(1, h)
+       var wg sync.WaitGroup
+       resps := make([]*httptest.ResponseRecorder, 10)
+       for i := 0; i < 10; i++ {
+               wg.Add(1)
+               resps[i] = httptest.NewRecorder()
+               go func(i int) {
+                       l.ServeHTTP(resps[i], &http.Request{})
+                       wg.Done()
+               }(i)
+       }
+       done := make(chan struct{})
+       go func() {
+               // Make sure one request has entered the handler
+               <-h.inHandler
+               // Make sure all unsuccessful requests finish (but don't wait
+               // for the one that's still waiting for okToProceed)
+               wg.Add(-1)
+               wg.Wait()
+               // Wait for the last goroutine
+               wg.Add(1)
+               h.okToProceed <- struct{}{}
+               wg.Wait()
+               done <- struct{}{}
+       }()
+       select {
+       case <-done:
+       case <-time.After(10 * time.Second):
+               t.Fatal("test timed out, probably deadlocked")
+       }
+       n200 := 0
+       n503 := 0
+       for i := 0; i < 10; i++ {
+               switch resps[i].Code {
+               case 200:
+                       n200++
+               case 503:
+                       n503++
+               default:
+                       t.Fatalf("Unexpected response code %d", resps[i].Code)
+               }
+       }
+       if n200 != 1 || n503 != 9 {
+               t.Fatalf("Got %d 200 responses, %d 503 responses (expected 1, 9)", n200, n503)
+       }
+       // Now that all 10 are finished, an 11th request should
+       // succeed.
+       go func() {
+               <-h.inHandler
+               h.okToProceed <- struct{}{}
+       }()
+       resp := httptest.NewRecorder()
+       l.ServeHTTP(resp, &http.Request{})
+       if resp.Code != 200 {
+               t.Errorf("Got status %d on 11th request, want 200", resp.Code)
+       }
+}
+
+func TestRequestLimiter10(t *testing.T) {
+       h := newTestHandler(10)
+       l := NewRequestLimiter(10, h)
+       var wg sync.WaitGroup
+       for i := 0; i < 10; i++ {
+               wg.Add(1)
+               go func() {
+                       l.ServeHTTP(httptest.NewRecorder(), &http.Request{})
+                       wg.Done()
+               }()
+               // Make sure the handler starts before we initiate the
+               // next request, but don't let it finish yet.
+               <-h.inHandler
+       }
+       for i := 0; i < 10; i++ {
+               h.okToProceed <- struct{}{}
+       }
+       wg.Wait()
+}
index 1af4dc87567b7e49060f3b0b764b4f6273154758..b9f4c23a8120f1227fc18b3865479a38aeeeac7c 100644 (file)
@@ -9,9 +9,9 @@ import (
 // error.
 type ResponseWriter struct {
        http.ResponseWriter
-       wroteStatus *int        // Last status given to WriteHeader()
-       wroteBodyBytes *int     // Bytes successfully written
-       err *error              // Last error returned from Write()
+       wroteStatus    *int   // Last status given to WriteHeader()
+       wroteBodyBytes *int   // Bytes successfully written
+       err            *error // Last error returned from Write()
 }
 
 func WrapResponseWriter(orig http.ResponseWriter) ResponseWriter {
index d2c171d96111af3e3c6922f73511427d0d41ae2a..bed60f499562a36c4585018932860fe35df34701 100644 (file)
@@ -209,6 +209,10 @@ GET:
                }
                var buf = make([]byte, fs.Offset+fs.Len)
                _, err = io.ReadFull(rdr, buf)
+               errClosing := rdr.Close()
+               if err == nil {
+                       err = errClosing
+               }
                if err != nil {
                        r.err = err
                        close(r.errNotNil)
index 58a047c55a053c14d8324266363fc7ad7fae33fa..2cc23738855dfeab3cd8ab2ef33cb27055a35fa1 100644 (file)
@@ -220,4 +220,5 @@ func (s *CollectionReaderUnit) TestCollectionReaderDataError(c *check.C) {
                c.Check(err, check.NotNil)
                c.Check(err, check.Not(check.Equals), io.EOF)
        }
+       c.Check(rdr.Close(), check.NotNil)
 }
index 12105c6cfca72f19ec424890fe6f8fc1c275822e..d650f0d7ad1bffc14b301a9bb2f4859f65133a4d 100644 (file)
@@ -29,13 +29,15 @@ var (
 
 // makePermSignature generates a SHA-1 HMAC digest for the given blob,
 // token, expiry, TTL, and site secret.
-func makePermSignature(blobHash, apiToken, expiry string, permissionSecret []byte) string {
+func makePermSignature(blobHash, apiToken, expiry, blobSignatureTTL string, permissionSecret []byte) string {
        hmac := hmac.New(sha1.New, permissionSecret)
        hmac.Write([]byte(blobHash))
        hmac.Write([]byte("@"))
        hmac.Write([]byte(apiToken))
        hmac.Write([]byte("@"))
        hmac.Write([]byte(expiry))
+       hmac.Write([]byte("@"))
+       hmac.Write([]byte(blobSignatureTTL))
        digest := hmac.Sum(nil)
        return fmt.Sprintf("%x", digest)
 }
@@ -46,15 +48,16 @@ func makePermSignature(blobHash, apiToken, expiry string, permissionSecret []byt
 //
 // This function is intended to be used by system components and admin
 // utilities: userland programs do not know the permissionSecret.
-func SignLocator(blobLocator, apiToken string, expiry time.Time, permissionSecret []byte) string {
+func SignLocator(blobLocator, apiToken string, expiry time.Time, blobSignatureTTL time.Duration, permissionSecret []byte) string {
        if len(permissionSecret) == 0 || apiToken == "" {
                return blobLocator
        }
        // Strip off all hints: only the hash is used to sign.
        blobHash := strings.Split(blobLocator, "+")[0]
        timestampHex := fmt.Sprintf("%08x", expiry.Unix())
+       blobSignatureTTLHex := strconv.FormatInt(int64(blobSignatureTTL.Seconds()), 16)
        return blobLocator +
-               "+A" + makePermSignature(blobHash, apiToken, timestampHex, permissionSecret) +
+               "+A" + makePermSignature(blobHash, apiToken, timestampHex, blobSignatureTTLHex, permissionSecret) +
                "@" + timestampHex
 }
 
@@ -70,7 +73,7 @@ var signedLocatorRe = regexp.MustCompile(`^([[:xdigit:]]{32}).*\+A([[:xdigit:]]{
 //
 // This function is intended to be used by system components and admin
 // utilities: userland programs do not know the permissionSecret.
-func VerifySignature(signedLocator, apiToken string, permissionSecret []byte) error {
+func VerifySignature(signedLocator, apiToken string, blobSignatureTTL time.Duration, permissionSecret []byte) error {
        matches := signedLocatorRe.FindStringSubmatch(signedLocator)
        if matches == nil {
                return ErrSignatureMissing
@@ -83,7 +86,8 @@ func VerifySignature(signedLocator, apiToken string, permissionSecret []byte) er
        } else if expiryTime.Before(time.Now()) {
                return ErrSignatureExpired
        }
-       if signatureHex != makePermSignature(blobHash, apiToken, expiryHex, permissionSecret) {
+       blobSignatureTTLHex := strconv.FormatInt(int64(blobSignatureTTL.Seconds()), 16)
+       if signatureHex != makePermSignature(blobHash, apiToken, expiryHex, blobSignatureTTLHex, permissionSecret) {
                return ErrSignatureInvalid
        }
        return nil
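
With this change the signer and the verifier must agree on the blob signature TTL as well as the token and secret, since the TTL is folded into the HMAC alongside the expiry. A sketch of the new call pattern, written as if inside the same package as SignLocator and VerifySignature (reusing its imports); the TTL matches the 1209600-second value used in the tests that follow.

func signAndCheck(locator, token string, secret []byte) error {
	ttl := 14 * 24 * time.Hour // 1209600s, i.e. two weeks
	signed := SignLocator(locator, token, time.Now().Add(ttl), ttl, secret)
	return VerifySignature(signed, token, ttl, secret)
}
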
index 138079528747962ba87defe5975768283dc4084c..242b15c0a6e4d33f6b1977a22fb0555737773884 100644 (file)
@@ -16,83 +16,84 @@ const (
                "gokee3eamvjy8qq1fvy238838enjmy5wzy2md7yvsitp5vztft6j4q866efym7e6" +
                "vu5wm9fpnwjyxfldw3vbo01mgjs75rgo7qioh8z8ij7jpyp8508okhgbbex3ceei" +
                "786u5rw2a9gx743dj3fgq2irk"
-       knownSignature     = "257f3f5f5f0a4e4626a18fc74bd42ec34dcb228a"
+       knownSignature     = "89118b78732c33104a4d6231e8b5a5fa1e4301e3"
        knownTimestamp     = "7fffffff"
        knownSigHint       = "+A" + knownSignature + "@" + knownTimestamp
        knownSignedLocator = knownLocator + knownSigHint
+       blobSignatureTTL   = 1209600 * time.Second
 )
 
 func TestSignLocator(t *testing.T) {
        if ts, err := parseHexTimestamp(knownTimestamp); err != nil {
                t.Errorf("bad knownTimestamp %s", knownTimestamp)
        } else {
-               if knownSignedLocator != SignLocator(knownLocator, knownToken, ts, []byte(knownKey)) {
+               if knownSignedLocator != SignLocator(knownLocator, knownToken, ts, blobSignatureTTL, []byte(knownKey)) {
                        t.Fail()
                }
        }
 }
 
 func TestVerifySignature(t *testing.T) {
-       if VerifySignature(knownSignedLocator, knownToken, []byte(knownKey)) != nil {
+       if VerifySignature(knownSignedLocator, knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
                t.Fail()
        }
 }
 
 func TestVerifySignatureExtraHints(t *testing.T) {
-       if VerifySignature(knownLocator+"+K@xyzzy"+knownSigHint, knownToken, []byte(knownKey)) != nil {
+       if VerifySignature(knownLocator+"+K@xyzzy"+knownSigHint, knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
                t.Fatal("Verify cannot handle hint before permission signature")
        }
 
-       if VerifySignature(knownLocator+knownSigHint+"+Zfoo", knownToken, []byte(knownKey)) != nil {
+       if VerifySignature(knownLocator+knownSigHint+"+Zfoo", knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
                t.Fatal("Verify cannot handle hint after permission signature")
        }
 
-       if VerifySignature(knownLocator+"+K@xyzzy"+knownSigHint+"+Zfoo", knownToken, []byte(knownKey)) != nil {
+       if VerifySignature(knownLocator+"+K@xyzzy"+knownSigHint+"+Zfoo", knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
                t.Fatal("Verify cannot handle hints around permission signature")
        }
 }
 
 // The size hint on the locator string should not affect signature validation.
 func TestVerifySignatureWrongSize(t *testing.T) {
-       if VerifySignature(knownHash+"+999999"+knownSigHint, knownToken, []byte(knownKey)) != nil {
+       if VerifySignature(knownHash+"+999999"+knownSigHint, knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
                t.Fatal("Verify cannot handle incorrect size hint")
        }
 
-       if VerifySignature(knownHash+knownSigHint, knownToken, []byte(knownKey)) != nil {
+       if VerifySignature(knownHash+knownSigHint, knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
                t.Fatal("Verify cannot handle missing size hint")
        }
 }
 
 func TestVerifySignatureBadSig(t *testing.T) {
        badLocator := knownLocator + "+Aaaaaaaaaaaaaaaa@" + knownTimestamp
-       if VerifySignature(badLocator, knownToken, []byte(knownKey)) != ErrSignatureMissing {
+       if VerifySignature(badLocator, knownToken, blobSignatureTTL, []byte(knownKey)) != ErrSignatureMissing {
                t.Fail()
        }
 }
 
 func TestVerifySignatureBadTimestamp(t *testing.T) {
        badLocator := knownLocator + "+A" + knownSignature + "@OOOOOOOl"
-       if VerifySignature(badLocator, knownToken, []byte(knownKey)) != ErrSignatureMissing {
+       if VerifySignature(badLocator, knownToken, blobSignatureTTL, []byte(knownKey)) != ErrSignatureMissing {
                t.Fail()
        }
 }
 
 func TestVerifySignatureBadSecret(t *testing.T) {
-       if VerifySignature(knownSignedLocator, knownToken, []byte("00000000000000000000")) != ErrSignatureInvalid {
+       if VerifySignature(knownSignedLocator, knownToken, blobSignatureTTL, []byte("00000000000000000000")) != ErrSignatureInvalid {
                t.Fail()
        }
 }
 
 func TestVerifySignatureBadToken(t *testing.T) {
-       if VerifySignature(knownSignedLocator, "00000000", []byte(knownKey)) != ErrSignatureInvalid {
+       if VerifySignature(knownSignedLocator, "00000000", blobSignatureTTL, []byte(knownKey)) != ErrSignatureInvalid {
                t.Fail()
        }
 }
 
 func TestVerifySignatureExpired(t *testing.T) {
        yesterday := time.Now().AddDate(0, 0, -1)
-       expiredLocator := SignLocator(knownHash, knownToken, yesterday, []byte(knownKey))
-       if VerifySignature(expiredLocator, knownToken, []byte(knownKey)) != ErrSignatureExpired {
+       expiredLocator := SignLocator(knownHash, knownToken, yesterday, blobSignatureTTL, []byte(knownKey))
+       if VerifySignature(expiredLocator, knownToken, blobSignatureTTL, []byte(knownKey)) != ErrSignatureExpired {
                t.Fail()
        }
 }
index cf0ae85c5782a8848d234de10d3fce81b6e661cd..22b1c974e634cd8229b645421ecb09480807c000 100644 (file)
@@ -265,7 +265,7 @@ func (m *Manifest) FileSegmentIterByName(filepath string) <-chan *FileSegment {
        return ch
 }
 
-// Blocks may appear mulitple times within the same manifest if they
+// Blocks may appear multiple times within the same manifest if they
 // are used by multiple files. In that case this Iterator will output
 // the same block multiple times.
 //
index 3f5f9344c521f3021e2f6fd35f025e4b9c7fb95e..499b4d966810223d7e1747caff4a1ce39113b470 100644 (file)
@@ -16,13 +16,13 @@ channel back to the transfer() function.
 Meanwhile, the transfer() function selects() on two channels, the "requests"
 channel and the "slices" channel.
 
-When a message is recieved on the "slices" channel, this means the a new
+When a message is received on the "slices" channel, this means a new
 section of the buffer has data, or an error is signaled.  Since the data has
been read directly into the source_buffer, it is able to simply increase the
 size of the body slice to encompass the newly filled in section.  Then any
 pending reads are serviced with handleReadRequest (described below).
 
-When a message is recieved on the "requests" channel, it means a StreamReader
+When a message is received on the "requests" channel, it means a StreamReader
 wants access to a slice of the buffer.  This is passed to handleReadRequest().
 
 The handleReadRequest() function takes a sliceRequest consisting of a buffer
index 71af6445a5ec508f1953777871a345c2d3e03ca1..b78c63e301b81d5ddb2644983e2e83017e98bbdf 100644 (file)
@@ -108,6 +108,7 @@ class ArvadosFileReaderBase(_FileLikeObjectBase):
         cache_pos, cache_data = self._readline_cache
         if self.tell() == cache_pos:
             data = [cache_data]
+            self._filepos += len(cache_data)
         else:
             data = ['']
         data_size = len(data[-1])
@@ -123,6 +124,7 @@ class ArvadosFileReaderBase(_FileLikeObjectBase):
         except ValueError:
             nextline_index = len(data)
         nextline_index = min(nextline_index, size)
+        self._filepos -= len(data) - nextline_index
         self._readline_cache = (self.tell(), data[nextline_index:])
         return data[:nextline_index]
 
index 2ee97b9867036ded89244d1f11c2294efccb9ef3..badbd668d951c46dd882b2468940463a17610728 100755 (executable)
@@ -17,6 +17,7 @@
 # arv-copy will issue an error.
 
 import argparse
+import contextlib
 import getpass
 import os
 import re
@@ -35,6 +36,8 @@ import arvados.commands.keepdocker
 
 from arvados.api import OrderedJsonModel
 
+COMMIT_HASH_RE = re.compile(r'^[0-9a-f]{1,40}$')
+
 logger = logging.getLogger('arvados.arv-copy')
 
 # local_repo_dir records which git repositories from the Arvados source
@@ -70,6 +73,9 @@ def main():
     copy_opts.add_argument(
         '-f', '--force', dest='force', action='store_true',
         help='Perform copy even if the object appears to exist at the remote destination.')
+    copy_opts.add_argument(
+        '--force-filters', action='store_true', default=False,
+        help="Copy pipeline template filters verbatim, even if they act differently on the destination cluster.")
     copy_opts.add_argument(
         '--src', dest='source_arvados', required=True,
         help='The name of the source Arvados instance (required) - points at an Arvados config file. May be either a pathname to a config file, or (for example) "foo" as shorthand for $HOME/.config/arvados/foo.conf.')
@@ -265,6 +271,94 @@ def copy_pipeline_instance(pi_uuid, src, dst, args):
     new_pi = dst.pipeline_instances().create(body=pi, ensure_unique_name=True).execute(num_retries=args.retries)
     return new_pi
 
+def filter_iter(arg):
+    """Iterate a filter string-or-list.
+
+    Pass in a filter field that can either be a string or list.
+    This will iterate elements as if the field had been written as a list.
+    """
+    if isinstance(arg, basestring):
+        return iter((arg,))
+    else:
+        return iter(arg)
+
+def migrate_repository_filter(repo_filter, src_repository, dst_repository):
+    """Update a single repository filter in-place for the destination.
+
+    If the filter checks that the repository is src_repository, it is
+    updated to check that the repository is dst_repository.  If it does
+    anything else, this function raises ValueError.
+    """
+    if src_repository is None:
+        raise ValueError("component does not specify a source repository")
+    elif dst_repository is None:
+        raise ValueError("no destination repository specified to update repository filter")
+    elif repo_filter[1:] == ['=', src_repository]:
+        repo_filter[2] = dst_repository
+    elif repo_filter[1:] == ['in', [src_repository]]:
+        repo_filter[2] = [dst_repository]
+    else:
+        raise ValueError("repository filter is not a simple source match")
+
+def migrate_script_version_filter(version_filter):
+    """Update a single script_version filter in-place for the destination.
+
+    Currently this function checks that all the filter operands are Git
+    commit hashes.  If they're not, it raises ValueError to indicate that
+    the filter is not portable.  It could be extended to make other
+    transformations in the future.
+    """
+    if not all(COMMIT_HASH_RE.match(v) for v in filter_iter(version_filter[2])):
+        raise ValueError("script_version filter is not limited to commit hashes")
+
+def attr_filtered(filter_, *attr_names):
+    """Return True if filter_ applies to any of attr_names, else False."""
+    return any((name == 'any') or (name in attr_names)
+               for name in filter_iter(filter_[0]))
+
+@contextlib.contextmanager
+def exception_handler(handler, *exc_types):
+    """If any exc_types are raised in the block, call handler on the exception."""
+    try:
+        yield
+    except exc_types as error:
+        handler(error)
+
+def migrate_components_filters(template_components, dst_git_repo):
+    """Update template component filters in-place for the destination.
+
+    template_components is a dictionary of components in a pipeline template.
+    This method walks over each component's filters, and updates them to have
+    identical semantics on the destination cluster.  It returns a list of
+    error strings that describe what filters could not be updated safely.
+
+    dst_git_repo is the name of the destination Git repository, which can
+    be None if that is not known.
+    """
+    errors = []
+    for cname, cspec in template_components.iteritems():
+        def add_error(errmsg):
+            errors.append("{}: {}".format(cname, errmsg))
+        if not isinstance(cspec, dict):
+            add_error("value is not a component definition")
+            continue
+        src_repository = cspec.get('repository')
+        filters = cspec.get('filters', [])
+        if not isinstance(filters, list):
+            add_error("filters are not a list")
+            continue
+        for cfilter in filters:
+            if not (isinstance(cfilter, list) and (len(cfilter) == 3)):
+                add_error("malformed filter {!r}".format(cfilter))
+                continue
+            if attr_filtered(cfilter, 'repository'):
+                with exception_handler(add_error, ValueError):
+                    migrate_repository_filter(cfilter, src_repository, dst_git_repo)
+            if attr_filtered(cfilter, 'script_version'):
+                with exception_handler(add_error, ValueError):
+                    migrate_script_version_filter(cfilter)
+    return errors
+
 # copy_pipeline_template(pt_uuid, src, dst, args)
 #
 #    Copies a pipeline template identified by pt_uuid from src to dst.
@@ -281,6 +375,12 @@ def copy_pipeline_template(pt_uuid, src, dst, args):
     # fetch the pipeline template from the source instance
     pt = src.pipeline_templates().get(uuid=pt_uuid).execute(num_retries=args.retries)
 
+    if not args.force_filters:
+        filter_errors = migrate_components_filters(pt['components'], args.dst_git_repo)
+        if filter_errors:
+            abort("Template filters cannot be copied safely. Use --force-filters to copy anyway.\n" +
+                  "\n".join(filter_errors))
+
     if args.recursive:
         check_git_availability()
 
index e48a6d15472cc2c90cd4af7e35251b653aa87cce..e8ce2ee21d6299dfb9cf4082d105b2d9a439757f 100644 (file)
@@ -283,15 +283,18 @@ def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None)
     return [(image['collection'], image) for image in images
             if image['collection'] in existing_coll_uuids]
 
-def main(arguments=None):
+def items_owned_by(owner_uuid, arv_items):
+    return (item for item in arv_items if item['owner_uuid'] == owner_uuid)
+
+def main(arguments=None, stdout=sys.stdout):
     args = arg_parser.parse_args(arguments)
     api = arvados.api('v1')
 
     if args.image is None or args.image == 'images':
-        fmt = "{:30}  {:10}  {:12}  {:29}  {:20}"
-        print fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED")
+        fmt = "{:30}  {:10}  {:12}  {:29}  {:20}\n"
+        stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
         for i, j in list_images_in_arv(api, args.retries):
-            print(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
+            stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
         sys.exit(0)
 
     # Pull the image if requested, unless the image is specified as a hash
@@ -326,10 +329,10 @@ def main(arguments=None):
                 num_retries=args.retries)['uuid']
 
         # Find image hash tags
-        existing_links = api.links().list(
+        existing_links = _get_docker_links(
+            api, args.retries,
             filters=[['link_class', '=', 'docker_image_hash'],
-                     ['name', '=', image_hash]]
-            ).execute(num_retries=args.retries)['items']
+                     ['name', '=', image_hash]])
         if existing_links:
             # get readable collections
             collections = api.collections().list(
@@ -339,21 +342,18 @@ def main(arguments=None):
 
             if collections:
                 # check for repo+tag links on these collections
-                existing_repo_tag = (api.links().list(
-                    filters=[['link_class', '=', 'docker_image_repo+tag'],
-                             ['name', '=', image_repo_tag],
-                             ['head_uuid', 'in', collections]]
-                    ).execute(num_retries=args.retries)['items']) if image_repo_tag else []
-
-                # Filter on elements owned by the parent project
-                owned_col = [c for c in collections if c['owner_uuid'] == parent_project_uuid]
-                owned_img = [c for c in existing_links if c['owner_uuid'] == parent_project_uuid]
-                owned_rep = [c for c in existing_repo_tag if c['owner_uuid'] == parent_project_uuid]
-
-                if owned_col:
-                    # already have a collection owned by this project
-                    coll_uuid = owned_col[0]['uuid']
+                if image_repo_tag:
+                    existing_repo_tag = _get_docker_links(
+                        api, args.retries,
+                        filters=[['link_class', '=', 'docker_image_repo+tag'],
+                                 ['name', '=', image_repo_tag],
+                                 ['head_uuid', 'in', collections]])
                 else:
+                    existing_repo_tag = []
+
+                try:
+                    coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
+                except StopIteration:
                     # create new collection owned by the project
                     coll_uuid = api.collections().create(
                         body={"manifest_text": collections[0]['manifest_text'],
@@ -363,19 +363,20 @@ def main(arguments=None):
                         ).execute(num_retries=args.retries)['uuid']
 
                 link_base = {'owner_uuid': parent_project_uuid,
-                             'head_uuid':  coll_uuid }
+                             'head_uuid':  coll_uuid,
+                             'properties': existing_links[0]['properties']}
 
-                if not owned_img:
+                if not any(items_owned_by(parent_project_uuid, existing_links)):
                     # create image link owned by the project
                     make_link(api, args.retries,
                               'docker_image_hash', image_hash, **link_base)
 
-                if not owned_rep and image_repo_tag:
+                if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
                     # create repo+tag link owned by the project
                     make_link(api, args.retries, 'docker_image_repo+tag',
                               image_repo_tag, **link_base)
 
-                print(coll_uuid)
+                stdout.write(coll_uuid + "\n")
 
                 sys.exit(0)
 
@@ -393,7 +394,7 @@ def main(arguments=None):
         put_args += ['--name', collection_name]
 
     coll_uuid = arv_put.main(
-        put_args + ['--filename', outfile_name, image_file.name]).strip()
+        put_args + ['--filename', outfile_name, image_file.name], stdout=stdout).strip()
 
     # Read the image metadata and make Arvados links from it.
     image_file.seek(0)
index 6fa26c672d26962a3e9cd01e140907eecec783e4..5cb699f49f7a21eb8d7789a52bd0aea7bdd056f0 100644 (file)
@@ -226,6 +226,23 @@ class ResumeCache(object):
         self.cache_file.seek(0)
         return json.load(self.cache_file)
 
+    def check_cache(self, api_client=None, num_retries=0):
+        try:
+            state = self.load()
+            locator = None
+            try:
+                if "_finished_streams" in state and len(state["_finished_streams"]) > 0:
+                    locator = state["_finished_streams"][0][1][0]
+                elif "_current_stream_locators" in state and len(state["_current_stream_locators"]) > 0:
+                    locator = state["_current_stream_locators"][0]
+                if locator is not None:
+                    kc = arvados.keep.KeepClient(api_client=api_client)
+                    kc.head(locator, num_retries=num_retries)
+            except Exception as e:
+                self.restart()
+        except (ValueError):
+            pass
+
     def save(self, data):
         try:
             new_cache_fd, new_cache_name = tempfile.mkstemp(
@@ -438,6 +455,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     if args.resume:
         try:
             resume_cache = ResumeCache(ResumeCache.make_path(args))
+            resume_cache.check_cache(api_client=api_client, num_retries=args.retries)
         except (IOError, OSError, ValueError):
             pass  # Couldn't open cache directory/file.  Continue without it.
         except ResumeCacheConflict:
@@ -490,6 +508,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     if args.progress:  # Print newline to split stderr from stdout for humans.
         print >>stderr
 
+    output = None
     if args.stream:
         output = writer.manifest_text()
         if args.normalize:
@@ -530,9 +549,12 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
             status = 1
 
     # Print the locator (uuid) of the new collection.
-    stdout.write(output)
-    if not output.endswith('\n'):
-        stdout.write('\n')
+    if output is None:
+        status = status or 1
+    else:
+        stdout.write(output)
+        if not output.endswith('\n'):
+            stdout.write('\n')
 
     for sigcode, orig_handler in orig_signal_handlers.items():
         signal.signal(sigcode, orig_handler)
index ef39be81a4650cda86e20c6d13a7d23848398ecb..5d29c45117acd71e924838bb9b758af77d8e9b91 100644 (file)
@@ -34,6 +34,12 @@ class ArvFile(object):
         self.prefix = prefix
         self.fn = fn
 
+    def __hash__(self):
+        return (self.prefix+self.fn).__hash__()
+
+    def __eq__(self, other):
+        return (self.prefix == other.prefix) and (self.fn == other.fn)
+
 class UploadFile(ArvFile):
     pass
 
@@ -101,10 +107,10 @@ def statfile(prefix, fn, fnPattern="$(file %s/%s)", dirPattern="$(dir %s/%s/)"):
 
     return prefix+fn
 
-def uploadfiles(files, api, dry_run=False, num_retries=0, project=None, fnPattern="$(file %s/%s)"):
+def uploadfiles(files, api, dry_run=False, num_retries=0, project=None, fnPattern="$(file %s/%s)", name=None):
     # Find the smallest path prefix that includes all the files that need to be uploaded.
     # This starts at the root and iteratively removes common parent directory prefixes
-    # until all file pathes no longer have a common parent.
+    # until all file paths no longer have a common parent.
     n = True
     pathprefix = "/"
     while n:
@@ -148,9 +154,21 @@ def uploadfiles(files, api, dry_run=False, num_retries=0, project=None, fnPatter
                 stream = sp[0]
                 collection.start_new_stream(stream)
             collection.write_file(f.fn, sp[1])
-        item = api.collections().create(body={"owner_uuid": project, "manifest_text": collection.manifest_text()}).execute()
+
+        exists = api.collections().list(filters=[["owner_uuid", "=", project],
+                                                 ["portable_data_hash", "=", collection.portable_data_hash()],
+                                                 ["name", "=", name]]).execute(num_retries=num_retries)
+        if exists["items"]:
+            item = exists["items"][0]
+            logger.info("Using collection %s", item["uuid"])
+        else:
+            body = {"owner_uuid": project, "manifest_text": collection.manifest_text()}
+            if name is not None:
+                body["name"] = name
+            item = api.collections().create(body=body, ensure_unique_name=True).execute()
+            logger.info("Uploaded to %s", item["uuid"])
+
         pdh = item["portable_data_hash"]
-        logger.info("Uploaded to %s", item["uuid"])
 
     for c in files:
         c.fn = fnPattern % (pdh, c.fn)
index df824a331ea41a2fd702587be9c5d2828884ffb5..81a9b36182a8545adbdcd3fd6afec7f0fba53602 100644 (file)
@@ -1,9 +1,11 @@
 import arvados
 import config
 import errors
+from retry import RetryLoop
 
 import logging
 import json
+import thread
 import threading
 import time
 import os
@@ -14,8 +16,8 @@ from ws4py.client.threadedclient import WebSocketClient
 _logger = logging.getLogger('arvados.events')
 
 
-class EventClient(WebSocketClient):
-    def __init__(self, url, filters, on_event, last_log_id):
+class _EventClient(WebSocketClient):
+    def __init__(self, url, filters, on_event, last_log_id, on_closed):
         ssl_options = {'ca_certs': arvados.util.ca_certs_path()}
         if config.flag_is_true('ARVADOS_API_HOST_INSECURE'):
             ssl_options['cert_reqs'] = ssl.CERT_NONE
@@ -26,19 +28,23 @@ class EventClient(WebSocketClient):
         # IPv4 addresses (common with "localhost"), only one of them
         # will be attempted -- and it might not be the right one. See
         # ws4py's WebSocketBaseClient.__init__.
-        super(EventClient, self).__init__(url, ssl_options=ssl_options)
+        super(_EventClient, self).__init__(url, ssl_options=ssl_options)
+
         self.filters = filters
         self.on_event = on_event
         self.last_log_id = last_log_id
         self._closing_lock = threading.RLock()
         self._closing = False
         self._closed = threading.Event()
+        self.on_closed = on_closed
 
     def opened(self):
-        self.subscribe(self.filters, self.last_log_id)
+        for f in self.filters:
+            self.subscribe(f, self.last_log_id)
 
     def closed(self, code, reason=None):
         self._closed.set()
+        self.on_closed()
 
     def received_message(self, m):
         with self._closing_lock:
@@ -51,21 +57,85 @@ class EventClient(WebSocketClient):
         :timeout: is the number of seconds to wait for ws4py to
         indicate that the connection has closed.
         """
-        super(EventClient, self).close(code, reason)
+        super(_EventClient, self).close(code, reason)
         with self._closing_lock:
             # make sure we don't process any more messages.
             self._closing = True
         # wait for ws4py to tell us the connection is closed.
         self._closed.wait(timeout=timeout)
 
-    def subscribe(self, filters, last_log_id=None):
-        m = {"method": "subscribe", "filters": filters}
+    def subscribe(self, f, last_log_id=None):
+        m = {"method": "subscribe", "filters": f}
         if last_log_id is not None:
             m["last_log_id"] = last_log_id
         self.send(json.dumps(m))
 
-    def unsubscribe(self, filters):
-        self.send(json.dumps({"method": "unsubscribe", "filters": filters}))
+    def unsubscribe(self, f):
+        self.send(json.dumps({"method": "unsubscribe", "filters": f}))
+
+
+class EventClient(object):
+    def __init__(self, url, filters, on_event_cb, last_log_id):
+        self.url = url
+        if filters:
+            self.filters = [filters]
+        else:
+            self.filters = [[]]
+        self.on_event_cb = on_event_cb
+        self.last_log_id = last_log_id
+        self.is_closed = threading.Event()
+        self._setup_event_client()
+
+    def _setup_event_client(self):
+        self.ec = _EventClient(self.url, self.filters, self.on_event,
+                               self.last_log_id, self.on_closed)
+        self.ec.daemon = True
+        try:
+            self.ec.connect()
+        except Exception:
+            self.ec.close_connection()
+            raise
+
+    def subscribe(self, f, last_log_id=None):
+        self.filters.append(f)
+        self.ec.subscribe(f, last_log_id)
+
+    def unsubscribe(self, f):
+        del self.filters[self.filters.index(f)]
+        self.ec.unsubscribe(f)
+
+    def close(self, code=1000, reason='', timeout=0):
+        self.is_closed.set()
+        self.ec.close(code, reason, timeout)
+
+    def on_event(self, m):
+        if m.get('id') is not None:
+            self.last_log_id = m.get('id')
+        try:
+            self.on_event_cb(m)
+        except Exception as e:
+            _logger.exception("Unexpected exception from event callback.")
+            thread.interrupt_main()
+
+    def on_closed(self):
+        if not self.is_closed.is_set():
+            _logger.warn("Unexpected close. Reconnecting.")
+            for tries_left in RetryLoop(num_retries=25, backoff_start=.1, max_wait=15):
+                try:
+                    self._setup_event_client()
+                    break
+                except Exception as e:
+                    _logger.warn("Error '%s' during websocket reconnect.", e)
+            if tries_left == 0:
+                _logger.exception("EventClient thread could not contact websocket server.")
+                self.is_closed.set()
+                thread.interrupt_main()
+                return
+
+    def run_forever(self):
+        # Have to poll here to let KeyboardInterrupt get raised.
+        while not self.is_closed.wait(1):
+            pass
 
 
 class PollClient(threading.Thread):
@@ -89,7 +159,21 @@ class PollClient(threading.Thread):
             self.id = self.last_log_id
         else:
             for f in self.filters:
-                items = self.api.logs().list(limit=1, order="id desc", filters=f).execute()['items']
+                for tries_left in RetryLoop(num_retries=25, backoff_start=.1, max_wait=self.poll_time):
+                    try:
+                        items = self.api.logs().list(limit=1, order="id desc", filters=f).execute()['items']
+                        break
+                    except errors.ApiError as error:
+                        pass
+                    else:
+                        tries_left = 0
+                        break
+                if tries_left == 0:
+                    _logger.exception("PollClient thread could not contact API server.")
+                    with self._closing_lock:
+                        self._closing.set()
+                    thread.interrupt_main()
+                    return
                 if items:
                     if items[0]['id'] > self.id:
                         self.id = items[0]['id']
@@ -100,14 +184,32 @@ class PollClient(threading.Thread):
             max_id = self.id
             moreitems = False
             for f in self.filters:
-                items = self.api.logs().list(order="id asc", filters=f+[["id", ">", str(self.id)]]).execute()
+                for tries_left in RetryLoop(num_retries=25, backoff_start=.1, max_wait=self.poll_time):
+                    try:
+                        items = self.api.logs().list(order="id asc", filters=f+[["id", ">", str(self.id)]]).execute()
+                        break
+                    except errors.ApiError as error:
+                        pass
+                    else:
+                        tries_left = 0
+                        break
+                if tries_left == 0:
+                    _logger.exception("PollClient thread could not contact API server.")
+                    with self._closing_lock:
+                        self._closing.set()
+                    thread.interrupt_main()
+                    return
                 for i in items["items"]:
                     if i['id'] > max_id:
                         max_id = i['id']
                     with self._closing_lock:
                         if self._closing.is_set():
                             return
-                        self.on_event(i)
+                        try:
+                            self.on_event(i)
+                        except Exception as e:
+                            _logger.exception("Unexpected exception from event callback.")
+                            thread.interrupt_main()
                 if items["items_available"] > len(items["items"]):
                     moreitems = True
             self.id = max_id
@@ -143,12 +245,12 @@ class PollClient(threading.Thread):
             # to do so raises the same exception."
             pass
 
-    def subscribe(self, filters):
+    def subscribe(self, f):
         self.on_event({'status': 200})
-        self.filters.append(filters)
+        self.filters.append(f)
 
-    def unsubscribe(self, filters):
-        del self.filters[self.filters.index(filters)]
+    def unsubscribe(self, f):
+        del self.filters[self.filters.index(f)]
 
 
 def _subscribe_websocket(api, filters, on_event, last_log_id=None):
@@ -156,20 +258,14 @@ def _subscribe_websocket(api, filters, on_event, last_log_id=None):
     if not endpoint:
         raise errors.FeatureNotEnabledError(
             "Server does not advertise a websocket endpoint")
+    uri_with_token = "{}?api_token={}".format(endpoint, api.api_token)
     try:
-        uri_with_token = "{}?api_token={}".format(endpoint, api.api_token)
         client = EventClient(uri_with_token, filters, on_event, last_log_id)
-        ok = False
-        try:
-            client.connect()
-            ok = True
-            return client
-        finally:
-            if not ok:
-                client.close_connection()
-    except:
+    except Exception:
         _logger.warn("Failed to connect to websockets on %s" % endpoint)
         raise
+    else:
+        return client
 
 
 def subscribe(api, filters, on_event, poll_fallback=15, last_log_id=None):
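
For orientation, a minimal sketch of driving this entry point, assuming an authorized API client; the filter, callback, and fallback interval below are illustrative (the tests later in this change exercise the same pattern):

    import arvados
    import arvados.events

    def on_event(ev):
        # The first message delivered is {'status': 200}; later messages are
        # log records with keys such as 'id' and 'object_uuid'.
        print(ev.get('object_uuid'))

    api = arvados.api('v1')
    filters = [['object_uuid', 'is_a', 'arvados#human']]  # illustrative filter
    ws = arvados.events.subscribe(api, filters, on_event, poll_fallback=15)
    try:
        ws.run_forever()  # blocks; wakes periodically so KeyboardInterrupt still works
    finally:
        ws.close()
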
index cd39f83703f4b341e07201a67c0afb58a11574ae..3c0ad6f7a9e57a444bdcf06416505923e497ad31 100644 (file)
@@ -212,7 +212,6 @@ class KeepBlockCache(object):
                 self._cache.insert(0, n)
                 return n, True
 
-
 class Counter(object):
     def __init__(self, v=0):
         self._lk = threading.Lock()
@@ -374,10 +373,10 @@ class KeepClient(object):
             s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 75)
             return s
 
-        def get(self, locator, timeout=None):
+        def get(self, locator, method="GET", timeout=None):
             # locator is a KeepLocator object.
             url = self.root + str(locator)
-            _logger.debug("Request: GET %s", url)
+            _logger.debug("Request: %s %s", method, url)
             curl = self._get_user_agent()
             ok = None
             try:
@@ -391,7 +390,10 @@ class KeepClient(object):
                         '{}: {}'.format(k,v) for k,v in self.get_headers.iteritems()])
                     curl.setopt(pycurl.WRITEFUNCTION, response_body.write)
                     curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
+                    if method == "HEAD":
+                        curl.setopt(pycurl.NOBODY, True)
                     self._setcurltimeouts(curl, timeout)
+
                     try:
                         curl.perform()
                     except Exception as e:
@@ -402,6 +404,7 @@ class KeepClient(object):
                         'headers': self._headers,
                         'error': False,
                     }
+
                 ok = retry.check_http_response_success(self._result['status_code'])
                 if not ok:
                     self._result['error'] = arvados.errors.HttpError(
@@ -425,11 +428,18 @@ class KeepClient(object):
                 _logger.debug("Request fail: GET %s => %s: %s",
                               url, type(self._result['error']), str(self._result['error']))
                 return None
+            if method == "HEAD":
+                _logger.info("HEAD %s: %s bytes",
+                         self._result['status_code'],
+                         self._result.get('content-length'))
+                return True
+
             _logger.info("GET %s: %s bytes in %s msec (%.3f MiB/sec)",
                          self._result['status_code'],
                          len(self._result['body']),
                          t.msecs,
                          (len(self._result['body'])/(1024.0*1024))/t.secs if t.secs > 0 else 0)
+
             if self.download_counter:
                 self.download_counter.add(len(self._result['body']))
             resp_md5 = hashlib.md5(self._result['body']).hexdigest()
@@ -870,8 +880,15 @@ class KeepClient(object):
         else:
             return None
 
+    @retry.retry_method
+    def head(self, loc_s, num_retries=None):
+        return self._get_or_head(loc_s, method="HEAD", num_retries=num_retries)
+
     @retry.retry_method
     def get(self, loc_s, num_retries=None):
+        return self._get_or_head(loc_s, method="GET", num_retries=num_retries)
+
+    def _get_or_head(self, loc_s, method="GET", num_retries=None):
         """Get data from Keep.
 
         This method fetches one or more blocks of data from Keep.  It
@@ -897,11 +914,12 @@ class KeepClient(object):
         self.get_counter.add(1)
 
         locator = KeepLocator(loc_s)
-        slot, first = self.block_cache.reserve_cache(locator.md5sum)
-        if not first:
-            self.hits_counter.add(1)
-            v = slot.get()
-            return v
+        if method == "GET":
+            slot, first = self.block_cache.reserve_cache(locator.md5sum)
+            if not first:
+                self.hits_counter.add(1)
+                v = slot.get()
+                return v
 
         self.misses_counter.add(1)
 
@@ -951,16 +969,20 @@ class KeepClient(object):
                                for root in sorted_roots
                                if roots_map[root].usable()]
             for keep_service in services_to_try:
-                blob = keep_service.get(locator, timeout=self.current_timeout(num_retries-tries_left))
+                blob = keep_service.get(locator, method=method, timeout=self.current_timeout(num_retries-tries_left))
                 if blob is not None:
                     break
             loop.save_result((blob, len(services_to_try)))
 
         # Always cache the result, then return it if we succeeded.
-        slot.set(blob)
-        self.block_cache.cap_cache()
+        if method == "GET":
+            slot.set(blob)
+            self.block_cache.cap_cache()
         if loop.success():
-            return blob
+            if method == "HEAD":
+                return True
+            else:
+                return blob
 
         # Q: Including 403 is necessary for the Keep tests to continue
         # passing, but maybe they should expect KeepReadError instead?
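
A hedged sketch of the new head() call alongside get(), assuming a configured API client as constructed in the test changes below; the payload is illustrative:

    import arvados

    api = arvados.api('v1')
    kc = arvados.KeepClient(api_client=api)

    locator = kc.put('test_head')   # store a block, get back its locator
    exists = kc.head(locator)       # HEAD: returns True without fetching or caching the body
    data = kc.get(locator)          # GET: returns the block contents and caches them
    assert exists is True and data == 'test_head'
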
index d8f5317d2c4c160c833339929302edaae679d6f5..5ba4f4ea41016a6225ebb3fca194265e56b56a0b 100644 (file)
@@ -31,7 +31,8 @@ class RetryLoop(object):
             return loop.last_result()
     """
     def __init__(self, num_retries, success_check=lambda r: True,
-                 backoff_start=0, backoff_growth=2, save_results=1):
+                 backoff_start=0, backoff_growth=2, save_results=1,
+                 max_wait=60):
         """Construct a new RetryLoop.
 
         Arguments:
@@ -50,11 +51,13 @@ class RetryLoop(object):
         * save_results: Specify a number to save the last N results
           that the loop recorded.  These records are available through
           the results attribute, oldest first.  Default 1.
+        * max_wait: Maximum number of seconds to wait between retries.
         """
         self.tries_left = num_retries + 1
         self.check_result = success_check
         self.backoff_wait = backoff_start
         self.backoff_growth = backoff_growth
+        self.max_wait = max_wait
         self.next_start_time = 0
         self.results = deque(maxlen=save_results)
         self._running = None
@@ -76,6 +79,8 @@ class RetryLoop(object):
             wait_time = max(0, self.next_start_time - time.time())
             time.sleep(wait_time)
             self.backoff_wait *= self.backoff_growth
+            if self.backoff_wait > self.max_wait:
+                self.backoff_wait = self.max_wait
         self.next_start_time = time.time() + self.backoff_wait
         self.tries_left -= 1
         return self.tries_left
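
A hedged sketch of the retry pattern the callers above use with the new max_wait cap, assuming the class is importable as arvados.retry.RetryLoop per the SDK layout; flaky_request() is a hypothetical stand-in for the real API call:

    from arvados.retry import RetryLoop

    done = False
    # Backoff starts at 0.1s and doubles each attempt, but max_wait keeps any
    # single wait at or below 15 seconds.
    for tries_left in RetryLoop(num_retries=25, backoff_start=.1, max_wait=15):
        try:
            response = flaky_request()  # hypothetical operation being retried
            done = True
            break
        except IOError:
            pass
    if not done:
        raise RuntimeError("request failed after all retries")
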
index 759e8ff67edf1ec8b99b0de86ee8a3e4602b73b7..e0aae9625eb54d82eb4ee983696487079fa0d441 100644 (file)
@@ -15,6 +15,11 @@ try:
 except ImportError:
     tagger = egg_info_cmd.egg_info
 
+short_tests_only = False
+if '--short-tests-only' in sys.argv:
+    short_tests_only = True
+    sys.argv.remove('--short-tests-only')
+
 setup(name='arvados-python-client',
       version='0.1',
       description='Arvados client library',
@@ -41,6 +46,7 @@ setup(name='arvados-python-client',
       install_requires=[
           'google-api-python-client==1.4.2',
           'oauth2client >=1.4.6, <2',
+          'pyasn1-modules==0.0.5',
           'ciso8601',
           'httplib2',
           'pycurl >=7.19.5.1, <7.21.5',
index b2cf43652bce288858dca4a1db1a121b040af9ec..71c9b178e7525808508babf86a383a37b4ab4ba6 100644 (file)
@@ -85,7 +85,8 @@ class FakeCurl:
             self._headerfunction("HTTP/1.1 {} Status".format(self._resp_code))
             for k, v in self._resp_headers.iteritems():
                 self._headerfunction(k + ': ' + str(v))
-        self._writer(self._resp_body)
+        if type(self._resp_body) is not bool:
+            self._writer(self._resp_body)
 
     def close(self):
         pass
index f074f8d6cf67ee768a4f09e2515a850cac5e4c5c..d79788c07e3ef4d26d055e6d72a8a59f755c66ae 100644 (file)
@@ -121,6 +121,20 @@ class Handler(BaseHTTPServer.BaseHTTPRequestHandler, object):
         self.wfile_bandwidth_write(self.server.store[datahash])
         self.server._do_delay('response_close')
 
+    def do_HEAD(self):
+        self.server._do_delay('response')
+        r = re.search(r'[0-9a-f]{32}', self.path)
+        if not r:
+            return self.send_response(422)
+        datahash = r.group(0)
+        if datahash not in self.server.store:
+            return self.send_response(404)
+        self.send_response(200)
+        self.send_header('Content-type', 'application/octet-stream')
+        self.send_header('Content-length', str(len(self.server.store[datahash])))
+        self.end_headers()
+        self.server._do_delay('response_close')
+
     def do_PUT(self):
         self.server._do_delay('request_body')
         # The comments at https://bugs.python.org/issue1491 implies that Python
diff --git a/sdk/python/tests/slow_test.py b/sdk/python/tests/slow_test.py
new file mode 100644 (file)
index 0000000..643ba92
--- /dev/null
@@ -0,0 +1,7 @@
+import __main__
+import os
+import unittest
+
+slow_test = lambda _: unittest.skipIf(
+    __main__.short_tests_only,
+    "running --short tests only")
index 795a9aa22a2b0ad161a6bf1ee26228ca1ba771f5..6c379e1455f9f81c4c71d0bdafeb5ca03d02b75d 100644 (file)
@@ -37,7 +37,7 @@ class ArvadosApiTest(run_test_server.TestCaseWithServers):
 
     def test_empty_list(self):
         answer = arvados.api('v1').humans().list(
-            filters=[['uuid', 'is', None]]).execute()
+            filters=[['uuid', '=', None]]).execute()
         self.assertEqual(answer['items_available'], len(answer['items']))
 
     def test_nonempty_list(self):
index 896b880778a1b0965429420af8a6f349048ef5c9..e64d91474170ce688780c3ab94ea3ae6bb69bbfb 100644 (file)
@@ -19,7 +19,7 @@ from cStringIO import StringIO
 import arvados
 import arvados.commands.put as arv_put
 
-from arvados_testutil import ArvadosBaseTestCase
+from arvados_testutil import ArvadosBaseTestCase, fake_httplib2_response
 import run_test_server
 
 class ArvadosPutResumeCacheTest(ArvadosBaseTestCase):
@@ -127,6 +127,43 @@ class ArvadosPutResumeCacheTest(ArvadosBaseTestCase):
             else:
                 config['ARVADOS_API_HOST'] = orig_host
 
+    @mock.patch('arvados.keep.KeepClient.head')
+    def test_resume_cache_with_current_stream_locators(self, keep_client_head):
+        keep_client_head.side_effect = [True]
+        thing = {}
+        thing['_current_stream_locators'] = ['098f6bcd4621d373cade4e832627b4f6+4', '1f253c60a2306e0ee12fb6ce0c587904+6']
+        with tempfile.NamedTemporaryFile() as cachefile:
+            self.last_cache = arv_put.ResumeCache(cachefile.name)
+        self.last_cache.save(thing)
+        self.last_cache.close()
+        resume_cache = arv_put.ResumeCache(self.last_cache.filename)
+        self.assertNotEqual(None, resume_cache)
+
+    @mock.patch('arvados.keep.KeepClient.head')
+    def test_resume_cache_with_finished_streams(self, keep_client_head):
+        keep_client_head.side_effect = [True]
+        thing = {}
+        thing['_finished_streams'] = [['.', ['098f6bcd4621d373cade4e832627b4f6+4', '1f253c60a2306e0ee12fb6ce0c587904+6']]]
+        with tempfile.NamedTemporaryFile() as cachefile:
+            self.last_cache = arv_put.ResumeCache(cachefile.name)
+        self.last_cache.save(thing)
+        self.last_cache.close()
+        resume_cache = arv_put.ResumeCache(self.last_cache.filename)
+        self.assertNotEqual(None, resume_cache)
+
+    @mock.patch('arvados.keep.KeepClient.head')
+    def test_resume_cache_with_finished_streams_error_on_head(self, keep_client_head):
+        keep_client_head.side_effect = Exception('Locator not found')
+        thing = {}
+        thing['_finished_streams'] = [['.', ['098f6bcd4621d373cade4e832627b4f6+4', '1f253c60a2306e0ee12fb6ce0c587904+6']]]
+        with tempfile.NamedTemporaryFile() as cachefile:
+            self.last_cache = arv_put.ResumeCache(cachefile.name)
+        self.last_cache.save(thing)
+        self.last_cache.close()
+        resume_cache = arv_put.ResumeCache(self.last_cache.filename)
+        self.assertNotEqual(None, resume_cache)
+        self.assertRaises(None, resume_cache.check_cache())
+
     def test_basic_cache_storage(self):
         thing = ['test', 'list']
         with tempfile.NamedTemporaryFile() as cachefile:
@@ -423,6 +460,20 @@ class ArvadosPutTest(run_test_server.TestCaseWithServers, ArvadosBaseTestCase):
                           self.call_main_with_args,
                           ['--project-uuid', self.Z_UUID, '--stream'])
 
+    def test_api_error_handling(self):
+        collections_mock = mock.Mock(name='arv.collections()')
+        coll_create_mock = collections_mock().create().execute
+        coll_create_mock.side_effect = arvados.errors.ApiError(
+            fake_httplib2_response(403), '{}')
+        arv_put.api_client = arvados.api('v1')
+        arv_put.api_client.collections = collections_mock
+        with self.assertRaises(SystemExit) as exc_test:
+            self.call_main_with_args(['/dev/null'])
+        self.assertLess(0, exc_test.exception.args[0])
+        self.assertLess(0, coll_create_mock.call_count)
+        self.assertEqual("", self.main_stdout.getvalue())
+
+
 class ArvPutIntegrationTest(run_test_server.TestCaseWithServers,
                             ArvadosBaseTestCase):
     def _getKeepServerConfig():
diff --git a/sdk/python/tests/test_events.py b/sdk/python/tests/test_events.py
new file mode 100644 (file)
index 0000000..f2cdba2
--- /dev/null
@@ -0,0 +1,372 @@
+import arvados
+import io
+import logging
+import mock
+import Queue
+import run_test_server
+import threading
+import time
+import unittest
+
+import arvados_testutil
+
+class WebsocketTest(run_test_server.TestCaseWithServers):
+    MAIN_SERVER = {}
+
+    TIME_PAST = time.time()-3600
+    TIME_FUTURE = time.time()+3600
+    MOCK_WS_URL = 'wss://[{}]/'.format(arvados_testutil.TEST_HOST)
+
+    def setUp(self):
+        self.ws = None
+
+    def tearDown(self):
+        try:
+            if self.ws:
+                self.ws.close()
+        except Exception as e:
+            print("Error in teardown: ", e)
+        super(WebsocketTest, self).tearDown()
+        run_test_server.reset()
+
+    def _test_subscribe(self, poll_fallback, expect_type, start_time=None, expected=1):
+        run_test_server.authorize_with('active')
+        events = Queue.Queue(100)
+
+        # Create ancestor before subscribing.
+        # When listening with start_time in the past, this should also be retrieved.
+        # However, when start_time is omitted in subscribe, this should not be fetched.
+        ancestor = arvados.api('v1').humans().create(body={}).execute()
+
+        filters = [['object_uuid', 'is_a', 'arvados#human']]
+        if start_time:
+            filters.append(['created_at', '>=', start_time])
+
+        self.ws = arvados.events.subscribe(
+            arvados.api('v1'), filters,
+            events.put_nowait,
+            poll_fallback=poll_fallback,
+            last_log_id=(1 if start_time else None))
+        self.assertIsInstance(self.ws, expect_type)
+        self.assertEqual(200, events.get(True, 5)['status'])
+        human = arvados.api('v1').humans().create(body={}).execute()
+
+        log_object_uuids = []
+        for i in range(0, expected):
+            log_object_uuids.append(events.get(True, 5)['object_uuid'])
+
+        if expected > 0:
+            self.assertIn(human['uuid'], log_object_uuids)
+
+        if expected > 1:
+            self.assertIn(ancestor['uuid'], log_object_uuids)
+
+        with self.assertRaises(Queue.Empty):
+            # assertEqual just serves to show us what unexpected thing
+            # comes out of the queue when the assertRaises fails; when
+            # the test passes, this assertEqual doesn't get called.
+            self.assertEqual(events.get(True, 2), None)
+
+    def test_subscribe_websocket(self):
+        self._test_subscribe(
+            poll_fallback=False, expect_type=arvados.events.EventClient, expected=1)
+
+    @mock.patch('arvados.events.EventClient.__init__')
+    def test_subscribe_poll(self, event_client_constr):
+        event_client_constr.side_effect = Exception('All is well')
+        self._test_subscribe(
+            poll_fallback=0.25, expect_type=arvados.events.PollClient, expected=1)
+
+    def test_subscribe_poll_retry(self):
+        api_mock = mock.MagicMock()
+        n = []
+        def on_ev(ev):
+            n.append(ev)
+
+        error_mock = mock.MagicMock()
+        error_mock.resp.status = 0
+        error_mock._get_reason.return_value = "testing"
+        api_mock.logs().list().execute.side_effect = (arvados.errors.ApiError(error_mock, ""),
+                                                      {"items": [{"id": 1}], "items_available": 1},
+                                                      arvados.errors.ApiError(error_mock, ""),
+                                                      {"items": [{"id": 1}], "items_available": 1})
+        pc = arvados.events.PollClient(api_mock, [], on_ev, 15, None)
+        pc.start()
+        while len(n) < 2:
+            time.sleep(.1)
+        pc.close()
+
+    def test_subscribe_websocket_with_start_time_past(self):
+        self._test_subscribe(
+            poll_fallback=False, expect_type=arvados.events.EventClient,
+            start_time=self.localiso(self.TIME_PAST),
+            expected=2)
+
+    @mock.patch('arvados.events.EventClient.__init__')
+    def test_subscribe_poll_with_start_time_past(self, event_client_constr):
+        event_client_constr.side_effect = Exception('All is well')
+        self._test_subscribe(
+            poll_fallback=0.25, expect_type=arvados.events.PollClient,
+            start_time=self.localiso(self.TIME_PAST),
+            expected=2)
+
+    def test_subscribe_websocket_with_start_time_future(self):
+        self._test_subscribe(
+            poll_fallback=False, expect_type=arvados.events.EventClient,
+            start_time=self.localiso(self.TIME_FUTURE),
+            expected=0)
+
+    @mock.patch('arvados.events.EventClient.__init__')
+    def test_subscribe_poll_with_start_time_future(self, event_client_constr):
+        event_client_constr.side_effect = Exception('All is well')
+        self._test_subscribe(
+            poll_fallback=0.25, expect_type=arvados.events.PollClient,
+            start_time=self.localiso(self.TIME_FUTURE),
+            expected=0)
+
+    def test_subscribe_websocket_with_start_time_past_utc(self):
+        self._test_subscribe(
+            poll_fallback=False, expect_type=arvados.events.EventClient,
+            start_time=self.utciso(self.TIME_PAST),
+            expected=2)
+
+    def test_subscribe_websocket_with_start_time_future_utc(self):
+        self._test_subscribe(
+            poll_fallback=False, expect_type=arvados.events.EventClient,
+            start_time=self.utciso(self.TIME_FUTURE),
+            expected=0)
+
+    def utciso(self, t):
+        return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t))
+
+    def localiso(self, t):
+        return time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime(t)) + self.isotz(-time.timezone/60)
+
+    def isotz(self, offset):
+        """Convert minutes-east-of-UTC to ISO8601 time zone designator"""
+        return '{:+03d}{:02d}'.format(offset/60, offset%60)
+
+    # Test websocket reconnection on (un)expected close
+    def _test_websocket_reconnect(self, close_unexpected):
+        run_test_server.authorize_with('active')
+        events = Queue.Queue(100)
+
+        logstream = io.BytesIO()
+        rootLogger = logging.getLogger()
+        streamHandler = logging.StreamHandler(logstream)
+        rootLogger.addHandler(streamHandler)
+
+        filters = [['object_uuid', 'is_a', 'arvados#human']]
+        filters.append(['created_at', '>=', self.localiso(self.TIME_PAST)])
+        self.ws = arvados.events.subscribe(
+            arvados.api('v1'), filters,
+            events.put_nowait,
+            poll_fallback=False,
+            last_log_id=None)
+        self.assertIsInstance(self.ws, arvados.events.EventClient)
+        self.assertEqual(200, events.get(True, 5)['status'])
+
+        # create obj
+        human = arvados.api('v1').humans().create(body={}).execute()
+
+        # expect an event
+        self.assertIn(human['uuid'], events.get(True, 5)['object_uuid'])
+        with self.assertRaises(Queue.Empty):
+            self.assertEqual(events.get(True, 2), None)
+
+        # close (im)properly
+        if close_unexpected:
+            self.ws.ec.close_connection()
+        else:
+            self.ws.close()
+
+        # create one more obj
+        human2 = arvados.api('v1').humans().create(body={}).execute()
+
+        # (un)expect the object creation event
+        if close_unexpected:
+            log_object_uuids = []
+            for i in range(0, 2):
+                event = events.get(True, 5)
+                if event.get('object_uuid') is not None:
+                    log_object_uuids.append(event['object_uuid'])
+            with self.assertRaises(Queue.Empty):
+                self.assertEqual(events.get(True, 2), None)
+            self.assertNotIn(human['uuid'], log_object_uuids)
+            self.assertIn(human2['uuid'], log_object_uuids)
+        else:
+            with self.assertRaises(Queue.Empty):
+                self.assertEqual(events.get(True, 2), None)
+
+        # inspect the log to confirm whether an unexpected close (and reconnect) was reported
+        log_messages = logstream.getvalue()
+        closeLogFound = log_messages.find("Unexpected close. Reconnecting.")
+        retryLogFound = log_messages.find("Error during websocket reconnect. Will retry")
+        if close_unexpected:
+            self.assertNotEqual(closeLogFound, -1)
+        else:
+            self.assertEqual(closeLogFound, -1)
+        rootLogger.removeHandler(streamHandler)
+
+    def test_websocket_reconnect_on_unexpected_close(self):
+        self._test_websocket_reconnect(True)
+
+    def test_websocket_no_reconnect_on_close_by_user(self):
+        self._test_websocket_reconnect(False)
+
+    # Test websocket reconnection retry
+    @mock.patch('arvados.events._EventClient.connect')
+    def test_websocket_reconnect_retry(self, event_client_connect):
+        event_client_connect.side_effect = [None, Exception('EventClient.connect error'), None]
+
+        logstream = io.BytesIO()
+        rootLogger = logging.getLogger()
+        streamHandler = logging.StreamHandler(logstream)
+        rootLogger.addHandler(streamHandler)
+
+        run_test_server.authorize_with('active')
+        events = Queue.Queue(100)
+
+        filters = [['object_uuid', 'is_a', 'arvados#human']]
+        self.ws = arvados.events.subscribe(
+            arvados.api('v1'), filters,
+            events.put_nowait,
+            poll_fallback=False,
+            last_log_id=None)
+        self.assertIsInstance(self.ws, arvados.events.EventClient)
+
+        # simulate improper close
+        self.ws.on_closed()
+
+        # verify log messages to ensure retry happened
+        log_messages = logstream.getvalue()
+        found = log_messages.find("Error 'EventClient.connect error' during websocket reconnect.")
+        self.assertNotEqual(found, -1)
+        rootLogger.removeHandler(streamHandler)
+
+    @mock.patch('arvados.events._EventClient')
+    def test_subscribe_method(self, websocket_client):
+        filters = [['object_uuid', 'is_a', 'arvados#human']]
+        client = arvados.events.EventClient(
+            self.MOCK_WS_URL, [], lambda event: None, None)
+        client.subscribe(filters[:], 99)
+        websocket_client().subscribe.assert_called_with(filters, 99)
+
+    @mock.patch('arvados.events._EventClient')
+    def test_unsubscribe(self, websocket_client):
+        filters = [['object_uuid', 'is_a', 'arvados#human']]
+        client = arvados.events.EventClient(
+            self.MOCK_WS_URL, filters[:], lambda event: None, None)
+        client.unsubscribe(filters[:])
+        websocket_client().unsubscribe.assert_called_with(filters)
+
+    @mock.patch('arvados.events._EventClient')
+    def test_run_forever_survives_reconnects(self, websocket_client):
+        connection_cond = threading.Condition()
+        def ws_connect():
+            with connection_cond:
+                connection_cond.notify_all()
+        websocket_client().connect.side_effect = ws_connect
+        client = arvados.events.EventClient(
+            self.MOCK_WS_URL, [], lambda event: None, None)
+        with connection_cond:
+            forever_thread = threading.Thread(target=client.run_forever)
+            forever_thread.start()
+            # Simulate an unexpected disconnect, and wait for reconnect.
+            close_thread = threading.Thread(target=client.on_closed)
+            close_thread.start()
+            connection_cond.wait()
+        close_thread.join()
+        run_forever_alive = forever_thread.is_alive()
+        client.close()
+        forever_thread.join()
+        self.assertTrue(run_forever_alive)
+        self.assertEqual(2, websocket_client().connect.call_count)
+
+
+class PollClientTestCase(unittest.TestCase):
+    class MockLogs(object):
+        def __init__(self):
+            self.logs = []
+            self.lock = threading.Lock()
+
+        def add(self, log):
+            with self.lock:
+                self.logs.append(log)
+
+        def return_list(self, num_retries=None):
+            with self.lock:
+                retval = self.logs
+                self.logs = []
+            return {'items': retval, 'items_available': len(retval)}
+
+
+    def setUp(self):
+        self.logs = self.MockLogs()
+        self.arv = mock.MagicMock(name='arvados.api()')
+        self.arv.logs().list().execute.side_effect = self.logs.return_list
+        self.callback_cond = threading.Condition()
+        self.recv_events = []
+
+    def tearDown(self):
+        if hasattr(self, 'client'):
+            self.client.close(timeout=None)
+
+    def callback(self, event):
+        with self.callback_cond:
+            self.recv_events.append(event)
+            self.callback_cond.notify_all()
+
+    def build_client(self, filters=None, callback=None, last_log_id=None, poll_time=99):
+        if filters is None:
+            filters = []
+        if callback is None:
+            callback = self.callback
+        self.client = arvados.events.PollClient(
+            self.arv, filters, callback, poll_time, last_log_id)
+
+    def was_filter_used(self, target):
+        return any(target in call[-1].get('filters', [])
+                   for call in self.arv.logs().list.call_args_list)
+
+    def test_callback(self):
+        test_log = {'id': 12345, 'testkey': 'testtext'}
+        self.logs.add({'id': 123})
+        self.build_client(poll_time=.01)
+        with self.callback_cond:
+            self.client.start()
+            self.callback_cond.wait()
+            self.logs.add(test_log.copy())
+            self.callback_cond.wait()
+        self.client.close(timeout=None)
+        self.assertIn(test_log, self.recv_events)
+
+    def test_subscribe(self):
+        client_filter = ['kind', '=', 'arvados#test']
+        self.build_client()
+        self.client.subscribe([client_filter[:]])
+        with self.callback_cond:
+            self.client.start()
+            self.callback_cond.wait()
+        self.client.close(timeout=None)
+        self.assertTrue(self.was_filter_used(client_filter))
+
+    def test_unsubscribe(self):
+        client_filter = ['kind', '=', 'arvados#test']
+        self.build_client()
+        self.client.subscribe([client_filter[:]])
+        self.client.unsubscribe([client_filter[:]])
+        self.client.start()
+        self.client.close(timeout=None)
+        self.assertFalse(self.was_filter_used(client_filter))
+
+    def test_run_forever(self):
+        self.build_client()
+        with self.callback_cond:
+            self.client.start()
+            forever_thread = threading.Thread(target=self.client.run_forever)
+            forever_thread.start()
+            self.callback_cond.wait()
+        self.assertTrue(forever_thread.is_alive())
+        self.client.close()
+        forever_thread.join()
index 9a0fe80c93ea445e73744268ef5147b656b3e4d6..5fbab7dc0ac70606f81114cc3a657ca49e43cf7d 100644 (file)
@@ -69,6 +69,7 @@ class KeepTestCase(run_test_server.TestCaseWithServers):
                          blob_str,
                          'wrong content from Keep.get(md5(<binarydata>))')
 
+    @unittest.skip("unreliable test - please fix and close #8752")
     def test_KeepSingleCopyRWTest(self):
         blob_str = '\xff\xfe\xfd\xfc\x00\x01\x02\x03'
         blob_locator = self.keep_client.put(blob_str, copies=1)
@@ -103,6 +104,17 @@ class KeepTestCase(run_test_server.TestCaseWithServers):
             # Must be a string type
             self.keep_client.put({})
 
+    def test_KeepHeadTest(self):
+        locator = self.keep_client.put('test_head')
+        self.assertRegexpMatches(
+            locator,
+            '^b9a772c7049325feb7130fff1f8333e9\+9',
+            'wrong md5 hash from Keep.put for "test_head": ' + locator)
+        self.assertEqual(True, self.keep_client.head(locator))
+        self.assertEqual(self.keep_client.get(locator),
+                         'test_head',
+                         'wrong content from Keep.get for "test_head"')
+
 class KeepPermissionTestCase(run_test_server.TestCaseWithServers):
     MAIN_SERVER = {}
     KEEP_SERVER = {'blob_signing_key': 'abcdefghijk0123456789',
@@ -317,6 +329,23 @@ class KeepClientServiceTestCase(unittest.TestCase, tutil.ApiClientMock):
                 mock.responses[0].getopt(pycurl.LOW_SPEED_LIMIT),
                 int(arvados.KeepClient.DEFAULT_TIMEOUT[2]))
 
+    def test_head_timeout(self):
+        api_client = self.mock_keep_services(count=1)
+        force_timeout = socket.timeout("timed out")
+        with tutil.mock_keep_responses(force_timeout, 0) as mock:
+            keep_client = arvados.KeepClient(api_client=api_client)
+            with self.assertRaises(arvados.errors.KeepReadError):
+                keep_client.head('ffffffffffffffffffffffffffffffff')
+            self.assertEqual(
+                mock.responses[0].getopt(pycurl.CONNECTTIMEOUT_MS),
+                int(arvados.KeepClient.DEFAULT_TIMEOUT[0]*1000))
+            self.assertEqual(
+                mock.responses[0].getopt(pycurl.LOW_SPEED_TIME),
+                int(arvados.KeepClient.DEFAULT_TIMEOUT[1]))
+            self.assertEqual(
+                mock.responses[0].getopt(pycurl.LOW_SPEED_LIMIT),
+                int(arvados.KeepClient.DEFAULT_TIMEOUT[2]))
+
     def test_proxy_get_timeout(self):
         api_client = self.mock_keep_services(service_type='proxy', count=1)
         force_timeout = socket.timeout("timed out")
@@ -334,6 +363,23 @@ class KeepClientServiceTestCase(unittest.TestCase, tutil.ApiClientMock):
                 mock.responses[0].getopt(pycurl.LOW_SPEED_LIMIT),
                 int(arvados.KeepClient.DEFAULT_PROXY_TIMEOUT[2]))
 
+    def test_proxy_head_timeout(self):
+        api_client = self.mock_keep_services(service_type='proxy', count=1)
+        force_timeout = socket.timeout("timed out")
+        with tutil.mock_keep_responses(force_timeout, 0) as mock:
+            keep_client = arvados.KeepClient(api_client=api_client)
+            with self.assertRaises(arvados.errors.KeepReadError):
+                keep_client.head('ffffffffffffffffffffffffffffffff')
+            self.assertEqual(
+                mock.responses[0].getopt(pycurl.CONNECTTIMEOUT_MS),
+                int(arvados.KeepClient.DEFAULT_PROXY_TIMEOUT[0]*1000))
+            self.assertEqual(
+                mock.responses[0].getopt(pycurl.LOW_SPEED_TIME),
+                int(arvados.KeepClient.DEFAULT_PROXY_TIMEOUT[1]))
+            self.assertEqual(
+                mock.responses[0].getopt(pycurl.LOW_SPEED_LIMIT),
+                int(arvados.KeepClient.DEFAULT_PROXY_TIMEOUT[2]))
+
     def test_proxy_put_timeout(self):
         api_client = self.mock_keep_services(service_type='proxy', count=1)
         force_timeout = socket.timeout("timed out")
@@ -363,6 +409,9 @@ class KeepClientServiceTestCase(unittest.TestCase, tutil.ApiClientMock):
     def test_get_error_with_no_services(self):
         self.check_no_services_error('get', arvados.errors.KeepReadError)
 
+    def test_head_error_with_no_services(self):
+        self.check_no_services_error('head', arvados.errors.KeepReadError)
+
     def test_put_error_with_no_services(self):
         self.check_no_services_error('put', arvados.errors.KeepWriteError)
 
@@ -382,6 +431,9 @@ class KeepClientServiceTestCase(unittest.TestCase, tutil.ApiClientMock):
     def test_get_error_reflects_last_retry(self):
         self.check_errors_from_last_retry('get', arvados.errors.KeepReadError)
 
+    def test_head_error_reflects_last_retry(self):
+        self.check_errors_from_last_retry('head', arvados.errors.KeepReadError)
+
     def test_put_error_reflects_last_retry(self):
         self.check_errors_from_last_retry('put', arvados.errors.KeepWriteError)
 
@@ -476,6 +528,10 @@ class KeepClientRendezvousTestCase(unittest.TestCase, tutil.ApiClientMock):
         self._test_probe_order_against_reference_set(
             lambda i: self.keep_client.get(self.hashes[i], num_retries=1))
 
+    def test_head_probe_order_against_reference_set(self):
+        self._test_probe_order_against_reference_set(
+            lambda i: self.keep_client.head(self.hashes[i], num_retries=1))
+
     def test_put_probe_order_against_reference_set(self):
         # copies=1 prevents the test from being sensitive to races
         # between writer threads.
@@ -686,6 +742,9 @@ class KeepClientTimeout(unittest.TestCase, tutil.ApiClientMock):
         with self.assertTakesGreater(self.TIMEOUT_TIME):
             with self.assertRaises(arvados.errors.KeepWriteError):
                 kc.put(self.DATA, copies=1, num_retries=0)
+        with self.assertTakesGreater(self.TIMEOUT_TIME):
+            with self.assertRaises(arvados.errors.KeepReadError) as e:
+                kc.head(loc, num_retries=0)
 
     def test_low_bandwidth_with_server_mid_delay_failure(self):
         kc = self.keepClient()
@@ -768,6 +827,7 @@ class KeepClientGatewayTestCase(unittest.TestCase, tutil.ApiClientMock):
         self.assertEqual('foo', self.keepClient.get(locator))
         self.assertEqual(self.gateway_roots[0]+locator,
                          MockCurl.return_value.getopt(pycurl.URL))
+        self.assertEqual(True, self.keepClient.head(locator))
 
     @mock.patch('pycurl.Curl')
     def test_get_with_gateway_hints_in_order(self, MockCurl):
@@ -793,6 +853,30 @@ class KeepClientGatewayTestCase(unittest.TestCase, tutil.ApiClientMock):
                 mocks[i].getopt(pycurl.URL),
                 r'keep0x')
 
+    @mock.patch('pycurl.Curl')
+    def test_head_with_gateway_hints_in_order(self, MockCurl):
+        gateways = 4
+        disks = 3
+        mocks = [
+            tutil.FakeCurl.make(code=404, body='')
+            for _ in range(gateways+disks)
+        ]
+        MockCurl.side_effect = tutil.queue_with(mocks)
+        self.mock_disks_and_gateways(gateways=gateways, disks=disks)
+        locator = '+'.join(['acbd18db4cc2f85cedef654fccc4a4d8+3'] +
+                           ['K@'+gw['uuid'] for gw in self.gateways])
+        with self.assertRaises(arvados.errors.NotFoundError):
+            self.keepClient.head(locator)
+        # Gateways are tried first, in the order given.
+        for i, root in enumerate(self.gateway_roots):
+            self.assertEqual(root+locator,
+                             mocks[i].getopt(pycurl.URL))
+        # Disk services are tried next.
+        for i in range(gateways, gateways+disks):
+            self.assertRegexpMatches(
+                mocks[i].getopt(pycurl.URL),
+                r'keep0x')
+
     @mock.patch('pycurl.Curl')
     def test_get_with_remote_proxy_hint(self, MockCurl):
         MockCurl.return_value = tutil.FakeCurl.make(
@@ -803,6 +887,16 @@ class KeepClientGatewayTestCase(unittest.TestCase, tutil.ApiClientMock):
         self.assertEqual('https://keep.xyzzy.arvadosapi.com/'+locator,
                          MockCurl.return_value.getopt(pycurl.URL))
 
+    @mock.patch('pycurl.Curl')
+    def test_head_with_remote_proxy_hint(self, MockCurl):
+        MockCurl.return_value = tutil.FakeCurl.make(
+            code=200, body='foo', headers={'Content-Length': 3})
+        self.mock_disks_and_gateways()
+        locator = 'acbd18db4cc2f85cedef654fccc4a4d8+3+K@xyzzy'
+        self.assertEqual(True, self.keepClient.head(locator))
+        self.assertEqual('https://keep.xyzzy.arvadosapi.com/'+locator,
+                         MockCurl.return_value.getopt(pycurl.URL))
+
 
 class KeepClientRetryTestMixin(object):
     # Testing with a local Keep store won't exercise the retry behavior.
@@ -918,6 +1012,43 @@ class KeepClientRetryGetTestCase(KeepClientRetryTestMixin, unittest.TestCase):
                 (self.DEFAULT_EXPECT, 200)):
             self.check_success(locator=self.HINTED_LOCATOR)
 
+@tutil.skip_sleep
+class KeepClientRetryHeadTestCase(KeepClientRetryTestMixin, unittest.TestCase):
+    DEFAULT_EXPECT = True
+    DEFAULT_EXCEPTION = arvados.errors.KeepReadError
+    HINTED_LOCATOR = KeepClientRetryTestMixin.TEST_LOCATOR + '+K@xyzzy'
+    TEST_PATCHER = staticmethod(tutil.mock_keep_responses)
+
+    def run_method(self, locator=KeepClientRetryTestMixin.TEST_LOCATOR,
+                   *args, **kwargs):
+        return self.new_client().head(locator, *args, **kwargs)
+
+    def test_specific_exception_when_not_found(self):
+        with tutil.mock_keep_responses(self.DEFAULT_EXPECT, 404, 200):
+            self.check_exception(arvados.errors.NotFoundError, num_retries=3)
+
+    def test_general_exception_with_mixed_errors(self):
+        # head should raise a NotFoundError if no server returns the block,
+        # and a high threshold of servers report that it's not found.
+        # This test rigs up 50/50 disagreement between two servers, and
+        # checks that it does not become a NotFoundError.
+        client = self.new_client()
+        with tutil.mock_keep_responses(self.DEFAULT_EXPECT, 404, 500):
+            with self.assertRaises(arvados.errors.KeepReadError) as exc_check:
+                client.head(self.HINTED_LOCATOR)
+            self.assertNotIsInstance(
+                exc_check.exception, arvados.errors.NotFoundError,
+                "mixed errors raised NotFoundError")
+
+    def test_hint_server_can_succeed_without_retries(self):
+        with tutil.mock_keep_responses(self.DEFAULT_EXPECT, 404, 200, 500):
+            self.check_success(locator=self.HINTED_LOCATOR)
+
+    def test_try_next_server_after_timeout(self):
+        with tutil.mock_keep_responses(
+                (socket.timeout("timed out"), 200),
+                (self.DEFAULT_EXPECT, 200)):
+            self.check_success(locator=self.HINTED_LOCATOR)
 
 @tutil.skip_sleep
 class KeepClientRetryPutTestCase(KeepClientRetryTestMixin, unittest.TestCase):
index c41c42e762cd5e8f856444926716b0c274735bb5..cc12f39a355ef9b97a85a34ee5989e3bae38a744 100644 (file)
@@ -141,7 +141,7 @@ class RetryLoopBackoffTestCase(unittest.TestCase, RetryLoopTestMixin):
 
     def test_backoff_multiplier(self, sleep_mock, time_mock):
         self.run_loop(5, 500, 501, 502, 503, 504, 505,
-                      backoff_start=5, backoff_growth=10)
+                      backoff_start=5, backoff_growth=10, max_wait=1000000000)
         self.check_backoff(sleep_mock, 5, 9)
 
 
index 6c3bd61414173fb64fe9ef7b7b1b44dcc4af6d9d..624f1b8ca4391678215539f70c2a28b00fd37388 100644 (file)
@@ -184,6 +184,19 @@ class StreamFileReaderTestCase(unittest.TestCase):
     def test_bz2_decompression(self):
         self.check_decompression('bz2', bz2.compress)
 
+    def test_readline_then_readlines(self):
+        reader = self.make_newlines_reader()
+        data = reader.readline()
+        self.assertEqual('one\n', data)
+        data = reader.readlines()
+        self.assertEqual(['two\n', '\n', 'three\n', 'four\n', '\n'], data)
+
+    def test_readline_then_readall(self):
+        reader = self.make_newlines_reader()
+        data = reader.readline()
+        self.assertEqual('one\n', data)
+        self.assertEqual(''.join(['two\n', '\n', 'three\n', 'four\n', '\n']), ''.join(reader.readall()))
+
 
 class StreamRetryTestMixin(object):
     # Define reader_for(coll_name, **kwargs)
diff --git a/sdk/python/tests/test_websockets.py b/sdk/python/tests/test_websockets.py
deleted file mode 100644 (file)
index 37b644a..0000000
+++ /dev/null
@@ -1,122 +0,0 @@
-import arvados
-import arvados.events
-from datetime import datetime, timedelta, tzinfo
-import mock
-import Queue
-import run_test_server
-import threading
-import time
-import unittest
-
-class WebsocketTest(run_test_server.TestCaseWithServers):
-    MAIN_SERVER = {}
-
-    TIME_PAST = time.time()-3600
-    TIME_FUTURE = time.time()+3600
-
-    def setUp(self):
-        self.ws = None
-
-    def tearDown(self):
-        if self.ws:
-            self.ws.close()
-        super(WebsocketTest, self).tearDown()
-        run_test_server.reset()
-
-    def _test_subscribe(self, poll_fallback, expect_type, start_time=None, expected=1):
-        run_test_server.authorize_with('active')
-        events = Queue.Queue(100)
-
-        # Create ancestor before subscribing.
-        # When listening with start_time in the past, this should also be retrieved.
-        # However, when start_time is omitted in subscribe, this should not be fetched.
-        ancestor = arvados.api('v1').humans().create(body={}).execute()
-
-        filters = [['object_uuid', 'is_a', 'arvados#human']]
-        if start_time:
-            filters.append(['created_at', '>=', start_time])
-
-        self.ws = arvados.events.subscribe(
-            arvados.api('v1'), filters,
-            events.put_nowait,
-            poll_fallback=poll_fallback,
-            last_log_id=(1 if start_time else None))
-        self.assertIsInstance(self.ws, expect_type)
-        self.assertEqual(200, events.get(True, 5)['status'])
-        human = arvados.api('v1').humans().create(body={}).execute()
-
-        log_object_uuids = []
-        for i in range(0, expected):
-            log_object_uuids.append(events.get(True, 5)['object_uuid'])
-
-        if expected > 0:
-            self.assertIn(human['uuid'], log_object_uuids)
-
-        if expected > 1:
-            self.assertIn(ancestor['uuid'], log_object_uuids)
-
-        with self.assertRaises(Queue.Empty):
-            # assertEqual just serves to show us what unexpected thing
-            # comes out of the queue when the assertRaises fails; when
-            # the test passes, this assertEqual doesn't get called.
-            self.assertEqual(events.get(True, 2), None)
-
-    def test_subscribe_websocket(self):
-        self._test_subscribe(
-            poll_fallback=False, expect_type=arvados.events.EventClient, expected=1)
-
-    @mock.patch('arvados.events.EventClient.__init__')
-    def test_subscribe_poll(self, event_client_constr):
-        event_client_constr.side_effect = Exception('All is well')
-        self._test_subscribe(
-            poll_fallback=0.25, expect_type=arvados.events.PollClient, expected=1)
-
-    def test_subscribe_websocket_with_start_time_past(self):
-        self._test_subscribe(
-            poll_fallback=False, expect_type=arvados.events.EventClient,
-            start_time=self.localiso(self.TIME_PAST),
-            expected=2)
-
-    @mock.patch('arvados.events.EventClient.__init__')
-    def test_subscribe_poll_with_start_time_past(self, event_client_constr):
-        event_client_constr.side_effect = Exception('All is well')
-        self._test_subscribe(
-            poll_fallback=0.25, expect_type=arvados.events.PollClient,
-            start_time=self.localiso(self.TIME_PAST),
-            expected=2)
-
-    def test_subscribe_websocket_with_start_time_future(self):
-        self._test_subscribe(
-            poll_fallback=False, expect_type=arvados.events.EventClient,
-            start_time=self.localiso(self.TIME_FUTURE),
-            expected=0)
-
-    @mock.patch('arvados.events.EventClient.__init__')
-    def test_subscribe_poll_with_start_time_future(self, event_client_constr):
-        event_client_constr.side_effect = Exception('All is well')
-        self._test_subscribe(
-            poll_fallback=0.25, expect_type=arvados.events.PollClient,
-            start_time=self.localiso(self.TIME_FUTURE),
-            expected=0)
-
-    def test_subscribe_websocket_with_start_time_past_utc(self):
-        self._test_subscribe(
-            poll_fallback=False, expect_type=arvados.events.EventClient,
-            start_time=self.utciso(self.TIME_PAST),
-            expected=2)
-
-    def test_subscribe_websocket_with_start_time_future_utc(self):
-        self._test_subscribe(
-            poll_fallback=False, expect_type=arvados.events.EventClient,
-            start_time=self.utciso(self.TIME_FUTURE),
-            expected=0)
-
-    def utciso(self, t):
-        return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t))
-
-    def localiso(self, t):
-        return time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime(t)) + self.isotz(-time.timezone/60)
-
-    def isotz(self, offset):
-        """Convert minutes-east-of-UTC to ISO8601 time zone designator"""
-        return '{:+03d}{:02d}'.format(offset/60, offset%60)
index 3d090f4b5cc69aa23eec3dc057041c200024ed9a..2c4e60eeb8f1e664e2a4ed62879381fbb75d6e01 100644 (file)
@@ -18,13 +18,17 @@ Gem::Specification.new do |s|
   s.files       = ["lib/arvados.rb", "lib/arvados/google_api_client.rb",
                    "lib/arvados/collection.rb", "lib/arvados/keep.rb",
                    "README", "LICENSE-2.0.txt"]
-  s.required_ruby_version = '>= 2.1.0'
+  s.required_ruby_version = '>= 1.8.7'
   # activesupport <4.2.6 only because https://dev.arvados.org/issues/8222
-  s.add_dependency('activesupport', '>= 3.2.13', '< 4.2.6')
+  s.add_dependency('activesupport', '>= 3', '< 4.2.6')
   s.add_dependency('andand', '~> 1.3', '>= 1.3.3')
-  s.add_dependency('google-api-client', '~> 0.6.3', '>= 0.6.3')
+  # Our google-api-client dependency used to be < 0.9, but that could be
+  # satisfied by the buggy 0.9.pre*.  https://dev.arvados.org/issues/9213
+  s.add_dependency('google-api-client', '>= 0.7', '< 0.8.9')
+  # work around undeclared dependency on i18n in some activesupport 3.x.x:
+  s.add_dependency('i18n', '~> 0')
   s.add_dependency('json', '~> 1.7', '>= 1.7.7')
-  s.add_runtime_dependency('jwt', '>= 0.1.5', '< 1.0.0')
+  s.add_runtime_dependency('jwt', '<2', '>= 0.1.5')
   s.homepage    =
     'https://arvados.org'
 end
index 753c518b3191ebbfefbd4407ca67c2f9b83daa45..7a3f4b4226210646cbdd098d5594ff354f42778e 100644 (file)
@@ -209,7 +209,7 @@ class Arvados
                 :parameters => parameters,
                 :body_object => body,
                 :headers => {
-                  authorization: 'OAuth2 '+arvados.config['ARVADOS_API_TOKEN']
+                  :authorization => 'OAuth2 '+arvados.config['ARVADOS_API_TOKEN']
                 })
       resp = JSON.parse result.body, :symbolize_names => true
       if resp[:errors]
@@ -217,7 +217,7 @@ class Arvados
       elsif resp[:uuid] and resp[:etag]
         self.new(resp)
       elsif resp[:items].is_a? Array
-        resp.merge(items: resp[:items].collect do |i|
+        resp.merge(:items => resp[:items].collect do |i|
                      self.new(i)
                    end)
       else
index 07b751908f7da26b93fd5321fe8a5c192872a8d6..474241dc41832e657b53d4d55b5746e479e79075 100644 (file)
@@ -44,7 +44,7 @@ module Arv
     end
 
     def cp_r(source, target, source_collection=nil)
-      opts = {descend_target: !source.end_with?("/")}
+      opts = {:descend_target => !source.end_with?("/")}
       copy(:merge, source.chomp("/"), target, source_collection, opts)
     end
 
@@ -70,7 +70,7 @@ module Arv
     end
 
     def rm_r(source)
-      remove(source, recursive: true)
+      remove(source, :recursive => true)
     end
 
     protected
@@ -155,7 +155,7 @@ module Arv
       modified
     end
 
-    LocatorSegment = Struct.new(:locators, :start_pos, :length)
+    Struct.new("LocatorSegment", :locators, :start_pos, :length)
 
     class LocatorRange < Range
       attr_reader :locator
@@ -187,9 +187,9 @@ module Arv
           end_index = search_for_byte(start_pos + length - 1, start_index)
         end
         seg_ranges = @ranges[start_index..end_index]
-        LocatorSegment.new(seg_ranges.map(&:locator),
-                           start_pos - seg_ranges.first.begin,
-                           length)
+        Struct::LocatorSegment.new(seg_ranges.map(&:locator),
+                                   start_pos - seg_ranges.first.begin,
+                                   length)
       end
 
       private
index 3c6b26b765f59c4938465aaa7dcc589187fa7722..489eeeeebb7e11a3ea3dda78375b33809fdf97a6 100644 (file)
@@ -47,19 +47,19 @@ module Keep
         raise ArgumentError.new "locator is nil or empty"
       end
 
-      m = LOCATOR_REGEXP.match(tok.strip)
+      m = LOCATOR_REGEXP.match(tok)
       unless m
         raise ArgumentError.new "not a valid locator #{tok}"
       end
 
-      tokhash, _, toksize, _, trailer = m[1..5]
+      tokhash, _, toksize, _, _, trailer = m[1..6]
       tokhints = []
       if trailer
         trailer.split('+').each do |hint|
-          if hint =~ /^[[:upper:]][[:alnum:]@_-]+$/
+          if hint =~ /^[[:upper:]][[:alnum:]@_-]*$/
             tokhints.push(hint)
           else
-            raise ArgumentError.new "unknown hint #{hint}"
+            raise ArgumentError.new "invalid hint #{hint}"
           end
         end
       end
index 5ed9cfc2b186b6e2122d41a879603054962bd0d3..fa1dc3f2e83da930e84e2f468f1b083f91cd965d 100644 (file)
@@ -266,6 +266,8 @@ class ManifestTest < Minitest::Test
    [true, 'd41d8cd98f00b204e9800998ecf8427e+0', '+0','0',nil],
    [true, 'd41d8cd98f00b204e9800998ecf8427e+0+Fizz+Buzz','+0','0','+Fizz+Buzz'],
    [true, 'd41d8cd98f00b204e9800998ecf8427e+Fizz+Buzz', nil,nil,'+Fizz+Buzz'],
+   [true, 'd41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo', '+0','0','+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo'],
+   [true, 'd41d8cd98f00b204e9800998ecf8427e+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo', nil,nil,'+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo'],
    [true, 'd41d8cd98f00b204e9800998ecf8427e+0+Z', '+0','0','+Z'],
    [true, 'd41d8cd98f00b204e9800998ecf8427e+Z', nil,nil,'+Z'],
   ].each do |ok, locator, match2, match3, match4|
@@ -278,6 +280,18 @@ class ManifestTest < Minitest::Test
         assert_equal match4, match[4]
       end
     end
+    define_method "test_parse_method_on_#{locator.inspect}" do
+      loc = Keep::Locator.parse locator
+      if !ok
+        assert_nil loc
+      else
+        refute_nil loc
+        assert loc.is_a?(Keep::Locator)
+        #assert loc.hash
+        #assert loc.size
+        #assert loc.hints.is_a?(Array)
+      end
+    end
   end
 
   [
@@ -301,6 +315,7 @@ class ManifestTest < Minitest::Test
     [true, ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\040\n"],
     [true, ". 00000000000000000000000000000000+0 0:0:0\n"],
     [true, ". 00000000000000000000000000000000+0 0:0:d41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000@ffffffff\n"],
+    [true, ". d41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000@ffffffff 0:0:empty.txt\n"],
     [false, '. d41d8cd98f00b204e9800998ecf8427e 0:0:abc.txt',
       "Invalid manifest: does not end with newline"],
     [false, "abc d41d8cd98f00b204e9800998ecf8427e 0:0:abc.txt\n",
index 48998aad36d10f7630f24e574a4d278968dc7032..1e25467e7485ed05337e2b212f4cf4fc4bc44d1e 100644 (file)
@@ -67,7 +67,6 @@ gem 'andand'
 
 gem 'test_after_commit', :group => :test
 
-gem 'google-api-client', '~> 0.6.3'
 gem 'trollop'
 gem 'faye-websocket'
 
index ac6be5a522303fd7418ba69974165e3b2821e17f..7be4e0f39d5df3253944f3884008980b9076d58a 100644 (file)
@@ -32,23 +32,23 @@ GEM
       activemodel (>= 3.0.0)
       activesupport (>= 3.0.0)
       rack (>= 1.1.0)
-    addressable (2.3.8)
+    addressable (2.4.0)
     andand (1.3.3)
     arel (3.0.3)
-    arvados (0.1.20150615153458)
-      activesupport (>= 3.2.13)
+    arvados (0.1.20160420143004)
+      activesupport (>= 3, < 4.2.6)
       andand (~> 1.3, >= 1.3.3)
-      google-api-client (~> 0.6.3, >= 0.6.3)
+      google-api-client (>= 0.7, < 0.9)
+      i18n (~> 0)
       json (~> 1.7, >= 1.7.7)
-      jwt (>= 0.1.5, < 1.0.0)
-    arvados-cli (0.1.20151207150126)
+      jwt (>= 0.1.5, < 2)
+    arvados-cli (0.1.20160503204200)
       activesupport (~> 3.2, >= 3.2.13)
       andand (~> 1.3, >= 1.3.3)
       arvados (~> 0.1, >= 0.1.20150128223554)
       curb (~> 0.8)
-      google-api-client (~> 0.6.3, >= 0.6.3)
+      google-api-client (~> 0.6, >= 0.6.3, < 0.9)
       json (~> 1.7, >= 1.7.7)
-      jwt (>= 0.1.5, < 1.0.0)
       oj (~> 2.0, >= 2.0.3)
       trollop (~> 2.0)
     autoparse (0.3.3)
@@ -69,7 +69,7 @@ GEM
       coffee-script-source
       execjs
     coffee-script-source (1.7.0)
-    curb (0.8.8)
+    curb (0.9.3)
     daemon_controller (1.2.0)
     database_cleaner (1.2.0)
     erubis (2.7.0)
@@ -81,20 +81,21 @@ GEM
     factory_girl_rails (4.4.1)
       factory_girl (~> 4.4.0)
       railties (>= 3.0.0)
-    faraday (0.8.9)
-      multipart-post (~> 1.2.0)
+    faraday (0.9.2)
+      multipart-post (>= 1.2, < 3)
     faye-websocket (0.7.2)
       eventmachine (>= 0.12.0)
       websocket-driver (>= 0.3.1)
-    google-api-client (0.6.4)
+    google-api-client (0.7.1)
       addressable (>= 2.3.2)
       autoparse (>= 0.3.3)
       extlib (>= 0.9.15)
-      faraday (~> 0.8.4)
+      faraday (>= 0.9.0)
       jwt (>= 0.1.5)
       launchy (>= 2.1.1)
       multi_json (>= 1.0.0)
-      signet (~> 0.4.5)
+      retriable (>= 1.4)
+      signet (>= 0.5.0)
       uuidtools (>= 2.1.0)
     hashie (1.2.0)
     highline (1.6.21)
@@ -118,8 +119,8 @@ GEM
     mime-types (1.25.1)
     mocha (1.1.0)
       metaclass (~> 0.0.1)
-    multi_json (1.11.1)
-    multipart-post (1.2.0)
+    multi_json (1.12.0)
+    multipart-post (2.0.0)
     net-scp (1.2.0)
       net-ssh (>= 2.6.5)
     net-sftp (2.1.2)
@@ -133,7 +134,7 @@ GEM
       jwt (~> 0.1.4)
       multi_json (~> 1.0)
       rack (~> 1.2)
-    oj (2.11.4)
+    oj (2.15.0)
     omniauth (1.1.1)
       hashie (~> 1.2)
       rack
@@ -177,6 +178,7 @@ GEM
     rdoc (3.12.2)
       json (~> 1.4)
     ref (1.0.5)
+    retriable (2.1.0)
     ruby-prof (0.15.2)
     rvm-capistrano (1.5.1)
       capistrano (~> 2.15.4)
@@ -185,9 +187,9 @@ GEM
       railties (~> 3.2.0)
       sass (>= 3.1.10)
       tilt (~> 1.3)
-    signet (0.4.5)
+    signet (0.5.1)
       addressable (>= 2.2.3)
-      faraday (~> 0.8.1)
+      faraday (>= 0.9.0.rc5)
       jwt (>= 0.1.5)
       multi_json (>= 1.0.0)
     simplecov (0.7.1)
@@ -213,7 +215,7 @@ GEM
     treetop (1.4.15)
       polyglot
       polyglot (>= 0.3.1)
-    trollop (2.1.1)
+    trollop (2.1.2)
     tzinfo (0.3.39)
     uglifier (2.5.0)
       execjs (>= 0.3.0)
@@ -233,7 +235,6 @@ DEPENDENCIES
   database_cleaner
   factory_girl_rails
   faye-websocket
-  google-api-client (~> 0.6.3)
   jquery-rails
   mocha
   multi_json
@@ -258,4 +259,4 @@ DEPENDENCIES
   uglifier (>= 1.0.3)
 
 BUNDLED WITH
-   1.10.6
+   1.12.1
index e91e3ce03ec6369addee5d63964ac10e198e251a..3a888184f8a32dd37734228a7f8dacd51c3105f2 100644 (file)
@@ -327,7 +327,7 @@ class ApplicationController < ActionController::Base
     return @attrs if @attrs
     @attrs = params[resource_name]
     if @attrs.is_a? String
-      @attrs = Oj.load @attrs, symbol_keys: true
+      @attrs = Oj.strict_load @attrs, symbol_keys: true
     end
     unless @attrs.is_a? Hash
       message = "No #{resource_name}"
@@ -441,7 +441,7 @@ class ApplicationController < ActionController::Base
 
   def load_json_value(hash, key, must_be_class=nil)
     if hash[key].is_a? String
-      hash[key] = Oj.load(hash[key], symbol_keys: false)
+      hash[key] = Oj.strict_load(hash[key], symbol_keys: false)
       if must_be_class and !hash[key].is_a? must_be_class
         raise TypeError.new("parameter #{key.to_s} must be a #{must_be_class.to_s}")
       end
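
Switching from Oj.load to Oj.strict_load means request parameters are only ever decoded into plain Hash, Array, String, Numeric, true, false, and nil values; Oj.load, depending on the configured mode, can instantiate Ruby objects named inside the JSON. A small sketch of the safe call, with a hypothetical parameter string:

    require 'oj'

    params_json = '{"filters": [["uuid", "=", "zzzzz-dz642-queuedcontainer"]]}'
    attrs = Oj.strict_load(params_json, symbol_keys: true)
    # => {:filters=>[["uuid", "=", "zzzzz-dz642-queuedcontainer"]]}
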
index 56d0d85a82b51b1c0b6e2af981f8053c267ebd88..76acc701fd30194972b1f5491659f76ae8f7862e 100644 (file)
@@ -1,8 +1,9 @@
 class Arvados::V1::ApiClientAuthorizationsController < ApplicationController
   accept_attribute_as_json :scopes, Array
-  before_filter :current_api_client_is_trusted
+  before_filter :current_api_client_is_trusted, :except => [:current]
   before_filter :admin_required, :only => :create_system_auth
-  skip_before_filter :render_404_if_no_object, :only => :create_system_auth
+  skip_before_filter :render_404_if_no_object, :only => [:create_system_auth, :current]
+  skip_before_filter :find_object_by_uuid, :only => [:create_system_auth, :current]
 
   def self._create_system_auth_requires_parameters
     {
@@ -15,7 +16,7 @@ class Arvados::V1::ApiClientAuthorizationsController < ApplicationController
       new(user_id: system_user.id,
           api_client_id: params[:api_client_id] || current_api_client.andand.id,
           created_by_ip_address: remote_ip,
-          scopes: Oj.load(params[:scopes] || '["all"]'))
+          scopes: Oj.strict_load(params[:scopes] || '["all"]'))
     @object.save!
     show
   end
@@ -40,6 +41,11 @@ class Arvados::V1::ApiClientAuthorizationsController < ApplicationController
     super
   end
 
+  def current
+    @object = Thread.current[:api_client_authorization]
+    show
+  end
+
   protected
 
   def default_orders
@@ -69,14 +75,27 @@ class Arvados::V1::ApiClientAuthorizationsController < ApplicationController
         val.is_a?(String) && (attr == 'uuid' || attr == 'api_token')
       }
     end
-    @objects = model_class.
-      includes(:user, :api_client).
-      where('user_id=?', current_user.id)
-    super
-    wanted_scopes.compact.each do |scope_list|
-      sorted_scopes = scope_list.sort
-      @objects = @objects.select { |auth| auth.scopes.sort == sorted_scopes }
+    @objects = model_class.where('user_id=?', current_user.id)
+    if wanted_scopes.compact.any?
+      # We can't filter on scopes effectively using AR/postgres.
+      # Instead we get the entire result set, do our own filtering on
+      # scopes to get a list of UUIDs, then start a new query
+      # (restricted to the selected UUIDs) so super can apply the
+      # offset/limit/order params in the usual way.
+      @request_limit = @limit
+      @request_offset = @offset
+      @limit = @objects.count
+      @offset = 0
+      super
+      wanted_scopes.compact.each do |scope_list|
+        sorted_scopes = scope_list.sort
+        @objects = @objects.select { |auth| auth.scopes.sort == sorted_scopes }
+      end
+      @limit = @request_limit
+      @offset = @request_offset
+      @objects = model_class.where('uuid in (?)', @objects.collect(&:uuid))
     end
+    super
   end
 
   def find_object_by_uuid
@@ -110,8 +129,10 @@ class Arvados::V1::ApiClientAuthorizationsController < ApplicationController
     # The @filters test here also prevents a non-trusted token from
     # filtering on its own scopes, and discovering whether any _other_
     # equally scoped tokens exist (403=yes, 200=no).
-    if (@objects.andand.count == 1 and
-        @objects.first.uuid == current_api_client_authorization.andand.uuid and
+    return forbidden if !@objects
+    full_set = @objects.except(:limit).except(:offset) if @objects
+    if (full_set.count == 1 and
+        full_set.first.uuid == current_api_client_authorization.andand.uuid and
         (@filters.map(&:first) & %w(uuid api_token)).any?)
       return true
     end
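
Together with the route added further down, the new current action lets any authorized client look up the token it is using (UUID, scopes, expiry) without being a trusted client. A hedged sketch over plain HTTP, with a hypothetical API host:

    require 'net/http'
    require 'json'

    uri = URI('https://zzzzz.arvadosapi.com/arvados/v1/api_client_authorizations/current')
    req = Net::HTTP::Get.new(uri)
    req['Authorization'] = "OAuth2 #{ENV['ARVADOS_API_TOKEN']}"
    res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |http| http.request(req) }
    puts JSON.parse(res.body)['uuid']   # the UUID of the token that made the request
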
index 04a5ed0cb2b660b9bd71b214b6ce74dbde9e4ccd..21ee7efa53b5d4008c0f42717095194b9b0c39c6 100644 (file)
@@ -4,4 +4,19 @@ class Arvados::V1::ContainersController < ApplicationController
   accept_attribute_as_json :runtime_constraints, Hash
   accept_attribute_as_json :command, Array
 
+  def auth
+    if @object.locked_by_uuid != Thread.current[:api_client_authorization].uuid
+      raise ArvadosModel::PermissionDeniedError.new("Not locked by your token")
+    end
+    @object = @object.auth
+    show
+  end
+
+  # Updates use row locking to resolve races between multiple
+  # dispatchers trying to lock the same container.
+  def update
+    @object.with_lock do
+      super
+    end
+  end
 end
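
Wrapping update in with_lock makes ActiveRecord reload the container row under SELECT ... FOR UPDATE, so two dispatchers racing to flip the same container from Queued to Locked serialize on the database row and the loser sees the winner's change. A minimal sketch of the same pattern in isolation:

    container = Container.find_by_uuid('zzzzz-dz642-queuedcontainer')
    container.with_lock do
      # Row is locked and reloaded here; a competing transaction blocks
      # until this block commits or rolls back.
      container.update_attributes!(state: Container::Locked)
    end
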
index f1ef2d824054f3a0dbe3bb338a966d3a00341b10..67963388639f9fb352ca42b0abff2a05e86d140c 100644 (file)
@@ -1,4 +1,5 @@
 class Arvados::V1::JobsController < ApplicationController
+  accept_attribute_as_json :components, Hash
   accept_attribute_as_json :script_parameters, Hash
   accept_attribute_as_json :runtime_constraints, Hash
   accept_attribute_as_json :tasks_summary, Hash
@@ -143,7 +144,7 @@ class Arvados::V1::JobsController < ApplicationController
               end
             end
           end
-          job_queue = Job.queue
+          job_queue = Job.queue.select(:uuid)
           n_queued_before_me = 0
           job_queue.each do |j|
             break if j.uuid == @job.uuid
@@ -152,7 +153,7 @@ class Arvados::V1::JobsController < ApplicationController
           yield "#{db_current_time}" \
             " job #{@job.uuid}" \
             " queue_position #{n_queued_before_me}" \
-            " queue_size #{job_queue.size}" \
+            " queue_size #{job_queue.count}" \
             " nodes_idle #{nodes_in_state[:idle]}" \
             " nodes_alloc #{nodes_in_state[:alloc]}\n"
           last_ack_at = db_current_time
index 57d3ad02d748d7ee41540c0320ef8f068a8d375a..d8c04a1adbfcd0512bdbf38a4225081709ca2de8 100644 (file)
@@ -31,7 +31,7 @@ class ArvadosApiToken
     supplied_token =
       params["api_token"] ||
       params["oauth_token"] ||
-      env["HTTP_AUTHORIZATION"].andand.match(/OAuth2 ([a-z0-9]+)/).andand[1]
+      env["HTTP_AUTHORIZATION"].andand.match(/OAuth2 ([a-zA-Z0-9]+)/).andand[1]
     if supplied_token
       api_client_auth = ApiClientAuthorization.
         includes(:api_client, :user).
index 34600d7a25a8c716bd9d1fd6ec49cea052dc0c58..41d5b27093c3ab55c296f7a592b9defb7e25d6dc 100644 (file)
@@ -49,11 +49,12 @@ class Blob
     end
     timestamp_hex = timestamp.to_s(16)
     # => "53163cb4"
+    blob_signature_ttl = Rails.configuration.blob_signature_ttl.to_s(16)
 
     # Generate a signature.
     signature =
       generate_signature((opts[:key] or Rails.configuration.blob_signing_key),
-                         blob_hash, opts[:api_token], timestamp_hex)
+                         blob_hash, opts[:api_token], timestamp_hex, blob_signature_ttl)
 
     blob_locator + '+A' + signature + '@' + timestamp_hex
   end
@@ -96,10 +97,11 @@ class Blob
     if timestamp.to_i(16) < (opts[:now] or db_current_time.to_i)
       raise Blob::InvalidSignatureError.new 'Signature expiry time has passed.'
     end
+    blob_signature_ttl = Rails.configuration.blob_signature_ttl.to_s(16)
 
     my_signature =
       generate_signature((opts[:key] or Rails.configuration.blob_signing_key),
-                         blob_hash, opts[:api_token], timestamp)
+                         blob_hash, opts[:api_token], timestamp, blob_signature_ttl)
 
     if my_signature != given_signature
       raise Blob::InvalidSignatureError.new 'Signature is invalid.'
@@ -108,10 +110,11 @@ class Blob
     true
   end
 
-  def self.generate_signature key, blob_hash, api_token, timestamp
+  def self.generate_signature key, blob_hash, api_token, timestamp, blob_signature_ttl
     OpenSSL::HMAC.hexdigest('sha1', key,
                             [blob_hash,
                              api_token,
-                             timestamp].join('@'))
+                             timestamp,
+                             blob_signature_ttl].join('@'))
   end
 end
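
Adding blob_signature_ttl to the HMAC input ties each signature to the TTL in force when it was generated, so keepstore and the API server must be configured with the same value (see the blob_signature_ttl note in application.default.yml below). A sketch of what sign_locator now computes, using the names from the code above:

    ttl_hex = Rails.configuration.blob_signature_ttl.to_s(16)   # "127500" for the 2-week default
    signature = OpenSSL::HMAC.hexdigest(
      'sha1', Rails.configuration.blob_signing_key,
      [blob_hash, api_token, timestamp_hex, ttl_hex].join('@'))
    signed_locator = "#{blob_locator}+A#{signature}@#{timestamp_hex}"
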
index 787047df68e877bc8944b3c23186cc400c3ae227..4c770083786934abdafe9461f51ee03646396415 100644 (file)
@@ -16,9 +16,12 @@ class Container < ArvadosModel
   validates :command, :container_image, :output_path, :cwd, :priority, :presence => true
   validate :validate_state_change
   validate :validate_change
+  validate :validate_lock
+  after_validation :assign_auth
   after_save :handle_completed
 
   has_many :container_requests, :foreign_key => :container_uuid, :class_name => 'ContainerRequest', :primary_key => :uuid
+  belongs_to :auth, :class_name => 'ApiClientAuthorization', :foreign_key => :auth_uuid, :primary_key => :uuid
 
   api_accessible :user, extend: :common do |t|
     t.add :command
@@ -27,6 +30,7 @@ class Container < ArvadosModel
     t.add :environment
     t.add :exit_code
     t.add :finished_at
+    t.add :locked_by_uuid
     t.add :log
     t.add :mounts
     t.add :output
@@ -36,12 +40,14 @@ class Container < ArvadosModel
     t.add :runtime_constraints
     t.add :started_at
     t.add :state
+    t.add :auth_uuid
   end
 
   # Supported states for a container
   States =
     [
      (Queued = 'Queued'),
+     (Locked = 'Locked'),
      (Running = 'Running'),
      (Complete = 'Complete'),
      (Cancelled = 'Cancelled')
@@ -49,7 +55,8 @@ class Container < ArvadosModel
 
   State_transitions = {
     nil => [Queued],
-    Queued => [Running, Cancelled],
+    Queued => [Locked, Cancelled],
+    Locked => [Queued, Running, Cancelled],
     Running => [Complete, Cancelled]
   }
 
@@ -58,16 +65,13 @@ class Container < ArvadosModel
   end
 
   def update_priority!
-    if [Queued, Running].include? self.state
+    if [Queued, Locked, Running].include? self.state
       # Update the priority of this container to the maximum priority of any of
       # its committed container requests and save the record.
-      max = 0
-      ContainerRequest.where(container_uuid: uuid).each do |cr|
-        if cr.state == ContainerRequest::Committed and cr.priority > max
-          max = cr.priority
-        end
-      end
-      self.priority = max
+      self.priority = ContainerRequest.
+        where(container_uuid: uuid,
+              state: ContainerRequest::Committed).
+        maximum('priority')
       self.save!
     end
   end
@@ -102,52 +106,100 @@ class Container < ArvadosModel
   end
 
   def validate_change
-    permitted = []
+    permitted = [:state]
 
     if self.new_record?
-      permitted.push :owner_uuid, :command, :container_image, :cwd, :environment,
-                     :mounts, :output_path, :priority, :runtime_constraints, :state
+      permitted.push(:owner_uuid, :command, :container_image, :cwd,
+                     :environment, :mounts, :output_path, :priority,
+                     :runtime_constraints)
     end
 
     case self.state
-    when Queued
-      # permit priority change only.
+    when Queued, Locked
       permitted.push :priority
 
     when Running
+      permitted.push :priority, :progress
       if self.state_changed?
-        # At point of state change, can set state and started_at
-        permitted.push :state, :started_at
-      else
-        # While running, can update priority and progress.
-        permitted.push :priority, :progress
+        permitted.push :started_at
       end
 
     when Complete
-      if self.state_changed?
-        permitted.push :state, :finished_at, :output, :log, :exit_code
-      else
-        errors.add :state, "cannot update record"
+      if self.state_was == Running
+        permitted.push :finished_at, :output, :log, :exit_code
       end
 
     when Cancelled
-      if self.state_changed?
-        if self.state_was == Running
-          permitted.push :state, :finished_at, :output, :log
-        elsif self.state_was == Queued
-          permitted.push :state, :finished_at
-        end
-      else
-        errors.add :state, "cannot update record"
+      case self.state_was
+      when Running
+        permitted.push :finished_at, :output, :log
+      when Queued, Locked
+        permitted.push :finished_at
       end
 
     else
-      errors.add :state, "invalid state"
+      # The state_transitions check will add an error message for this
+      return false
     end
 
     check_update_whitelist permitted
   end
 
+  def validate_lock
+    # If the Container is already locked by someone other than the
+    # current api_client_auth, disallow all changes -- except
+    # priority, which needs to change to reflect max(priority) of
+    # relevant ContainerRequests.
+    if locked_by_uuid_was
+      if locked_by_uuid_was != Thread.current[:api_client_authorization].uuid
+        check_update_whitelist [:priority]
+      end
+    end
+
+    if [Locked, Running].include? self.state
+      # If the Container was already locked, locked_by_uuid must not
+      # change. Otherwise, the current auth gets the lock.
+      need_lock = locked_by_uuid_was || Thread.current[:api_client_authorization].uuid
+    else
+      need_lock = nil
+    end
+
+    # The caller can provide a new value for locked_by_uuid, but only
+    # if it's exactly what we expect. This allows a caller to perform
+    # an update like {"state":"Queued","locked_by_uuid":null}.
+    if self.locked_by_uuid_changed?
+      if self.locked_by_uuid != need_lock
+        return errors.add :locked_by_uuid, "can only change to #{need_lock}"
+      end
+    end
+    self.locked_by_uuid = need_lock
+  end
+
+  def assign_auth
+    if self.auth_uuid_changed?
+      return errors.add :auth_uuid, 'is readonly'
+    end
+    if not [Locked, Running].include? self.state
+      # don't need one
+      self.auth.andand.update_attributes(expires_at: db_current_time)
+      self.auth = nil
+      return
+    elsif self.auth
+      # already have one
+      return
+    end
+    cr = ContainerRequest.
+      where('container_uuid=? and priority>0', self.uuid).
+      order('priority desc').
+      first
+    if !cr
+      return errors.add :auth_uuid, "cannot be assigned because priority <= 0"
+    end
+    self.auth = ApiClientAuthorization.
+      create!(user_id: User.find_by_uuid(cr.modified_by_user_uuid).id,
+              api_client_id: 0)
+  end
+
   def handle_completed
     # This container is finished so finalize any associated container requests
     # that are associated with this container.
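
With the new Locked state, a dispatcher's lifecycle against this model is Queued → Locked (lock acquired, auth token created), Locked → Running, then Running → Complete or Cancelled; Locked → Queued releases the lock. A hedged sketch using the queued fixture, assuming the dispatcher's own token is the current authorization:

    c = Container.find_by_uuid('zzzzz-dz642-queuedcontainer')
    c.update_attributes!(state: Container::Locked)   # locked_by_uuid set to the current token
    token = c.auth.api_token                         # auth assigned by assign_auth
    c.update_attributes!(state: Container::Running)
    c.update_attributes!(state: Container::Complete, exit_code: 0)
    # To give the container back instead of running it:
    #   c.update_attributes!(state: Container::Queued, locked_by_uuid: nil)
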
index acb751c89401424e04d03b950d5acc675dceaca8..6353132e908baa3d683ec0a9d320ff3a60d55804 100644 (file)
@@ -78,32 +78,32 @@ class ContainerRequest < ArvadosModel
     self.cwd ||= "."
   end
 
-  # Turn a container request into a container.
+  # Create a new container (or find an existing one) to satisfy this
+  # request.
   def resolve
-    # In the future this will do things like resolve symbolic git and keep
-    # references to content addresses.
-    Container.create!({ :command => self.command,
-                        :container_image => self.container_image,
-                        :cwd => self.cwd,
-                        :environment => self.environment,
-                        :mounts => self.mounts,
-                        :output_path => self.output_path,
-                        :runtime_constraints => self.runtime_constraints })
+    # TODO: resolve symbolic git and keep references to content
+    # addresses.
+    c = act_as_system_user do
+      Container.create!(command: self.command,
+                        container_image: self.container_image,
+                        cwd: self.cwd,
+                        environment: self.environment,
+                        mounts: self.mounts,
+                        output_path: self.output_path,
+                        runtime_constraints: self.runtime_constraints)
+    end
+    self.container_uuid = c.uuid
   end
 
   def set_container
-    if self.container_uuid_changed?
-      if not current_user.andand.is_admin and not self.container_uuid.nil?
-        errors.add :container_uuid, "can only be updated to nil."
-      end
-    else
-      if self.state_changed?
-        if self.state == Committed and (self.state_was == Uncommitted or self.state_was.nil?)
-          act_as_system_user do
-            self.container_uuid = self.resolve.andand.uuid
-          end
-        end
-      end
+    if (container_uuid_changed? and
+        not current_user.andand.is_admin and
+        not container_uuid.nil?)
+      errors.add :container_uuid, "can only be updated to nil."
+      return false
+    end
+    if state_changed? and state == Committed and container_uuid.nil?
+      resolve
     end
   end
 
@@ -158,16 +158,14 @@ class ContainerRequest < ArvadosModel
   end
 
   def update_priority
-    if [Committed, Final].include? self.state and (self.state_changed? or
-                                                   self.priority_changed? or
-                                                   self.container_uuid_changed?)
-      [self.container_uuid_was, self.container_uuid].each do |cuuid|
-        unless cuuid.nil?
-          c = Container.find_by_uuid cuuid
-          act_as_system_user do
-            c.update_priority!
-          end
-        end
+    if self.state_changed? or
+        self.priority_changed? or
+        self.container_uuid_changed?
+      act_as_system_user do
+        Container.
+          where('uuid in (?)',
+                [self.container_uuid_was, self.container_uuid].compact).
+          map(&:update_priority!)
       end
     end
   end
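
Because resolve now runs under act_as_system_user and records the new container's UUID on the request, committing a request is all it takes to get a container queued and its priority propagated. A rough sketch, with hypothetical attribute values modeled on the container_requests fixture added below:

    cr = ContainerRequest.create!(command: ['echo', 'hello'],
                                  container_image: 'test',
                                  output_path: 'test',
                                  priority: 1)
    cr.update_attributes!(state: ContainerRequest::Committed)
    cr.container_uuid                                     # => UUID of the Container built by resolve
    Container.find_by_uuid(cr.container_uuid).priority    # => 1, via update_priority
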
index 0e857ad15c22101d0d93e4b497479f3e14a6055d..6105b5f35d78636b0c9471b4f1315ec2236d77d6 100644 (file)
@@ -27,7 +27,7 @@ class Group < ArvadosModel
   def invalidate_permissions_cache
     # Ensure a new group can be accessed by the appropriate users
     # immediately after being created.
-    User.invalidate_permissions_cache
+    User.invalidate_permissions_cache db_current_time.to_i
   end
 
   def assign_name
index 6c24293334f6d4cc5af371b1b2b9a0d370466530..0ed53535778335d11b2b12d3007058e9ad76adfc 100644 (file)
@@ -2,6 +2,7 @@ class Job < ArvadosModel
   include HasUuid
   include KindAndEtag
   include CommonApiTemplate
+  serialize :components, Hash
   attr_protected :arvados_sdk_version, :docker_image_locator
   serialize :script_parameters, Hash
   serialize :runtime_constraints, Hash
@@ -52,6 +53,7 @@ class Job < ArvadosModel
     t.add :queue_position
     t.add :node_uuids
     t.add :description
+    t.add :components
   end
 
   # Supported states for a job
@@ -78,12 +80,13 @@ class Job < ArvadosModel
   end
 
   def queue_position
-    Job::queue.each_with_index do |job, index|
-      if job[:uuid] == self.uuid
-        return index
-      end
-    end
-    nil
+    # We used to report this accurately, but the implementation made queue
+    # API requests O(n**2) for the size of the queue.  See #8800.
+    # We've soft-disabled it because it's not clear we even want this
+    # functionality: now that we have Node Manager with support for multiple
+    # node sizes, "queue position" tells you very little about when a job will
+    # run.
+    state == Queued ? 0 : nil
   end
 
   def self.running
@@ -92,8 +95,7 @@ class Job < ArvadosModel
   end
 
   def lock locked_by_uuid
-    transaction do
-      self.reload
+    with_lock do
       unless self.state == Queued and self.is_locked_by_uuid.nil?
         raise AlreadyLockedError
       end
@@ -238,7 +240,8 @@ class Job < ArvadosModel
           output_changed? or
           log_changed? or
           tasks_summary_changed? or
-          state_changed?
+          state_changed? or
+          components_changed?
         logger.warn "User #{current_user.uuid if current_user} tried to change protected job attributes on locked #{self.class.to_s} #{uuid_was}"
         return false
       end
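
components is a serialized Hash mapping component names to the UUIDs of the jobs or pipeline instances the job spawned (see the running_job_with_components fixture below), and like the other protected attributes it can only be changed by whoever holds the job lock. A short sketch, assuming the caller is the locking user as in the functional tests:

    job = Job.find_by_uuid('zzzzz-8i9sb-with2components')
    job.components
    # => {"component1"=>"zzzzz-8i9sb-jyq01m7in1jlofj",
    #     "component2"=>"zzzzz-d1hrv-partdonepipelin"}
    job.update_attributes(components: {'component1' => 'zzzzz-8i9sb-jyq01m7in1jlofj'})
    job.components.keys   # => ["component1"]
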
index d9b8f6f09b191230b6635fd8cd0b1cadf9ccf5ab..24872b21ec7163852cf86d0c0ceb3d3b41f13608 100644 (file)
@@ -66,7 +66,7 @@ class Link < ArvadosModel
       # permissions for head_uuid and tail_uuid, and invalidate the
       # cache for only those users. (This would require a browseable
       # cache.)
-      User.invalidate_permissions_cache
+      User.invalidate_permissions_cache db_current_time.to_i
     end
   end
 
index 2200d050990809d04f3e5fdbe088a1af7621ba01..553a3be5aebbd703690f54156a6c0efb22dbe528 100644 (file)
@@ -123,8 +123,13 @@ class User < ArvadosModel
     true
   end
 
-  def self.invalidate_permissions_cache
-    Rails.cache.delete_matched(/^groups_for_user_/)
+  def self.invalidate_permissions_cache(timestamp=nil)
+    if Rails.configuration.async_permissions_update
+      timestamp = DbCurrentTime::db_current_time.to_i if timestamp.nil?
+      connection.execute "NOTIFY invalidate_permissions_cache, '#{timestamp}'"
+    else
+      Rails.cache.delete_matched(/^groups_for_user_/)
+    end
   end
 
   # Return a hash of {group_uuid: perm_hash} where perm_hash[:read]
@@ -134,8 +139,7 @@ class User < ArvadosModel
   # The permission graph is built by repeatedly enumerating all
   # permission links reachable from self.uuid, and then calling
   # search_permissions
-  def group_permissions
-    Rails.cache.fetch "groups_for_user_#{self.uuid}" do
+  def calculate_group_permissions
       permissions_from = {}
       todo = {self.uuid => true}
       done = {}
@@ -182,8 +186,27 @@ class User < ArvadosModel
           end
         end
       end
-      search_permissions(self.uuid, permissions_from)
+      perms = search_permissions(self.uuid, permissions_from)
+      Rails.cache.write "groups_for_user_#{self.uuid}", perms
+      perms
+  end
+
+  # Return a hash of {group_uuid: perm_hash} where perm_hash[:read]
+  # and perm_hash[:write] are true if this user can read and write
+  # objects owned by group_uuid.
+  def group_permissions
+    r = Rails.cache.read "groups_for_user_#{self.uuid}"
+    if r.nil?
+      if Rails.configuration.async_permissions_update
+        while r.nil?
+          sleep(0.1)
+          r = Rails.cache.read "groups_for_user_#{self.uuid}"
+        end
+      else
+        r = calculate_group_permissions
+      end
     end
+    r
   end
 
   def self.setup(user, openid_prefix, repo_name=nil, vm_uuid=nil)
index 66916836c23fb9538281400ad84b1f1f672247e2..f1c4dd02866049368442febaa22ba7ed2b10245f 100644 (file)
@@ -27,6 +27,11 @@ common:
   # generate permission signatures for Keep locators. It must be
   # identical to the permission key given to Keep. IMPORTANT: This is
   # a site secret. It should be at least 50 characters.
+  #
+  # Modifying blob_signing_key will invalidate all existing
+  # signatures, which can cause programs to fail (e.g., arv-put,
+  # arv-get, and Crunch jobs).  To avoid errors, rotate keys only when
+  # no such processes are running.
   blob_signing_key: ~
 
   # These settings are provided by your OAuth2 provider (e.g.,
@@ -155,12 +160,12 @@ common:
   # still has permission) the client can retrieve the collection again
   # to get fresh signatures.
   #
-  # Datamanager considers an unreferenced block older than this to be
-  # eligible for garbage collection. Therefore, it should never be
-  # smaller than the corresponding value used by any local keepstore
-  # service (see keepstore -blob-signature-ttl flag). This rule
-  # prevents datamanager from trying to garbage-collect recently
-  # written blocks while clients are still holding valid signatures.
+  # This must be exactly equal to the -blob-signature-ttl flag used by
+  # keepstore servers.  Otherwise, reading data blocks and saving
+  # collections will fail with HTTP 403 permission errors.
+  #
+  # Modifying blob_signature_ttl invalidates existing signatures; see
+  # blob_signing_key note above.
   #
   # The default is 2 weeks.
   blob_signature_ttl: 1209600
@@ -340,6 +345,12 @@ common:
 
   crunch_log_partial_line_throttle_period: 5
 
+  # Enable asynchronous permission graph rebuild.  Must run
+  # script/permission-updater.rb as a separate process.  When the permission
+  # cache is invalidated, the background process will update the permission
+  # graph cache.  This feature is experimental!
+  async_permissions_update: false
+
 development:
   force_ssl: false
   cache_classes: false
diff --git a/services/api/config/initializers/fix_www_decode.rb b/services/api/config/initializers/fix_www_decode.rb
new file mode 100644 (file)
index 0000000..bc50c12
--- /dev/null
@@ -0,0 +1,16 @@
+module URI
+  if Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.2')
+    # Rack uses the standard library method URI.decode_www_form_component to
+    # process parameters.  This method first validates the string with a
+    # regular expression, and then decodes it using another regular expression.
+    # Ruby 2.1 and earlier has a bug in the validation; the regular
+    # expression that is used generates many backtracking points, which results
+    # in exponential memory growth when matching large strings.  The fix is to
+    # monkey-patch the version of the method from Ruby 2.2 which checks that
+    # the string is not invalid instead of checking it is valid.
+    def self.decode_www_form_component(str, enc=Encoding::UTF_8)
+      raise ArgumentError, "invalid %-encoding (#{str})" if /%(?!\h\h)/ =~ str
+      str.b.gsub(/\+|%\h\h/, TBLDECWWWCOMP_).force_encoding(enc)
+    end
+  end
+end
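
The patched method only has to prove the string is not invalid (a % that is not followed by two hex digits), instead of validating the whole string with a heavily backtracking expression. Its observable behavior is unchanged:

    URI.decode_www_form_component('a+b%20c')   # => "a b c"
    URI.decode_www_form_component('100%')
    # => ArgumentError: invalid %-encoding (100%)
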
index c85a3fc57af20d2461516c384623a452a86dbdda..ed8f8d89af9c2d429dfe661633df2ec36a0484e9 100644 (file)
@@ -15,6 +15,7 @@ Server::Application.routes.draw do
     namespace :v1 do
       resources :api_client_authorizations do
         post 'create_system_auth', on: :collection
+        get 'current', on: :collection
       end
       resources :api_clients
       resources :authorized_keys
@@ -28,7 +29,9 @@ Server::Application.routes.draw do
       end
       resources :humans
       resources :job_tasks
-      resources :containers
+      resources :containers do
+        get 'auth', on: :member
+      end
       resources :container_requests
       resources :jobs do
         get 'queue', on: :collection
diff --git a/services/api/db/migrate/20160324144017_add_components_to_job.rb b/services/api/db/migrate/20160324144017_add_components_to_job.rb
new file mode 100644 (file)
index 0000000..9595d7f
--- /dev/null
@@ -0,0 +1,11 @@
+class AddComponentsToJob < ActiveRecord::Migration
+  def up
+    add_column :jobs, :components, :text
+  end
+
+  def down
+    if column_exists?(:jobs, :components)
+      remove_column :jobs, :components
+    end
+  end
+end
diff --git a/services/api/db/migrate/20160506175108_add_auths_to_container.rb b/services/api/db/migrate/20160506175108_add_auths_to_container.rb
new file mode 100644 (file)
index 0000000..d714a49
--- /dev/null
@@ -0,0 +1,6 @@
+class AddAuthsToContainer < ActiveRecord::Migration
+  def change
+    add_column :containers, :auth_uuid, :string
+    add_column :containers, :locked_by_uuid, :string
+  end
+end
diff --git a/services/api/db/migrate/20160509143250_add_auth_and_lock_to_container_index.rb b/services/api/db/migrate/20160509143250_add_auth_and_lock_to_container_index.rb
new file mode 100644 (file)
index 0000000..4329ac0
--- /dev/null
@@ -0,0 +1,19 @@
+class AddAuthAndLockToContainerIndex < ActiveRecord::Migration
+  Columns_were = ["uuid", "owner_uuid", "modified_by_client_uuid", "modified_by_user_uuid", "state", "log", "cwd", "output_path", "output", "container_image"]
+  Columns = Columns_were + ["auth_uuid", "locked_by_uuid"]
+  def up
+    begin
+      remove_index :containers, :name => 'containers_search_index'
+    rescue
+    end
+    add_index(:containers, Columns, name: "containers_search_index")
+  end
+
+  def down
+    begin
+      remove_index :containers, :name => 'containers_search_index'
+    rescue
+    end
+    add_index(:containers, Columns_were, name: "containers_search_index")
+  end
+end
index e482e6e607b4141bbbc00f9b70352852be62ac90..4bf4a173bd9d1c31e04d0f7517c1927baf9f3ff2 100644 (file)
@@ -339,7 +339,9 @@ CREATE TABLE containers (
     progress double precision,
     priority integer,
     updated_at timestamp without time zone NOT NULL,
-    exit_code integer
+    exit_code integer,
+    auth_uuid character varying(255),
+    locked_by_uuid character varying(255)
 );
 
 
@@ -536,7 +538,8 @@ CREATE TABLE jobs (
     priority integer DEFAULT 0 NOT NULL,
     description character varying(524288),
     state character varying(255),
-    arvados_sdk_version character varying(255)
+    arvados_sdk_version character varying(255),
+    components text
 );
 
 
@@ -1471,7 +1474,7 @@ CREATE INDEX container_requests_search_index ON container_requests USING btree (
 -- Name: containers_search_index; Type: INDEX; Schema: public; Owner: -; Tablespace: 
 --
 
-CREATE INDEX containers_search_index ON containers USING btree (uuid, owner_uuid, modified_by_client_uuid, modified_by_user_uuid, state, log, cwd, output_path, output, container_image);
+CREATE INDEX containers_search_index ON containers USING btree (uuid, owner_uuid, modified_by_client_uuid, modified_by_user_uuid, state, log, cwd, output_path, output, container_image, auth_uuid, locked_by_uuid);
 
 
 --
@@ -2580,4 +2583,10 @@ INSERT INTO schema_migrations (version) VALUES ('20151229214707');
 
 INSERT INTO schema_migrations (version) VALUES ('20160208210629');
 
-INSERT INTO schema_migrations (version) VALUES ('20160209155729');
\ No newline at end of file
+INSERT INTO schema_migrations (version) VALUES ('20160209155729');
+
+INSERT INTO schema_migrations (version) VALUES ('20160324144017');
+
+INSERT INTO schema_migrations (version) VALUES ('20160506175108');
+
+INSERT INTO schema_migrations (version) VALUES ('20160509143250');
\ No newline at end of file
index 2e78612fc2d8b0ea3883cbb964d73c3e443e9c21..fbd4ef5f0c67933a7cc703d9f532c94fd601fc3d 100644 (file)
@@ -124,12 +124,18 @@ module CurrentApiClient
   end
 
   def act_as_user user
+    #auth_was = Thread.current[:api_client_authorization]
     user_was = Thread.current[:user]
     Thread.current[:user] = user
+    #Thread.current[:api_client_authorization] = ApiClientAuthorization.
+    #  where('user_id=? and scopes is null', user.id).
+    #  order('expires_at desc').
+    #  first
     begin
       yield
     ensure
       Thread.current[:user] = user_was
+      #Thread.current[:api_client_authorization] = auth_was
     end
   end
 
index ac53876122d6b2e74b0d9fed85a56308308465b4..9bf95f57356e4eef7389b585917e26d8ec1973c9 100644 (file)
@@ -162,7 +162,7 @@ class EventBus
     begin
       begin
         # Parse event data as JSON
-        p = (Oj.load event.data).symbolize_keys
+        p = (Oj.strict_load event.data).symbolize_keys
         filter = Filter.new(p)
       rescue Oj::Error => e
         ws.send ({status: 400, message: "malformed request"}.to_json)
index d7b9bb7513899d477906738d09a5a23bc8e6095f..5b22274d07781325276b5df152037a5b2a13dc61 100644 (file)
@@ -17,7 +17,7 @@ module LoadParam
       @where = params[:where]
     elsif params[:where].is_a? String
       begin
-        @where = Oj.load(params[:where])
+        @where = Oj.strict_load(params[:where])
         raise unless @where.is_a? Hash
       rescue
         raise ArgumentError.new("Could not parse \"where\" param as an object")
@@ -33,7 +33,7 @@ module LoadParam
       @filters += params[:filters]
     elsif params[:filters].is_a? String and !params[:filters].empty?
       begin
-        f = Oj.load params[:filters]
+        f = Oj.strict_load params[:filters]
         if not f.nil?
           raise unless f.is_a? Array
           @filters += f
@@ -72,7 +72,7 @@ module LoadParam
       (case params[:order]
        when String
          if params[:order].starts_with? '['
-           od = Oj.load(params[:order])
+           od = Oj.strict_load(params[:order])
            raise unless od.is_a? Array
            od
          else
@@ -142,7 +142,7 @@ module LoadParam
       @select = params[:select]
     when String
       begin
-        @select = Oj.load params[:select]
+        @select = Oj.strict_load params[:select]
         raise unless @select.is_a? Array or @select.nil?
       rescue
         raise ArgumentError.new("Could not parse \"select\" param as an array")
index 350c3802fc60e606fd0dba703429ca7dc98f847e..caf62c721f4854fb30d468c327623acc2f50de56 100644 (file)
@@ -126,6 +126,8 @@ module RecordFilters
             end
           end
           cond_out << cond.join(' OR ')
+        else
+          raise ArgumentError.new("Invalid operator '#{operator}'")
         end
       end
       conds_out << cond_out.join(' OR ') if cond_out.any?
index a81f9924f01aa182bf35efc2e48dd326b3b20942..8fccd0f45c36416e72034a06e8e3ce3880f7aa04 100644 (file)
@@ -2,7 +2,7 @@ module WhitelistUpdate
   def check_update_whitelist permitted_fields
     attribute_names.each do |field|
       if not permitted_fields.include? field.to_sym and self.send((field.to_s + "_changed?").to_sym)
-        errors.add field, "illegal update of field"
+        errors.add field, "cannot be modified in this state"
       end
     end
   end
@@ -10,7 +10,7 @@ module WhitelistUpdate
   def validate_state_change
     if self.state_changed?
       unless state_transitions[self.state_was].andand.include? self.state
-        errors.add :state, "invalid state change from #{self.state_was} to #{self.state}"
+        errors.add :state, "cannot change from #{self.state_was} to #{self.state}"
         return false
       end
     end
diff --git a/services/api/script/permission-updater.rb b/services/api/script/permission-updater.rb
new file mode 100755 (executable)
index 0000000..f7d672d
--- /dev/null
@@ -0,0 +1,44 @@
+#!/usr/bin/env ruby
+
+ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
+require File.dirname(__FILE__) + '/../config/boot'
+require File.dirname(__FILE__) + '/../config/environment'
+include DbCurrentTime
+
+def update_permissions
+  timestamp = DbCurrentTime::db_current_time.to_i
+  Rails.logger.info "Begin updating permission cache"
+  User.all.each do |u|
+    u.calculate_group_permissions
+  end
+  Rails.cache.write "last_updated_permissions", timestamp
+  Rails.logger.info "Permission cache updated"
+end
+
+ActiveRecord::Base.connection_pool.with_connection do |connection|
+  conn = connection.instance_variable_get(:@connection)
+  begin
+    conn.async_exec "LISTEN invalidate_permissions_cache"
+
+    # Initial refresh of permissions graph
+    update_permissions
+
+    while true
+      # wait_for_notify will block until there is a change
+      # notification from Postgres about the permission cache,
+      # and then rebuild the permission cache.
+      conn.wait_for_notify do |channel, pid, payload|
+        last_updated = Rails.cache.read("last_updated_permissions")
+        Rails.logger.info "Got notify #{payload} last update #{last_updated}"
+        if last_updated.nil? || last_updated.to_i <= payload.to_i
+          update_permissions
+        end
+      end
+    end
+  ensure
+    # Don't want the connection to still be listening once we return
+    # it to the pool - could result in weird behavior for the next
+    # thread to check it out.
+    conn.async_exec "UNLISTEN *"
+  end
+end
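
This listener pairs with the NOTIFY that User.invalidate_permissions_cache now issues; the payload is just an integer timestamp, so notifications whose timestamp is older than the last completed rebuild are skipped. The sending side, as a standalone sketch, is the same SQL the model executes:

    timestamp = Time.now.to_i
    ActiveRecord::Base.connection.execute(
      "NOTIFY invalidate_permissions_cache, '#{timestamp}'")
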
index f99a9fb941f1b26f44d2d4b4035a28afd84fbc08..de14838186b00e1a3aebde728387882ef96f7c2e 100644 (file)
@@ -222,6 +222,13 @@ job_reader:
   api_token: e99512cdc0f3415c2428b9758f33bdfb07bc3561b00e86e7e6
   expires_at: 2038-01-01 00:00:00
 
+job_reader2:
+  uuid: zzzzz-gj3su-jobreader2auth1
+  api_client: untrusted
+  user: job_reader2
+  api_token: jobreader2415c2428b9758f33bdfb07bc3561b0jobreader2
+  expires_at: 2038-01-01 00:00:00
+
 active_no_prefs:
   uuid: zzzzz-gj3su-307z32aux8dg2s1
   api_client: untrusted
@@ -271,3 +278,9 @@ fuse:
   api_token: 4nagbkv8eap0uok7pxm72nossq5asihls3yn5p4xmvqx5t5e7p
   expires_at: 2038-01-01 00:00:00
 
+dispatch1:
+  uuid: zzzzz-gj3su-k9dvestay1plssr
+  api_client: untrusted
+  user: system_user
+  api_token: kwi8oowusvbutahacwk2geulqewy5oaqmpalczfna4b6bb0hfw
+  expires_at: 2038-01-01 00:00:00
diff --git a/services/api/test/fixtures/container_requests.yml b/services/api/test/fixtures/container_requests.yml
new file mode 100644 (file)
index 0000000..c9f3427
--- /dev/null
@@ -0,0 +1,13 @@
+queued:
+  owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  state: Committed
+  priority: 1
+  created_at: 2016-01-11 11:11:11.111111111 Z
+  updated_at: 2016-01-11 11:11:11.111111111 Z
+  modified_at: 2016-01-11 11:11:11.111111111 Z
+  modified_by_user_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  container_image: test
+  cwd: test
+  output_path: test
+  command: ["echo", "hello"]
+  container_uuid: zzzzz-dz642-queuedcontainer
index 22004b401720366e8ab61d15cdd414b802d683b5..b804c804a4c25b17505722ba341b4bd11c6e5269 100644 (file)
@@ -1,6 +1,6 @@
 queued:
   uuid: zzzzz-dz642-queuedcontainer
-  owner_uuid: zzzzz-tpzed-d9tiejq69daie8f
+  owner_uuid: zzzzz-tpzed-000000000000000
   state: Queued
   priority: 1
   created_at: 2016-01-11 11:11:11.111111111 Z
@@ -10,10 +10,13 @@ queued:
   output: test
   output_path: test
   command: ["echo", "hello"]
+  runtime_constraints:
+    ram: 12000000000
+    vcpus: 4
 
 completed:
   uuid: zzzzz-dz642-compltcontainer
-  owner_uuid: zzzzz-tpzed-d9tiejq69daie8f
+  owner_uuid: zzzzz-tpzed-000000000000000
   state: Complete
   priority: 1
   created_at: 2016-01-11 11:11:11.111111111 Z
@@ -23,3 +26,6 @@ completed:
   output: test
   output_path: test
   command: ["echo", "hello"]
+  runtime_constraints:
+    ram: 12000000000
+    vcpus: 4
diff --git a/services/api/test/fixtures/job_tasks.yml b/services/api/test/fixtures/job_tasks.yml
new file mode 100644 (file)
index 0000000..4aded53
--- /dev/null
@@ -0,0 +1,11 @@
+running_job_task_1:
+  uuid: zzzzz-ot0gb-runningjobtask1
+  owner_uuid: zzzzz-j7d0g-v955i6s2oi1cbso
+  created_at: <%= 3.minute.ago.to_s(:db) %>
+  job_uuid: zzzzz-8i9sb-with2components
+
+running_job_task_2:
+  uuid: zzzzz-ot0gb-runningjobtask2
+  owner_uuid: zzzzz-j7d0g-v955i6s2oi1cbso
+  created_at: <%= 3.minute.ago.to_s(:db) %>
+  job_uuid: zzzzz-8i9sb-with2components
index 12493e35514714dfdffc1e6f5aeabdf09d0dd223..d0c22d305954a2e832d3e8c4dac43725a982db26 100644 (file)
@@ -499,6 +499,35 @@ job_in_publicly_accessible_project_but_other_objects_elsewhere:
   log: zzzzz-4zz18-fy296fx3hot09f7
   output: zzzzz-4zz18-bv31uwvy3neko21
 
+running_job_with_components:
+  uuid: zzzzz-8i9sb-with2components
+  owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  cancelled_at: ~
+  cancelled_by_user_uuid: ~
+  cancelled_by_client_uuid: ~
+  created_at: <%= 3.minute.ago.to_s(:db) %>
+  started_at: <%= 3.minute.ago.to_s(:db) %>
+  finished_at: ~
+  script: hash
+  repository: active/foo
+  script_version: 1de84a854e2b440dc53bf42f8548afa4c17da332
+  running: true
+  success: ~
+  output: ~
+  priority: 0
+  log: ~
+  is_locked_by_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  tasks_summary:
+    failed: 0
+    todo: 3
+    running: 1
+    done: 1
+  runtime_constraints: {}
+  state: Running
+  components:
+    component1: zzzzz-8i9sb-jyq01m7in1jlofj
+    component2: zzzzz-d1hrv-partdonepipelin
+
 # Test Helper trims the rest of the file
 
 # Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
index 7ed7f6bcf35636ad4ccc572b2da2e3044f6a277a..7d9aea5b4598a9bf4e0b37c15bb942e15d64cc1a 100644 (file)
@@ -717,6 +717,51 @@ job_reader_can_read_foo_repo:
   tail_uuid: zzzzz-tpzed-905b42d1dd4a354
   head_uuid: zzzzz-s0uqq-382brsig8rp3666
 
+job_reader2_can_read_job_with_components:
+  # Permission link giving job_reader2 permission
+  # to read running_job_with_components
+  uuid: zzzzz-o0j2j-jobcomps4jobrdr
+  owner_uuid: zzzzz-tpzed-000000000000000
+  created_at: 2014-06-13 20:42:26 -0800
+  modified_by_client_uuid: zzzzz-tpzed-000000000000000
+  modified_by_user_uuid: zzzzz-tpzed-000000000000000
+  modified_at: 2014-06-13 20:42:26 -0800
+  updated_at: 2014-06-13 20:42:26 -0800
+  link_class: permission
+  name: can_read
+  tail_uuid: zzzzz-tpzed-readjobwithcomp
+  head_uuid: zzzzz-8i9sb-with2components
+
+job_reader2_can_read_pipeline_from_job_with_components:
+  # Permission link giving job_reader2 permission
+  # to read the pipeline instance referenced by running_job_with_components
+  uuid: zzzzz-o0j2j-pi4comps4jobrdr
+  owner_uuid: zzzzz-tpzed-000000000000000
+  created_at: 2014-06-13 20:42:26 -0800
+  modified_by_client_uuid: zzzzz-tpzed-000000000000000
+  modified_by_user_uuid: zzzzz-tpzed-000000000000000
+  modified_at: 2014-06-13 20:42:26 -0800
+  updated_at: 2014-06-13 20:42:26 -0800
+  link_class: permission
+  name: can_read
+  tail_uuid: zzzzz-tpzed-readjobwithcomp
+  head_uuid: zzzzz-d1hrv-partdonepipelin
+
+job_reader2_can_read_first_job_from_pipeline_from_job_with_components:
+  # Permission link giving job_reader2 permission
+  # to read the first job in the pipeline referenced by running_job_with_components
+  uuid: zzzzz-o0j2j-job4pi4j4jobrdr
+  owner_uuid: zzzzz-tpzed-000000000000000
+  created_at: 2014-06-13 20:42:26 -0800
+  modified_by_client_uuid: zzzzz-tpzed-000000000000000
+  modified_by_user_uuid: zzzzz-tpzed-000000000000000
+  modified_at: 2014-06-13 20:42:26 -0800
+  updated_at: 2014-06-13 20:42:26 -0800
+  link_class: permission
+  name: can_read
+  tail_uuid: zzzzz-tpzed-readjobwithcomp
+  head_uuid: zzzzz-8i9sb-cjs4pklxxjykqqq
+
 baz_collection_name_in_asubproject:
   uuid: zzzzz-o0j2j-bazprojectname2
   owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
index 41a7fc9720e77292721800004babca120ae45480..f51b78aa94ec52345e44cc6bd4e301aa7002aaf7 100644 (file)
@@ -42,6 +42,8 @@ has_component_with_completed_jobs:
   state: Complete
   uuid: zzzzz-d1hrv-i3e77t9z5y8j9cc
   owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  started_at: <%= 10.minute.ago.to_s(:db) %>
+  finished_at: <%= 9.minute.ago.to_s(:db) %>
   components:
    foo:
     script: foo
index b15ada12b426d648b564faf4a36604e9eb6b6128..7104af26f7e43b610260218d7e0ca5243058744d 100644 (file)
@@ -209,6 +209,22 @@ job_reader:
       role: Computational biologist
     getting_started_shown: 2015-03-26 12:34:56.789000000 Z
 
+job_reader2:
+  owner_uuid: zzzzz-tpzed-000000000000000
+  uuid: zzzzz-tpzed-readjobwithcomp
+  email: job_reader2@arvados.local
+  first_name: Job
+  last_name: Reader2
+  identity_url: https://job_reader2.openid.local
+  is_active: true
+  is_admin: false
+  username: jobreader2
+  prefs:
+    profile:
+      organization: example.com
+      role: Computational biologist
+    getting_started_shown: 2015-03-26 12:34:56.789000000 Z
+
 active_no_prefs:
   owner_uuid: zzzzz-tpzed-000000000000000
   uuid: zzzzz-tpzed-a46c42d1td4aoj4
index 192e6b956dad89bb7e70dea714800a986f9574ab..37e690e0b21ccb221454d6a46ac04b4e732c2ce7 100644 (file)
@@ -38,9 +38,11 @@ class Arvados::V1::ApiClientAuthorizationsControllerTest < ActionController::Tes
     assert_response 403
   end
 
-  def assert_found_tokens(auth, search_params, *expected_tokens)
+  def assert_found_tokens(auth, search_params, expected)
     authorize_with auth
-    expected_tokens.map! { |name| api_client_authorizations(name).api_token }
+    expected_tokens = expected.map do |name|
+      api_client_authorizations(name).api_token
+    end
     get :index, search_params
     assert_response :success
     got_tokens = JSON.parse(@response.body)['items']
@@ -52,19 +54,26 @@ class Arvados::V1::ApiClientAuthorizationsControllerTest < ActionController::Tes
   # Three-tuples with auth to use, scopes to find, and expected tokens.
   # Make two tests for each tuple, one searching with where and the other
   # with filter.
-  [[:admin_trustedclient, [], :admin_noscope],
-   [:active_trustedclient, ["GET /arvados/v1/users"], :active_userlist],
+  [[:admin_trustedclient, [], [:admin_noscope]],
+   [:active_trustedclient, ["GET /arvados/v1/users"], [:active_userlist]],
    [:active_trustedclient,
     ["POST /arvados/v1/api_client_authorizations",
      "GET /arvados/v1/api_client_authorizations"],
-    :active_apitokens],
-  ].each do |auth, scopes, *expected|
+    [:active_apitokens]],
+  ].each do |auth, scopes, expected|
     test "#{auth.to_s} can find auths where scopes=#{scopes.inspect}" do
-      assert_found_tokens(auth, {where: {scopes: scopes}}, *expected)
+      assert_found_tokens(auth, {where: {scopes: scopes}}, expected)
     end
 
     test "#{auth.to_s} can find auths filtered with scopes=#{scopes.inspect}" do
-      assert_found_tokens(auth, {filters: [['scopes', '=', scopes]]}, *expected)
+      assert_found_tokens(auth, {filters: [['scopes', '=', scopes]]}, expected)
+    end
+
+    test "#{auth.to_s} offset works with filter scopes=#{scopes.inspect}" do
+      assert_found_tokens(auth, {
+                            offset: expected.length,
+                            filters: [['scopes', '=', scopes]]
+                          }, [])
     end
   end
 
@@ -112,6 +121,20 @@ class Arvados::V1::ApiClientAuthorizationsControllerTest < ActionController::Tes
       assert_response expect_list_response
       if expect_list_items
         assert_equal assigns(:objects).length, expect_list_items
+        assert_equal json_response['items_available'], expect_list_items
+      end
+    end
+
+    if expect_list_items
+      test "using '#{user}', list '#{token}' by uuid with offset" do
+        authorize_with user
+        get :index, {
+          filters: [['uuid','=',api_client_authorizations(token).uuid]],
+          offset: expect_list_items,
+        }
+        assert_response expect_list_response
+        assert_equal json_response['items_available'], expect_list_items
+        assert_equal json_response['items'].length, 0
       end
     end
 
@@ -123,6 +146,7 @@ class Arvados::V1::ApiClientAuthorizationsControllerTest < ActionController::Tes
       assert_response expect_list_response
       if expect_list_items
         assert_equal assigns(:objects).length, expect_list_items
+        assert_equal json_response['items_available'], expect_list_items
       end
     end
   end
@@ -144,4 +168,17 @@ class Arvados::V1::ApiClientAuthorizationsControllerTest < ActionController::Tes
     }
     assert_response 403
   end
+
+  test "get current token" do
+    authorize_with :active
+    get :current
+    assert_response :success
+    assert_equal(json_response['api_token'],
+                 api_client_authorizations(:active).api_token)
+  end
+
+  test "get current token, no auth" do
+    get :current
+    assert_response 401
+  end
 end
diff --git a/services/api/test/functional/arvados/v1/containers_controller_test.rb b/services/api/test/functional/arvados/v1/containers_controller_test.rb
new file mode 100644 (file)
index 0000000..d9f7d96
--- /dev/null
@@ -0,0 +1,52 @@
+require 'test_helper'
+
+class Arvados::V1::ContainersControllerTest < ActionController::TestCase
+  test 'create' do
+    authorize_with :system_user
+    post :create, {
+      container: {
+        command: ['echo', 'hello'],
+        container_image: 'test',
+        output_path: 'test',
+      },
+    }
+    assert_response :success
+  end
+
+  [Container::Queued, Container::Complete].each do |state|
+    test "cannot get auth in #{state} state" do
+      authorize_with :dispatch1
+      get :auth, id: containers(:queued).uuid
+      assert_response 403
+    end
+  end
+
+  test 'cannot get auth with wrong token' do
+    authorize_with :dispatch1
+    c = containers(:queued)
+    assert c.update_attributes(state: Container::Locked), show_errors(c)
+
+    authorize_with :system_user
+    get :auth, id: c.uuid
+    assert_response 403
+  end
+
+  test 'get auth' do
+    authorize_with :dispatch1
+    c = containers(:queued)
+    assert c.update_attributes(state: Container::Locked), show_errors(c)
+    get :auth, id: c.uuid
+    assert_response :success
+    assert_operator 32, :<, json_response['api_token'].length
+    assert_equal 'arvados#apiClientAuthorization', json_response['kind']
+  end
+
+  test 'no auth in container response' do
+    authorize_with :dispatch1
+    c = containers(:queued)
+    assert c.update_attributes(state: Container::Locked), show_errors(c)
+    get :show, id: c.uuid
+    assert_response :success
+    assert_nil json_response['auth']
+  end
+end
index 6623c726df01923b7227d33f17e6f2098cab649e..00846795b4d7f7501964d0b888ba87739ce6c9d7 100644 (file)
@@ -389,7 +389,7 @@ class Arvados::V1::GroupsControllerTest < ActionController::TestCase
     assert_response :success
 
     # verify that the user can no longer see the project
-    @counter = 0  # Reset executed action counter
+    @test_counter = 0  # Reset executed action counter
     @controller = Arvados::V1::GroupsController.new
     authorize_with :project_viewer
     get :index, filters: [['group_class', '=', 'project']], format: :json
@@ -401,7 +401,7 @@ class Arvados::V1::GroupsControllerTest < ActionController::TestCase
     assert_equal false, found_projects.include?(groups(:starred_and_shared_active_user_project).uuid)
 
     # share the project
-    @counter = 0
+    @test_counter = 0
     @controller = Arvados::V1::LinksController.new
     authorize_with :system_user
     post :create, link: {
@@ -412,7 +412,7 @@ class Arvados::V1::GroupsControllerTest < ActionController::TestCase
     }
 
     # verify that project_viewer user can now see shared project again
-    @counter = 0
+    @test_counter = 0
     @controller = Arvados::V1::GroupsController.new
     authorize_with :project_viewer
     get :index, filters: [['group_class', '=', 'project']], format: :json
index 1e1425e92b7d27057e89a335c2480b8024b0c444..601f9a7af56f3f4260724eb8eae3bc28f5014ea7 100644 (file)
@@ -433,4 +433,79 @@ class Arvados::V1::JobsControllerTest < ActionController::TestCase
     assert_equal('077ba2ad3ea24a929091a9e6ce545c93199b8e57',
                  internal_tag(json_response['uuid']))
   end
+
+  test 'get job with components' do
+    authorize_with :active
+    get :show, {id: jobs(:running_job_with_components).uuid}
+    assert_response :success
+    assert_not_nil json_response["components"]
+    assert_equal ["component1", "component2"], json_response["components"].keys
+  end
+
+  [
+    [:active, :success],
+    [:system_user, :success],
+    [:admin, 403],
+  ].each do |user, expected|
+    test "add components to job locked by active user as #{user} user and expect #{expected}" do
+      authorize_with user
+      put :update, {
+        id: jobs(:running).uuid,
+        job: {
+          components: {"component1" => "value1", "component2" => "value2"}
+        }
+      }
+      assert_response expected
+      if expected == :success
+        assert_not_nil json_response["components"]
+        keys = json_response["components"].keys
+        assert_equal ["component1", "component2"], keys
+        assert_equal "value1", json_response["components"][keys[0]]
+      end
+    end
+  end
+
+  test 'get_delete components_get again for job with components' do
+    authorize_with :active
+    get :show, {id: jobs(:running_job_with_components).uuid}
+    assert_response :success
+    assert_not_nil json_response["components"]
+    assert_equal ["component1", "component2"], json_response["components"].keys
+
+    # delete second component
+    @test_counter = 0  # Reset executed action counter
+    @controller = Arvados::V1::JobsController.new
+    put :update, {
+      id: jobs(:running_job_with_components).uuid,
+      job: {
+        components: {"component1" => "zzzzz-8i9sb-jobuuid00000001"}
+      }
+    }
+    assert_response :success
+
+    @test_counter = 0  # Reset executed action counter
+    @controller = Arvados::V1::JobsController.new
+    get :show, {id: jobs(:running_job_with_components).uuid}
+    assert_response :success
+    assert_not_nil json_response["components"]
+    assert_equal ["component1"], json_response["components"].keys
+
+    # delete all components
+    @test_counter = 0  # Reset executed action counter
+    @controller = Arvados::V1::JobsController.new
+    put :update, {
+      id: jobs(:running_job_with_components).uuid,
+      job: {
+        components: {}
+      }
+    }
+    assert_response :success
+
+    @test_counter = 0  # Reset executed action counter
+    @controller = Arvados::V1::JobsController.new
+    get :show, {id: jobs(:running_job_with_components).uuid}
+    assert_response :success
+    assert_not_nil json_response["components"]
+    assert_equal [], json_response["components"].keys
+  end
 end
index a3b03ff2ba91be22392895538e32733ce7a4bafe..5c753731df7e89b4ffffd7f5c838f1c49674cc22 100644 (file)
@@ -8,4 +8,16 @@ class ActiveSupport::TestCase
       $stderr.puts "#{t1 - t0}s #{label}"
     end
   end
+
+  def vmpeak c
+    open("/proc/self/status").each_line do |line|
+      print "Begin #{c} #{line}" if (line =~ /^VmHWM:/)
+    end
+    n = yield
+    open("/proc/self/status").each_line do |line|
+      print "End #{c} #{line}" if (line =~ /^VmHWM:/)
+    end
+    n
+  end
+
 end
index 892060a1993ea21b838db6e4cb87ea886b9cf167..a952c202cb7dbadf73fae734ca0141d000ac5cde 100644 (file)
@@ -6,6 +6,7 @@ class CollectionsApiPerformanceTest < ActionDispatch::IntegrationTest
   include ManifestExamples
 
   test "crud cycle for a collection with a big manifest" do
+    slow_test
     bigmanifest = time_block 'make example' do
       make_manifest(streams: 100,
                     files_per_stream: 100,
@@ -14,7 +15,7 @@ class CollectionsApiPerformanceTest < ActionDispatch::IntegrationTest
                     api_token: api_token(:active))
     end
     json = time_block "JSON encode #{bigmanifest.length>>20}MiB manifest" do
-      Oj.dump({manifest_text: bigmanifest})
+      Oj.dump({"manifest_text" => bigmanifest})
     end
     time_block 'create' do
       post '/arvados/v1/collections', {collection: json}, auth(:active)
@@ -37,4 +38,19 @@ class CollectionsApiPerformanceTest < ActionDispatch::IntegrationTest
       delete '/arvados/v1/collections/' + uuid, {}, auth(:active)
     end
   end
+
+  test "memory usage" do
+    slow_test
+    hugemanifest = make_manifest(streams: 1,
+                                 files_per_stream: 2000,
+                                 blocks_per_file: 200,
+                                 bytes_per_block: 2**26,
+                                 api_token: api_token(:active))
+    json = time_block "JSON encode #{hugemanifest.length>>20}MiB manifest" do
+      Oj.dump({manifest_text: hugemanifest})
+    end
+    vmpeak "post" do
+      post '/arvados/v1/collections', {collection: json}, auth(:active)
+    end
+  end
 end
index 58f2abf69709d62f63989aca05ee5cb4a26e869c..ecb2f2a05831a44a7798fd98d048a821878fd11a 100644 (file)
@@ -4,6 +4,7 @@ class DatabaseResetTest < ActionDispatch::IntegrationTest
   self.use_transactional_fixtures = false
 
   test "reset fails when Rails.env != 'test'" do
+    slow_test
     rails_env_was = Rails.env
     begin
       Rails.env = 'production'
@@ -22,6 +23,7 @@ class DatabaseResetTest < ActionDispatch::IntegrationTest
   end
 
   test "database reset doesn't break basic CRUD operations" do
+    slow_test
     active_auth = auth(:active)
     admin_auth = auth(:admin)
 
@@ -48,6 +50,7 @@ class DatabaseResetTest < ActionDispatch::IntegrationTest
   end
 
   test "roll back database change" do
+    slow_test
     active_auth = auth(:active)
     admin_auth = auth(:admin)
 
index c4d6d5eb7e6eb54eaba2c830035321656668aa33..d1b8c34a43be5df0736475996b378002e87b6baa 100644 (file)
@@ -56,7 +56,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
 
     ws_helper do |ws|
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         status = d["status"]
         ws.close
       end
@@ -75,7 +75,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         status = d["status"]
         ws.close
       end
@@ -97,7 +97,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -134,7 +134,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -174,7 +174,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -213,7 +213,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -257,7 +257,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -297,7 +297,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -323,6 +323,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
   end
 
   test "connect, subscribe, get event, unsubscribe" do
+    slow_test
     state = 1
     spec = nil
     spec_ev_uuid = nil
@@ -342,7 +343,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -372,6 +373,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
   end
 
   test "connect, subscribe, get event, unsubscribe with filter" do
+    slow_test
     state = 1
     spec = nil
     spec_ev_uuid = nil
@@ -390,7 +392,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -421,6 +423,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
 
 
   test "connect, subscribe, get event, try to unsubscribe with bogus filter" do
+    slow_test
     state = 1
     spec = nil
     spec_ev_uuid = nil
@@ -435,7 +438,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -473,6 +476,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
 
 
   test "connected, not subscribed, no event" do
+    slow_test
     authorize_with :admin
 
     ws_helper :admin, false do |ws|
@@ -493,6 +497,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
   end
 
   test "connected, not authorized to see event" do
+    slow_test
     state = 1
 
     authorize_with :admin
@@ -507,7 +512,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -531,7 +536,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         status = d["status"]
         ws.close
       end
@@ -549,7 +554,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         status = d["status"]
         ws.close
       end
@@ -567,7 +572,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         status = d["status"]
         ws.close
       end
@@ -590,7 +595,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when (1..EventBus::MAX_FILTERS)
           assert_equal 200, d["status"]
@@ -608,6 +613,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
   end
 
   test "connect, subscribe, lots of events" do
+    slow_test
     state = 1
     event_count = 0
     log_start = Log.order(:id).last.id
@@ -625,7 +631,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
@@ -664,7 +670,7 @@ class WebsocketTest < ActionDispatch::IntegrationTest
       end
 
       ws.on :message do |event|
-        d = Oj.load event.data
+        d = Oj.strict_load event.data
         case state
         when 1
           assert_equal 200, d["status"]
index 68d4bbf5af4b03349b11259f82357e917dd52cf7..ef08c726ae2432481409fd054b12122fb4b7ce25 100644 (file)
@@ -26,7 +26,7 @@ require 'mocha/mini_test'
 
 module ArvadosTestSupport
   def json_response
-    Oj.load response.body
+    Oj.strict_load response.body
   end
 
   def api_token(api_client_auth_name)
@@ -36,6 +36,10 @@ module ArvadosTestSupport
   def auth(api_client_auth_name)
     {'HTTP_AUTHORIZATION' => "OAuth2 #{api_token(api_client_auth_name)}"}
   end
+
+  def show_errors model
+    return lambda { model.errors.full_messages.inspect }
+  end
 end
 
 class ActiveSupport::TestCase
@@ -44,6 +48,10 @@ class ActiveSupport::TestCase
 
   include ArvadosTestSupport
 
+  setup do
+    Rails.logger.warn "\n\n#{'=' * 70}\n#{self.class}\##{method_name}\n#{'-' * 70}\n\n"
+  end
+
   teardown do
     Thread.current[:api_client_ip_address] = nil
     Thread.current[:api_client_authorization] = nil
@@ -102,16 +110,20 @@ class ActiveSupport::TestCase
     ArvadosApiToken.new.call("rack.input" => "",
                              "HTTP_AUTHORIZATION" => "OAuth2 #{t}")
   end
+
+  def slow_test
+    skip "RAILS_TEST_SHORT is set" unless (ENV['RAILS_TEST_SHORT'] || '').empty?
+  end
 end
 
 class ActionController::TestCase
   setup do
-    @counter = 0
+    @test_counter = 0
   end
 
   def check_counter action
-    @counter += 1
-    if @counter == 2
+    @test_counter += 1
+    if @test_counter == 2
       assert_equal 1, 2, "Multiple actions in functional test"
     end
   end
index 0794a751e097bb7b42317529177085e5572a3810..938c57bd6ea0d45f844ea9c90b5ea550e1e3b35c 100644 (file)
@@ -17,7 +17,7 @@ class BlobTest < ActiveSupport::TestCase
     'vu5wm9fpnwjyxfldw3vbo01mgjs75rgo7qioh8z8ij7jpyp8508okhgbbex3ceei' +
     '786u5rw2a9gx743dj3fgq2irk'
   @@known_signed_locator = 'acbd18db4cc2f85cedef654fccc4a4d8+3' +
-    '+A257f3f5f5f0a4e4626a18fc74bd42ec34dcb228a@7fffffff'
+    '+A89118b78732c33104a4d6231e8b5a5fa1e4301e3@7fffffff'
 
   test 'generate predictable invincible signature' do
     signed = Blob.sign_locator @@known_locator, {
@@ -118,4 +118,23 @@ class BlobTest < ActiveSupport::TestCase
       Blob.verify_signature!(@@blob_locator, api_token: @@api_token, key: @@key)
     end
   end
+
+  test 'signature changes when ttl changes' do
+    signed = Blob.sign_locator @@known_locator, {
+      api_token: @@known_token,
+      key: @@known_key,
+      expire: 0x7fffffff,
+    }
+
+    original_ttl = Rails.configuration.blob_signature_ttl
+    Rails.configuration.blob_signature_ttl = original_ttl*2
+    signed2 = Blob.sign_locator @@known_locator, {
+      api_token: @@known_token,
+      key: @@known_key,
+      expire: 0x7fffffff,
+    }
+    Rails.configuration.blob_signature_ttl = original_ttl
+
+    assert_not_equal signed, signed2
+  end
 end
index 37da5fddde9da8a0d0e34f7d292394c3ffe4cf87..1c6e4f2db2c0dfafcde3d7fba519fe8e4431cf6b 100644 (file)
@@ -18,6 +18,7 @@ class CollectionModelPerformanceTest < ActiveSupport::TestCase
 
   # "crrud" == "create read render update delete", not a typo
   test "crrud cycle for a collection with a big manifest)" do
+    slow_test
     bigmanifest = time_block 'make example' do
       make_manifest(streams: 100,
                     files_per_stream: 100,
index d0def576c3b6450b633cf08003ba5bf4ea296e2f..701147cf6576997c04bf633650b0770c193136ee 100644 (file)
@@ -306,18 +306,18 @@ class ContainerRequestTest < ActiveSupport::TestCase
     assert_equal "Committed", cr.state
 
     c = Container.find_by_uuid cr.container_uuid
-    assert_equal "Queued", c.state
+    assert_equal Container::Queued, c.state
 
     act_as_system_user do
-      c.state = "Running"
-      c.save!
+      c.update_attributes! state: Container::Locked
+      c.update_attributes! state: Container::Running
     end
 
     cr.reload
     assert_equal "Committed", cr.state
 
     act_as_system_user do
-      c.state = "Complete"
+      c.update_attributes! state: Container::Complete
       c.save!
     end
 
index 0cac6acd936332eaa3b76eb28fad3c84479b1f5a..9cc098117f68f3434b80ca7ee557f1ecfa89fa51 100644 (file)
 require 'test_helper'
 
 class ContainerTest < ActiveSupport::TestCase
-  def check_illegal_modify c
-    c.reload
-    c.command = ["echo", "bar"]
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
-    end
-
-    c.reload
-    c.container_image = "img2"
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
-    end
+  include DbCurrentTime
 
-    c.reload
-    c.cwd = "/tmp2"
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
-    end
+  DEFAULT_ATTRS = {
+    command: ['echo', 'foo'],
+    container_image: 'img',
+    output_path: '/tmp',
+    priority: 1,
+  }
 
-    c.reload
-    c.environment = {"FOO" => "BAR"}
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
+  def minimal_new attrs={}
+    cr = ContainerRequest.new DEFAULT_ATTRS.merge(attrs)
+    act_as_user users(:active) do
+      cr.save!
     end
-
-    c.reload
-    c.mounts = {"FOO" => "BAR"}
-    assert_raises(ActiveRecord::RecordInvalid) do
+    c = Container.new DEFAULT_ATTRS.merge(attrs)
+    act_as_system_user do
       c.save!
+      assert cr.update_attributes(container_uuid: c.uuid,
+                                  state: ContainerRequest::Committed,
+                                  ), show_errors(cr)
     end
+    return c, cr
+  end
 
-    c.reload
-    c.output_path = "/tmp3"
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
+  def check_illegal_updates c, bad_updates
+    bad_updates.each do |u|
+      refute c.update_attributes(u), u.inspect
+      refute c.valid?, u.inspect
+      c.reload
     end
+  end
 
-    c.reload
-    c.runtime_constraints = {"FOO" => "BAR"}
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
-    end
+  def check_illegal_modify c
+    check_illegal_updates c, [{command: ["echo", "bar"]},
+                              {container_image: "img2"},
+                              {cwd: "/tmp2"},
+                              {environment: {"FOO" => "BAR"}},
+                              {mounts: {"FOO" => "BAR"}},
+                              {output_path: "/tmp3"},
+                              {locked_by_uuid: "zzzzz-gj3su-027z32aux8dg2s1"},
+                              {auth_uuid: "zzzzz-gj3su-017z32aux8dg2s1"},
+                              {runtime_constraints: {"FOO" => "BAR"}}]
   end
 
   def check_bogus_states c
-    c.reload
-    c.state = nil
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
-    end
-
-    c.reload
-    c.state = "Flubber"
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
-    end
+    check_illegal_updates c, [{state: nil},
+                              {state: "Flubber"}]
   end
 
-  def check_no_change_from_complete c
+  def check_no_change_from_cancelled c
     check_illegal_modify c
     check_bogus_states c
-
-    c.reload
-    c.priority = 3
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
-    end
-
-    c.reload
-    c.state = "Queued"
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
-    end
-
-    c.reload
-    c.state = "Running"
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
-    end
-
-    c.reload
-    c.state = "Complete"
-    assert_raises(ActiveRecord::RecordInvalid) do
-      c.save!
-    end
+    check_illegal_updates c, [{ priority: 3 },
+                              { state: Container::Queued },
+                              { state: Container::Locked },
+                              { state: Container::Running },
+                              { state: Container::Complete }]
   end
 
   test "Container create" do
     act_as_system_user do
-      c = Container.new
-      c.command = ["echo", "foo"]
-      c.container_image = "img"
-      c.cwd = "/tmp"
-      c.environment = {}
-      c.mounts = {"BAR" => "FOO"}
-      c.output_path = "/tmp"
-      c.priority = 1
-      c.runtime_constraints = {}
-      c.save!
+      c, _ = minimal_new(environment: {},
+                      mounts: {"BAR" => "FOO"},
+                      output_path: "/tmp",
+                      priority: 1,
+                      runtime_constraints: {})
 
       check_illegal_modify c
       check_bogus_states c
@@ -111,80 +78,100 @@ class ContainerTest < ActiveSupport::TestCase
   end
 
   test "Container running" do
-    act_as_system_user do
-      c = Container.new
-      c.command = ["echo", "foo"]
-      c.container_image = "img"
-      c.output_path = "/tmp"
-      c.save!
+    c, _ = minimal_new priority: 1
 
-      c.reload
-      c.state = "Complete"
-      assert_raises(ActiveRecord::RecordInvalid) do
-        c.save!
-      end
+    set_user_from_auth :dispatch1
+    check_illegal_updates c, [{state: Container::Running},
+                              {state: Container::Complete}]
 
-      c.reload
-      c.state = "Running"
-      c.save!
+    c.update_attributes! state: Container::Locked
+    c.update_attributes! state: Container::Running
 
-      check_illegal_modify c
-      check_bogus_states c
+    check_illegal_modify c
+    check_bogus_states c
 
-      c.reload
-      c.state = "Queued"
-      assert_raises(ActiveRecord::RecordInvalid) do
-        c.save!
-      end
+    check_illegal_updates c, [{state: Container::Queued}]
+    c.reload
 
-      c.reload
-      c.priority = 3
-      c.save!
-    end
+    c.update_attributes! priority: 3
   end
 
-  test "Container queued cancel" do
-    act_as_system_user do
-      c = Container.new
-      c.command = ["echo", "foo"]
-      c.container_image = "img"
-      c.output_path = "/tmp"
-      c.save!
+  test "Lock and unlock" do
+    c, cr = minimal_new priority: 0
 
-      c.reload
-      c.state = "Cancelled"
-      c.save!
+    set_user_from_auth :dispatch1
+    assert_equal Container::Queued, c.state
 
-      check_no_change_from_complete c
-    end
-  end
+    refute c.update_attributes(state: Container::Locked), "no priority"
+    c.reload
+    assert cr.update_attributes priority: 1
 
-  test "Container running cancel" do
-    act_as_system_user do
-      c = Container.new
-      c.command = ["echo", "foo"]
-      c.container_image = "img"
-      c.output_path = "/tmp"
-      c.save!
+    refute c.update_attributes(state: Container::Running), "not locked"
+    c.reload
+    refute c.update_attributes(state: Container::Complete), "not locked"
+    c.reload
 
-      c.reload
-      c.state = "Running"
-      c.save!
+    assert c.update_attributes(state: Container::Locked), show_errors(c)
+    assert c.locked_by_uuid
+    assert c.auth_uuid
 
-      c.reload
-      c.state = "Cancelled"
-      c.save!
+    assert c.update_attributes(state: Container::Queued), show_errors(c)
+    refute c.locked_by_uuid
+    refute c.auth_uuid
 
-      check_no_change_from_complete c
-    end
+    refute c.update_attributes(state: Container::Running), "not locked"
+    c.reload
+    refute c.locked_by_uuid
+    refute c.auth_uuid
+
+    assert c.update_attributes(state: Container::Locked), show_errors(c)
+    assert c.update_attributes(state: Container::Running), show_errors(c)
+    assert c.locked_by_uuid
+    assert c.auth_uuid
+
+    auth_uuid_was = c.auth_uuid
+
+    refute c.update_attributes(state: Container::Locked), "already running"
+    c.reload
+    refute c.update_attributes(state: Container::Queued), "already running"
+    c.reload
+
+    assert c.update_attributes(state: Container::Complete), show_errors(c)
+    refute c.locked_by_uuid
+    refute c.auth_uuid
+
+    auth_exp = ApiClientAuthorization.find_by_uuid(auth_uuid_was).expires_at
+    assert_operator auth_exp, :<, db_current_time
+  end
+
+  test "Container queued cancel" do
+    c, _ = minimal_new
+    set_user_from_auth :dispatch1
+    assert c.update_attributes(state: Container::Cancelled), show_errors(c)
+    check_no_change_from_cancelled c
+  end
+
+  test "Container locked cancel" do
+    c, _ = minimal_new
+    set_user_from_auth :dispatch1
+    assert c.update_attributes(state: Container::Locked), show_errors(c)
+    assert c.update_attributes(state: Container::Cancelled), show_errors(c)
+    check_no_change_from_cancelled c
+  end
+
+  test "Container running cancel" do
+    c, _ = minimal_new
+    set_user_from_auth :dispatch1
+    c.update_attributes! state: Container::Queued
+    c.update_attributes! state: Container::Locked
+    c.update_attributes! state: Container::Running
+    c.update_attributes! state: Container::Cancelled
+    check_no_change_from_cancelled c
   end
 
   test "Container create forbidden for non-admin" do
     set_user_from_auth :active_trustedclient
-    c = Container.new
-    c.command = ["echo", "foo"]
-    c.container_image = "img"
-    c.cwd = "/tmp"
+    c = Container.new DEFAULT_ATTRS
     c.environment = {}
     c.mounts = {"BAR" => "FOO"}
     c.output_path = "/tmp"
@@ -196,34 +183,14 @@ class ContainerTest < ActiveSupport::TestCase
   end
 
   test "Container only set exit code on complete" do
-    act_as_system_user do
-      c = Container.new
-      c.command = ["echo", "foo"]
-      c.container_image = "img"
-      c.output_path = "/tmp"
-      c.save!
+    c, _ = minimal_new
+    set_user_from_auth :dispatch1
+    c.update_attributes! state: Container::Locked
+    c.update_attributes! state: Container::Running
 
-      c.reload
-      c.state = "Running"
-      c.save!
+    check_illegal_updates c, [{exit_code: 1},
+                              {exit_code: 1, state: Container::Cancelled}]
 
-      c.reload
-      c.exit_code = 1
-      assert_raises(ActiveRecord::RecordInvalid) do
-        c.save!
-      end
-
-      c.reload
-      c.exit_code = 1
-      c.state = "Cancelled"
-      assert_raises(ActiveRecord::RecordInvalid) do
-        c.save!
-      end
-
-      c.reload
-      c.exit_code = 1
-      c.state = "Complete"
-      c.save!
-    end
+    assert c.update_attributes(exit_code: 1, state: Container::Complete)
   end
 end
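
The container_test.rb changes above exercise the new container lifecycle: Queued -> Locked -> Running -> Complete, with Cancelled reachable from any of the first three states and no further edits allowed afterwards. The dispatcher side drives the same transitions through the sdk/go/dispatch package added in this merge; a minimal sketch, assuming `dispatcher` is an initialized dispatch.Dispatcher, `uuid` names a Queued container whose request has priority > 0, and UpdateState accepts the state names used in these diffs:

    // Sketch only -- each hop is validated server side, so an illegal
    // transition (e.g. Queued straight to Running) is rejected with an error.
    if err := dispatcher.UpdateState(uuid, dispatch.Locked); err != nil {
            log.Fatalf("lock failed: %v", err) // e.g. priority is still zero
    }
    if err := dispatcher.UpdateState(uuid, dispatch.Running); err != nil {
            log.Fatalf("run failed: %v", err)
    }
    // The model tests above use the bare string for the terminal state.
    if err := dispatcher.UpdateState(uuid, "Complete"); err != nil {
            log.Fatalf("complete failed: %v", err)
    }
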
index f16c8b2ec497e0caaaf5d78bdf5bb063b58badad..832338a3cc5de1ce742eb5088992ba91d4fe5fdc 100644 (file)
@@ -316,7 +316,6 @@ class JobTest < ActiveSupport::TestCase
 
     assert_not_nil job1.queue_position, "Expected non-nil queue position for job1"
     assert_not_nil job2.queue_position, "Expected non-nil queue position for job2"
-    assert_not_equal job1.queue_position, job2.queue_position
   end
 
   SDK_MASTER = "ca68b24e51992e790f29df5cc4bc54ce1da4a1c2"
index e05c0c5da4439e44931837ea5a259885624b80d8..73a389533679a2ceef773a237891921ae5bf92fc 100644 (file)
@@ -1,14 +1,15 @@
 package main
 
+// Dispatcher service for Crunch that runs containers locally.
+
 import (
        "flag"
        "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/dispatch"
        "log"
        "os"
        "os/exec"
-       "os/signal"
        "sync"
-       "syscall"
        "time"
 )
 
@@ -20,12 +21,10 @@ func main() {
 }
 
 var (
-       arv              arvadosclient.ArvadosClient
        runningCmds      map[string]*exec.Cmd
        runningCmdsMutex sync.Mutex
        waitGroup        sync.WaitGroup
-       doneProcessing   chan bool
-       sigChan          chan os.Signal
+       crunchRunCommand *string
 )
 
 func doMain() error {
@@ -36,12 +35,7 @@ func doMain() error {
                10,
                "Interval in seconds to poll for queued containers")
 
-       priorityPollInterval := flags.Int(
-               "container-priority-poll-interval",
-               60,
-               "Interval in seconds to check priority of a dispatched container")
-
-       crunchRunCommand := flags.String(
+       crunchRunCommand = flags.String(
                "crunch-run-command",
                "/usr/bin/crunch-run",
                "Crunch command to run container")
@@ -49,35 +43,32 @@ func doMain() error {
        // Parse args; omit the first arg which is the command name
        flags.Parse(os.Args[1:])
 
-       var err error
-       arv, err = arvadosclient.MakeArvadosClient()
+       runningCmds = make(map[string]*exec.Cmd)
+
+       arv, err := arvadosclient.MakeArvadosClient()
        if err != nil {
+               log.Printf("Error making Arvados client: %v", err)
                return err
        }
+       arv.Retries = 25
 
-       // Channel to terminate
-       doneProcessing = make(chan bool)
-
-       // Map of running crunch jobs
-       runningCmds = make(map[string]*exec.Cmd)
-
-       // Graceful shutdown
-       sigChan = make(chan os.Signal, 1)
-       signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
-       go func(sig <-chan os.Signal) {
-               for sig := range sig {
-                       log.Printf("Caught signal: %v", sig)
-                       doneProcessing <- true
-               }
-       }(sigChan)
+       dispatcher := dispatch.Dispatcher{
+               Arv:            arv,
+               RunContainer:   run,
+               PollInterval:   time.Duration(*pollInterval) * time.Second,
+               DoneProcessing: make(chan struct{})}
 
-       // Run all queued containers
-       runQueuedContainers(*pollInterval, *priorityPollInterval, *crunchRunCommand)
+       err = dispatcher.RunDispatcher()
+       if err != nil {
+               return err
+       }
 
+       runningCmdsMutex.Lock()
        // Finished dispatching; interrupt any crunch jobs that are still running
        for _, cmd := range runningCmds {
                cmd.Process.Signal(os.Interrupt)
        }
+       runningCmdsMutex.Unlock()
 
        // Wait for all running crunch jobs to complete / terminate
        waitGroup.Wait()
@@ -85,136 +76,99 @@ func doMain() error {
        return nil
 }
 
-// Poll for queued containers using pollInterval.
-// Invoke dispatchLocal for each ticker cycle, which will run all the queued containers.
-//
-// Any errors encountered are logged but the program would continue to run (not exit).
-// This is because, once one or more crunch jobs are running,
-// we would need to wait for them complete.
-func runQueuedContainers(pollInterval, priorityPollInterval int, crunchRunCommand string) {
-       ticker := time.NewTicker(time.Duration(pollInterval) * time.Second)
-
-       for {
-               select {
-               case <-ticker.C:
-                       dispatchLocal(priorityPollInterval, crunchRunCommand)
-               case <-doneProcessing:
-                       ticker.Stop()
-                       return
-               }
-       }
+func startFunc(container dispatch.Container, cmd *exec.Cmd) error {
+       return cmd.Start()
 }
 
-// Container data
-type Container struct {
-       UUID     string `json:"uuid"`
-       State    string `json:"state"`
-       Priority int    `json:"priority"`
-}
+var startCmd = startFunc
 
-// ContainerList is a list of the containers from api
-type ContainerList struct {
-       Items []Container `json:"items"`
-}
-
-// Get the list of queued containers from API server and invoke run for each container.
-func dispatchLocal(priorityPollInterval int, crunchRunCommand string) {
-       params := arvadosclient.Dict{
-               "filters": [][]string{[]string{"state", "=", "Queued"}},
-       }
-
-       var containers ContainerList
-       err := arv.List("containers", params, &containers)
-       if err != nil {
-               log.Printf("Error getting list of queued containers: %q", err)
-               return
-       }
+// Run a container.
+//
+// If the container is Locked, start a new crunch-run process and wait until
+// crunch-run completes.  If the priority changes to zero while crunch-run is
+// still running, send an interrupt signal to the crunch-run process.
+//
+// If the container is in any other state, or is not Complete/Cancelled after
+// crunch-run terminates, mark the container as Cancelled.
+func run(dispatcher *dispatch.Dispatcher,
+       container dispatch.Container,
+       status chan dispatch.Container) {
+
+       uuid := container.UUID
+
+       if container.State == dispatch.Locked {
+               waitGroup.Add(1)
+
+               cmd := exec.Command(*crunchRunCommand, uuid)
+               cmd.Stdin = nil
+               cmd.Stderr = os.Stderr
+               cmd.Stdout = os.Stderr
+
+               log.Printf("Starting container %v", uuid)
+
+               // Add this crunch job to the list of runningCmds only if we
+               // succeed in starting crunch-run.
+
+               runningCmdsMutex.Lock()
+               if err := startCmd(container, cmd); err != nil {
+                       runningCmdsMutex.Unlock()
+                       log.Printf("Error starting %v for %v: %q", *crunchRunCommand, uuid, err)
+                       dispatcher.UpdateState(uuid, dispatch.Cancelled)
+               } else {
+                       runningCmds[uuid] = cmd
+                       runningCmdsMutex.Unlock()
+
+                       // Need to wait for crunch-run to exit
+                       done := make(chan struct{})
+
+                       go func() {
+                               if _, err := cmd.Process.Wait(); err != nil {
+                                       log.Printf("Error while waiting for crunch job to finish for %v: %q", uuid, err)
+                               }
+                               log.Printf("sending done")
+                               done <- struct{}{}
+                       }()
+
+               Loop:
+                       for {
+                               select {
+                               case <-done:
+                                       break Loop
+                               case c := <-status:
+                                       // Interrupt the child process if priority changes to 0
+                                       if (c.State == dispatch.Locked || c.State == dispatch.Running) && c.Priority == 0 {
+                                               log.Printf("Sending SIGINT to pid %d to cancel container %v", cmd.Process.Pid, uuid)
+                                               cmd.Process.Signal(os.Interrupt)
+                                       }
+                               }
+                       }
+                       close(done)
 
-       for i := 0; i < len(containers.Items); i++ {
-               log.Printf("About to run queued container %v", containers.Items[i].UUID)
-               // Run the container
-               go run(containers.Items[i].UUID, crunchRunCommand, priorityPollInterval)
-       }
-}
+                       log.Printf("Finished container run for %v", uuid)
 
-// Run queued container:
-// Set container state to locked (TBD)
-// Run container using the given crunch-run command
-// Set the container state to Running
-// If the container priority becomes zero while crunch job is still running, terminate it.
-func run(uuid string, crunchRunCommand string, priorityPollInterval int) {
-       cmd := exec.Command(crunchRunCommand, uuid)
-
-       cmd.Stdin = nil
-       cmd.Stderr = os.Stderr
-       cmd.Stdout = os.Stderr
-       if err := cmd.Start(); err != nil {
-               log.Printf("Error running container for %v: %q", uuid, err)
-               return
+                       // Remove the crunch job from runningCmds
+                       runningCmdsMutex.Lock()
+                       delete(runningCmds, uuid)
+                       runningCmdsMutex.Unlock()
+               }
+               waitGroup.Done()
        }
 
-       // Add this crunch job to the list of runningCmds
-       runningCmdsMutex.Lock()
-       runningCmds[uuid] = cmd
-       runningCmdsMutex.Unlock()
-
-       log.Printf("Started container run for %v", uuid)
-
-       // Add this crunch job to waitGroup
-       waitGroup.Add(1)
-       defer waitGroup.Done()
-
-       // Update container status to Running
-       err := arv.Update("containers", uuid,
-               arvadosclient.Dict{
-                       "container": arvadosclient.Dict{"state": "Running"}},
-               nil)
+       // If the container is not finalized, then change it to "Cancelled".
+       err := dispatcher.Arv.Get("containers", uuid, nil, &container)
        if err != nil {
-               log.Printf("Error updating container state to 'Running' for %v: %q", uuid, err)
+               log.Printf("Error getting final container state: %v", err)
        }
-
-       // A goroutine to terminate the runner if container priority becomes zero
-       priorityTicker := time.NewTicker(time.Duration(priorityPollInterval) * time.Second)
-       go func() {
-               for _ = range priorityTicker.C {
-                       var container Container
-                       err := arv.Get("containers", uuid, nil, &container)
-                       if err != nil {
-                               log.Printf("Error getting container info for %v: %q", uuid, err)
-                       } else {
-                               if container.Priority == 0 {
-                                       priorityTicker.Stop()
-                                       cmd.Process.Signal(os.Interrupt)
-                               }
-                       }
-               }
-       }()
-
-       // Wait for the crunch job to exit
-       if _, err := cmd.Process.Wait(); err != nil {
-               log.Printf("Error while waiting for crunch job to finish for %v: %q", uuid, err)
+       if container.LockedByUUID == dispatcher.Auth.UUID &&
+               (container.State == dispatch.Locked || container.State == dispatch.Running) {
+               log.Printf("After %s process termination, container state for %v is %q.  Updating it to %q",
+                       *crunchRunCommand, uuid, container.State, dispatch.Cancelled)
+               dispatcher.UpdateState(uuid, dispatch.Cancelled)
        }
 
-       // Remove the crunch job to runningCmds
-       runningCmdsMutex.Lock()
-       delete(runningCmds, uuid)
-       runningCmdsMutex.Unlock()
-
-       priorityTicker.Stop()
-
-       // The container state should be 'Complete'
-       var container Container
-       err = arv.Get("containers", uuid, nil, &container)
-       if container.State == "Running" {
-               log.Printf("After crunch-run process termination, the state is still 'Running' for %v. Updating it to 'Complete'", uuid)
-               err = arv.Update("containers", uuid,
-                       arvadosclient.Dict{
-                               "container": arvadosclient.Dict{"state": "Complete"}},
-                       nil)
-               if err != nil {
-                       log.Printf("Error updating container state to Complete for %v: %q", uuid, err)
-               }
+       // drain any subsequent status changes
+       for _ = range status {
        }
 
-       log.Printf("Finished container run for %v", uuid)
+       log.Printf("Finalized container %v", uuid)
 }
index 3ec1e2ec6b41c4759bd6f75e6cddc847254633ab..0248f18433df7aa3e591ebb78e538b28931d622b 100644 (file)
@@ -1,20 +1,20 @@
 package main
 
 import (
+       "bytes"
        "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
        "git.curoverse.com/arvados.git/sdk/go/arvadostest"
-
-       "io/ioutil"
+       "git.curoverse.com/arvados.git/sdk/go/dispatch"
+       . "gopkg.in/check.v1"
+       "io"
        "log"
        "net/http"
        "net/http/httptest"
        "os"
+       "os/exec"
        "strings"
-       "syscall"
        "testing"
        "time"
-
-       . "gopkg.in/check.v1"
 )
 
 // Gocheck boilerplate
@@ -33,6 +33,7 @@ var initialArgs []string
 func (s *TestSuite) SetUpSuite(c *C) {
        initialArgs = os.Args
        arvadostest.StartAPI()
+       runningCmds = make(map[string]*exec.Cmd)
 }
 
 func (s *TestSuite) TearDownSuite(c *C) {
@@ -42,12 +43,6 @@ func (s *TestSuite) TearDownSuite(c *C) {
 func (s *TestSuite) SetUpTest(c *C) {
        args := []string{"crunch-dispatch-local"}
        os.Args = args
-
-       var err error
-       arv, err = arvadosclient.MakeArvadosClient()
-       if err != nil {
-               c.Fatalf("Error making arvados client: %s", err)
-       }
 }
 
 func (s *TestSuite) TearDownTest(c *C) {
@@ -59,29 +54,48 @@ func (s *MockArvadosServerSuite) TearDownTest(c *C) {
        arvadostest.ResetEnv()
 }
 
-func (s *TestSuite) Test_doMain(c *C) {
-       args := []string{"-poll-interval", "2", "-container-priority-poll-interval", "1", "-crunch-run-command", "echo"}
-       os.Args = append(os.Args, args...)
+func (s *TestSuite) TestIntegration(c *C) {
+       arv, err := arvadosclient.MakeArvadosClient()
+       c.Assert(err, IsNil)
+
+       echo := "echo"
+       crunchRunCommand = &echo
+
+       doneProcessing := make(chan struct{})
+       dispatcher := dispatch.Dispatcher{
+               Arv:          arv,
+               PollInterval: time.Duration(1) * time.Second,
+               RunContainer: func(dispatcher *dispatch.Dispatcher,
+                       container dispatch.Container,
+                       status chan dispatch.Container) {
+                       run(dispatcher, container, status)
+                       doneProcessing <- struct{}{}
+               },
+               DoneProcessing: doneProcessing}
+
+       startCmd = func(container dispatch.Container, cmd *exec.Cmd) error {
+               dispatcher.UpdateState(container.UUID, "Running")
+               dispatcher.UpdateState(container.UUID, "Complete")
+               return cmd.Start()
+       }
 
-       go func() {
-               time.Sleep(5 * time.Second)
-               sigChan <- syscall.SIGINT
-       }()
+       err = dispatcher.RunDispatcher()
+       c.Assert(err, IsNil)
 
-       err := doMain()
-       c.Check(err, IsNil)
+       // Wait for all running crunch jobs to complete / terminate
+       waitGroup.Wait()
 
        // There should be no queued containers now
        params := arvadosclient.Dict{
                "filters": [][]string{[]string{"state", "=", "Queued"}},
        }
-       var containers ContainerList
+       var containers dispatch.ContainerList
        err = arv.List("containers", params, &containers)
        c.Check(err, IsNil)
        c.Assert(len(containers.Items), Equals, 0)
 
        // Previously "Queued" container should now be in "Complete" state
-       var container Container
+       var container dispatch.Container
        err = arv.Get("containers", "zzzzz-dz642-queuedcontainer", nil, &container)
        c.Check(err, IsNil)
        c.Check(container.State, Equals, "Complete")
@@ -91,47 +105,51 @@ func (s *MockArvadosServerSuite) Test_APIErrorGettingContainers(c *C) {
        apiStubResponses := make(map[string]arvadostest.StubResponse)
        apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
 
-       testWithServerStub(c, apiStubResponses, "echo", "Error getting list of queued containers")
+       testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
 }
 
 func (s *MockArvadosServerSuite) Test_APIErrorUpdatingContainerState(c *C) {
        apiStubResponses := make(map[string]arvadostest.StubResponse)
        apiStubResponses["/arvados/v1/containers"] =
-               arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx1"}]}`)}
+               arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx1","State":"Queued"}]}`)}
        apiStubResponses["/arvados/v1/containers/zzzzz-dz642-xxxxxxxxxxxxxx1"] =
                arvadostest.StubResponse{500, string(`{}`)}
 
-       testWithServerStub(c, apiStubResponses, "echo", "Error updating container state")
+       testWithServerStub(c, apiStubResponses, "echo", "Error updating container zzzzz-dz642-xxxxxxxxxxxxxx1 to state \"Locked\"")
 }
 
 func (s *MockArvadosServerSuite) Test_ContainerStillInRunningAfterRun(c *C) {
        apiStubResponses := make(map[string]arvadostest.StubResponse)
        apiStubResponses["/arvados/v1/containers"] =
-               arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx2"}]}`)}
+               arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx2","State":"Queued"}]}`)}
        apiStubResponses["/arvados/v1/containers/zzzzz-dz642-xxxxxxxxxxxxxx2"] =
-               arvadostest.StubResponse{200, string(`{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx2", "state":"Running", "priority":1}`)}
+               arvadostest.StubResponse{200, string(`{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx2", "state":"Running", "priority":1, "locked_by_uuid": "` + arvadostest.Dispatch1AuthUUID + `"}`)}
 
        testWithServerStub(c, apiStubResponses, "echo",
-               "After crunch-run process termination, the state is still 'Running' for zzzzz-dz642-xxxxxxxxxxxxxx2")
+               `After echo process termination, container state for zzzzz-dz642-xxxxxxxxxxxxxx2 is "Running".  Updating it to "Cancelled"`)
 }
 
 func (s *MockArvadosServerSuite) Test_ErrorRunningContainer(c *C) {
        apiStubResponses := make(map[string]arvadostest.StubResponse)
        apiStubResponses["/arvados/v1/containers"] =
-               arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx3"}]}`)}
+               arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx3","State":"Queued"}]}`)}
+
        apiStubResponses["/arvados/v1/containers/zzzzz-dz642-xxxxxxxxxxxxxx3"] =
                arvadostest.StubResponse{200, string(`{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx3", "state":"Running", "priority":1}`)}
 
-       testWithServerStub(c, apiStubResponses, "nosuchcommand", "Error running container for zzzzz-dz642-xxxxxxxxxxxxxx3")
+       testWithServerStub(c, apiStubResponses, "nosuchcommand", "Error starting nosuchcommand for zzzzz-dz642-xxxxxxxxxxxxxx3")
 }
 
 func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
+       apiStubResponses["/arvados/v1/api_client_authorizations/current"] =
+               arvadostest.StubResponse{200, string(`{"uuid": "` + arvadostest.Dispatch1AuthUUID + `", "api_token": "xyz"}`)}
+
        apiStub := arvadostest.ServerStub{apiStubResponses}
 
        api := httptest.NewServer(&apiStub)
        defer api.Close()
 
-       arv = arvadosclient.ArvadosClient{
+       arv := arvadosclient.ArvadosClient{
                Scheme:    "http",
                ApiServer: api.URL[7:],
                ApiToken:  "abc123",
@@ -139,21 +157,42 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
                Retries:   0,
        }
 
-       tempfile, err := ioutil.TempFile(os.TempDir(), "temp-log-file")
-       c.Check(err, IsNil)
-       defer os.Remove(tempfile.Name())
-       log.SetOutput(tempfile)
+       buf := bytes.NewBuffer(nil)
+       log.SetOutput(io.MultiWriter(buf, os.Stderr))
+       defer log.SetOutput(os.Stderr)
+
+       *crunchRunCommand = crunchCmd
+
+       doneProcessing := make(chan struct{})
+       dispatcher := dispatch.Dispatcher{
+               Arv:          arv,
+               PollInterval: time.Duration(1) * time.Second,
+               RunContainer: func(dispatcher *dispatch.Dispatcher,
+                       container dispatch.Container,
+                       status chan dispatch.Container) {
+                       run(dispatcher, container, status)
+                       doneProcessing <- struct{}{}
+               },
+               DoneProcessing: doneProcessing}
+
+       startCmd = func(container dispatch.Container, cmd *exec.Cmd) error {
+               dispatcher.UpdateState(container.UUID, "Running")
+               dispatcher.UpdateState(container.UUID, "Complete")
+               return cmd.Start()
+       }
 
        go func() {
-               time.Sleep(2 * time.Second)
-               sigChan <- syscall.SIGTERM
+               for i := 0; i < 80 && !strings.Contains(buf.String(), expected); i++ {
+                       time.Sleep(100 * time.Millisecond)
+               }
+               dispatcher.DoneProcessing <- struct{}{}
        }()
 
-       runQueuedContainers(1, 1, crunchCmd)
+       err := dispatcher.RunDispatcher()
+       c.Assert(err, IsNil)
 
        // Wait for all running crunch jobs to complete / terminate
        waitGroup.Wait()
 
-       buf, _ := ioutil.ReadFile(tempfile.Name())
-       c.Check(strings.Contains(string(buf), expected), Equals, true)
+       c.Check(buf.String(), Matches, `(?ms).*`+expected+`.*`)
 }
index 8fbc0fa8b63810bd6a06fadab097836dfd06b113..f718fbcdcea3fd5c00ab8763240ee3056f098a53 100644 (file)
@@ -1,16 +1,18 @@
 package main
 
+// Dispatcher service for Crunch that submits containers to the slurm queue.
+
 import (
        "flag"
        "fmt"
        "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/dispatch"
        "io/ioutil"
        "log"
+       "math"
        "os"
        "os/exec"
-       "os/signal"
-       "sync"
-       "syscall"
+       "strings"
        "time"
 )
 
@@ -22,12 +24,8 @@ func main() {
 }
 
 var (
-       arv              arvadosclient.ArvadosClient
-       runningCmds      map[string]*exec.Cmd
-       runningCmdsMutex sync.Mutex
-       waitGroup        sync.WaitGroup
-       doneProcessing   chan bool
-       sigChan          chan os.Signal
+       crunchRunCommand *string
+       squeueUpdater    Squeue
 )
 
 func doMain() error {
@@ -38,139 +36,80 @@ func doMain() error {
                10,
                "Interval in seconds to poll for queued containers")
 
-       priorityPollInterval := flags.Int(
-               "container-priority-poll-interval",
-               60,
-               "Interval in seconds to check priority of a dispatched container")
-
-       crunchRunCommand := flags.String(
+       crunchRunCommand = flags.String(
                "crunch-run-command",
                "/usr/bin/crunch-run",
                "Crunch command to run container")
 
-       finishCommand := flags.String(
-               "finish-command",
-               "/usr/bin/crunch-finish-slurm.sh",
-               "Command to run from strigger when job is finished")
-
        // Parse args; omit the first arg which is the command name
        flags.Parse(os.Args[1:])
 
-       var err error
-       arv, err = arvadosclient.MakeArvadosClient()
+       arv, err := arvadosclient.MakeArvadosClient()
        if err != nil {
+               log.Printf("Error making Arvados client: %v", err)
                return err
        }
+       arv.Retries = 25
 
-       // Channel to terminate
-       doneProcessing = make(chan bool)
-
-       // Graceful shutdown
-       sigChan = make(chan os.Signal, 1)
-       signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
-       go func(sig <-chan os.Signal) {
-               for sig := range sig {
-                       log.Printf("Caught signal: %v", sig)
-                       doneProcessing <- true
-               }
-       }(sigChan)
-
-       // Run all queued containers
-       runQueuedContainers(*pollInterval, *priorityPollInterval, *crunchRunCommand, *finishCommand)
-
-       // Wait for all running crunch jobs to complete / terminate
-       waitGroup.Wait()
-
-       return nil
-}
-
-// Poll for queued containers using pollInterval.
-// Invoke dispatchSlurm for each ticker cycle, which will run all the queued containers.
-//
-// Any errors encountered are logged but the program would continue to run (not exit).
-// This is because, once one or more crunch jobs are running,
-// we would need to wait for them complete.
-func runQueuedContainers(pollInterval, priorityPollInterval int, crunchRunCommand, finishCommand string) {
-       ticker := time.NewTicker(time.Duration(pollInterval) * time.Second)
-
-       for {
-               select {
-               case <-ticker.C:
-                       dispatchSlurm(priorityPollInterval, crunchRunCommand, finishCommand)
-               case <-doneProcessing:
-                       ticker.Stop()
-                       return
-               }
-       }
-}
-
-// Container data
-type Container struct {
-       UUID     string `json:"uuid"`
-       State    string `json:"state"`
-       Priority int    `json:"priority"`
-}
-
-// ContainerList is a list of the containers from api
-type ContainerList struct {
-       Items []Container `json:"items"`
-}
+       squeueUpdater.StartMonitor(time.Duration(*pollInterval) * time.Second)
+       defer squeueUpdater.Done()
 
-// Get the list of queued containers from API server and invoke run for each container.
-func dispatchSlurm(priorityPollInterval int, crunchRunCommand, finishCommand string) {
-       params := arvadosclient.Dict{
-               "filters": [][]string{[]string{"state", "=", "Queued"}},
-       }
+       dispatcher := dispatch.Dispatcher{
+               Arv:            arv,
+               RunContainer:   run,
+               PollInterval:   time.Duration(*pollInterval) * time.Second,
+               DoneProcessing: make(chan struct{})}
 
-       var containers ContainerList
-       err := arv.List("containers", params, &containers)
+       err = dispatcher.RunDispatcher()
        if err != nil {
-               log.Printf("Error getting list of queued containers: %q", err)
-               return
+               return err
        }
 
-       for i := 0; i < len(containers.Items); i++ {
-               log.Printf("About to submit queued container %v", containers.Items[i].UUID)
-               // Run the container
-               go run(containers.Items[i], crunchRunCommand, finishCommand, priorityPollInterval)
-       }
+       return nil
 }
 
 // sbatchCmd
-func sbatchFunc(uuid string) *exec.Cmd {
-       return exec.Command("sbatch", "--job-name="+uuid, "--share", "--parsable")
+func sbatchFunc(container dispatch.Container) *exec.Cmd {
+       memPerCPU := math.Ceil((float64(container.RuntimeConstraints["ram"])) / (float64(container.RuntimeConstraints["vcpus"] * 1048576)))
+       return exec.Command("sbatch", "--share", "--parsable",
+               fmt.Sprintf("--job-name=%s", container.UUID),
+               fmt.Sprintf("--mem-per-cpu=%d", int(memPerCPU)),
+               fmt.Sprintf("--cpus-per-task=%d", int(container.RuntimeConstraints["vcpus"])),
+               fmt.Sprintf("--priority=%d", container.Priority))
 }
 
-var sbatchCmd = sbatchFunc
-
-// striggerCmd
-func striggerFunc(jobid, containerUUID, finishCommand, apiHost, apiToken, apiInsecure string) *exec.Cmd {
-       return exec.Command("strigger", "--set", "--jobid="+jobid, "--fini",
-               fmt.Sprintf("--program=%s %s %s %s %s", finishCommand, apiHost, apiToken, apiInsecure, containerUUID))
+// scancelCmd
+func scancelFunc(container dispatch.Container) *exec.Cmd {
+       return exec.Command("scancel", "--name="+container.UUID)
 }
 
-var striggerCmd = striggerFunc
+// Wrap these so that they can be overridden by tests
+var sbatchCmd = sbatchFunc
+var scancelCmd = scancelFunc
 
 // Submit job to slurm using sbatch.
-func submit(container Container, crunchRunCommand string) (jobid string, submitErr error) {
+func submit(dispatcher *dispatch.Dispatcher,
+       container dispatch.Container, crunchRunCommand string) (jobid string, submitErr error) {
        submitErr = nil
 
-       // Mark record as complete if anything errors out.
        defer func() {
-               if submitErr != nil {
-                       // This really should be an "Error" state, see #8018
-                       updateErr := arv.Update("containers", container.UUID,
-                               arvadosclient.Dict{
-                                       "container": arvadosclient.Dict{"state": "Complete"}},
-                               nil)
-                       if updateErr != nil {
-                               log.Printf("Error updating container state to 'Complete' for %v: %q", container.UUID, updateErr)
-                       }
+               // If we didn't get as far as submitting a slurm job,
+               // unlock the container and return it to the queue.
+               if submitErr == nil {
+                       // OK, no cleanup needed
+                       return
+               }
+               err := dispatcher.Arv.Update("containers", container.UUID,
+                       arvadosclient.Dict{
+                               "container": arvadosclient.Dict{"state": "Queued"}},
+                       nil)
+               if err != nil {
+                       log.Printf("Error unlocking container %s: %v", container.UUID, err)
                }
        }()
 
        // Create the command and attach to stdin/stdout
-       cmd := sbatchCmd(container.UUID)
+       cmd := sbatchCmd(container)
        stdinWriter, stdinerr := cmd.StdinPipe()
        if stdinerr != nil {
                submitErr = fmt.Errorf("Error creating stdin pipe %v: %q", container.UUID, stdinerr)
@@ -189,6 +128,10 @@ func submit(container Container, crunchRunCommand string) (jobid string, submitE
                return
        }
 
+       // Mutex between squeue sync and running sbatch or scancel.
+       squeueUpdater.SlurmLock.Lock()
+       defer squeueUpdater.SlurmLock.Unlock()
+
        err := cmd.Start()
        if err != nil {
                submitErr = fmt.Errorf("Error starting %v: %v", cmd.Args, err)
@@ -198,15 +141,15 @@ func submit(container Container, crunchRunCommand string) (jobid string, submitE
        stdoutChan := make(chan []byte)
        go func() {
                b, _ := ioutil.ReadAll(stdoutReader)
+               stdoutReader.Close()
                stdoutChan <- b
-               close(stdoutChan)
        }()
 
        stderrChan := make(chan []byte)
        go func() {
                b, _ := ioutil.ReadAll(stderrReader)
+               stderrReader.Close()
                stderrChan <- b
-               close(stderrChan)
        }()
 
        // Send a tiny script on stdin to execute the crunch-run command
@@ -219,82 +162,112 @@ func submit(container Container, crunchRunCommand string) (jobid string, submitE
        stdoutMsg := <-stdoutChan
        stderrmsg := <-stderrChan
 
+       close(stdoutChan)
+       close(stderrChan)
+
        if err != nil {
                submitErr = fmt.Errorf("Container submission failed %v: %v %v", cmd.Args, err, stderrmsg)
                return
        }
 
        // If everything worked out, got the jobid on stdout
-       jobid = string(stdoutMsg)
+       jobid = strings.TrimSpace(string(stdoutMsg))
 
        return
 }
 
-// finalizeRecordOnFinish uses 'strigger' command to register a script that will run on
-// the slurm controller when the job finishes.
-func finalizeRecordOnFinish(jobid, containerUUID, finishCommand, apiHost, apiToken, apiInsecure string) {
-       cmd := striggerCmd(jobid, containerUUID, finishCommand, apiHost, apiToken, apiInsecure)
-       cmd.Stdout = os.Stdout
-       cmd.Stderr = os.Stderr
-       err := cmd.Run()
-       if err != nil {
-               log.Printf("While setting up strigger: %v", err)
-       }
-}
-
-// Run a queued container.
-// Set container state to locked (TBD)
-// Submit job to slurm to execute crunch-run command for the container
-// If the container priority becomes zero while crunch job is still running, cancel the job.
-func run(container Container, crunchRunCommand, finishCommand string, priorityPollInterval int) {
+// If the container is marked as Locked, check if it is already in the slurm
+// queue.  If not, submit it.
+//
+// If the container is marked as Running, check if it is in the slurm queue.
+// If not, mark it as Cancelled.
+func monitorSubmitOrCancel(dispatcher *dispatch.Dispatcher, container dispatch.Container, monitorDone *bool) {
+       submitted := false
+       for !*monitorDone {
+               if squeueUpdater.CheckSqueue(container.UUID) {
+                       // Found in the queue, so continue monitoring
+                       submitted = true
+               } else if container.State == dispatch.Locked && !submitted {
+                       // Not in queue but in Locked state and we haven't
+                       // submitted it yet, so submit it.
+
+                       log.Printf("About to submit queued container %v", container.UUID)
+
+                       if _, err := submit(dispatcher, container, *crunchRunCommand); err != nil {
+                               log.Printf("Error submitting container %s to slurm: %v",
+                                       container.UUID, err)
+                               // maybe sbatch is broken, put it back to queued
+                               dispatcher.UpdateState(container.UUID, dispatch.Queued)
+                       }
+                       submitted = true
+               } else {
+                       // Not in queue and we are not going to submit it.
+                       // Refresh the container state. If it is
+                       // Refresh the container state. If it is
+                       // Complete/Cancelled, do nothing; if it is Locked,
+                       // release it back to the queue; if it is Running,
+                       // mark it Cancelled.
+                       var con dispatch.Container
+                       err := dispatcher.Arv.Get("containers", container.UUID, nil, &con)
+                       if err != nil {
+                               log.Printf("Error getting final container state: %v", err)
+                       }
 
-       jobid, err := submit(container, crunchRunCommand)
-       if err != nil {
-               log.Printf("Error queuing container run: %v", err)
-               return
-       }
+                       var st string
+                       switch con.State {
+                       case dispatch.Locked:
+                               st = dispatch.Queued
+                       case dispatch.Running:
+                               st = dispatch.Cancelled
+                       default:
+                               // Container state is Queued, Complete or Cancelled so stop monitoring it.
+                               return
+                       }
 
-       insecure := "0"
-       if arv.ApiInsecure {
-               insecure = "1"
-       }
-       finalizeRecordOnFinish(jobid, container.UUID, finishCommand, arv.ApiServer, arv.ApiToken, insecure)
-
-       // Update container status to Running, this is a temporary workaround
-       // to avoid resubmitting queued containers because record locking isn't
-       // implemented yet.
-       err = arv.Update("containers", container.UUID,
-               arvadosclient.Dict{
-                       "container": arvadosclient.Dict{"state": "Running"}},
-               nil)
-       if err != nil {
-               log.Printf("Error updating container state to 'Running' for %v: %q", container.UUID, err)
+                       log.Printf("Container %s in state %v but missing from slurm queue, changing to %v.",
+                               container.UUID, con.State, st)
+                       dispatcher.UpdateState(container.UUID, st)
+               }
        }
+}
 
-       log.Printf("Submitted container run for %v", container.UUID)
-
-       containerUUID := container.UUID
-
-       // A goroutine to terminate the runner if container priority becomes zero
-       priorityTicker := time.NewTicker(time.Duration(priorityPollInterval) * time.Second)
-       go func() {
-               for _ = range priorityTicker.C {
-                       var container Container
-                       err := arv.Get("containers", containerUUID, nil, &container)
-                       if err != nil {
-                               log.Printf("Error getting container info for %v: %q", container.UUID, err)
-                       } else {
-                               if container.Priority == 0 {
-                                       log.Printf("Canceling container %v", container.UUID)
-                                       priorityTicker.Stop()
-                                       cancelcmd := exec.Command("scancel", "--name="+container.UUID)
-                                       cancelcmd.Run()
-                               }
-                               if container.State == "Complete" {
-                                       priorityTicker.Stop()
+// Run or monitor a container.
+//
+// Monitor status updates.  If the priority changes to zero, cancel the
+// container using scancel.
+func run(dispatcher *dispatch.Dispatcher,
+       container dispatch.Container,
+       status chan dispatch.Container) {
+
+       log.Printf("Monitoring container %v started", container.UUID)
+       defer log.Printf("Monitoring container %v finished", container.UUID)
+
+       monitorDone := false
+       go monitorSubmitOrCancel(dispatcher, container, &monitorDone)
+
+       for container = range status {
+               if container.State == dispatch.Locked || container.State == dispatch.Running {
+                       if container.Priority == 0 {
+                               log.Printf("Canceling container %s", container.UUID)
+
+                               // Mutex between squeue sync and running sbatch or scancel.
+                               squeueUpdater.SlurmLock.Lock()
+                               err := scancelCmd(container).Run()
+                               squeueUpdater.SlurmLock.Unlock()
+
+                               if err != nil {
+                                       log.Printf("Error stopping container %s with scancel: %v",
+                                               container.UUID, err)
+                                       if squeueUpdater.CheckSqueue(container.UUID) {
+                                               log.Printf("Container %s is still in squeue after scancel.",
+                                                       container.UUID)
+                                               continue
+                                       }
                                }
+
+                               err = dispatcher.UpdateState(container.UUID, dispatch.Cancelled)
                        }
                }
-       }()
-
+       }
+       monitorDone = true
 }
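
Note (not part of the patch): for orientation, here is a minimal sketch of how the run() function above might be wired into the dispatch package. The field and method names (Arv, PollInterval, RunContainer, DoneProcessing, RunDispatcher) are taken from the integration tests further down in this diff; the real main() wiring lives elsewhere in the patch, so treat this as an illustration under those assumptions. It also relies on the package-level run, squeueUpdater, and crunchRunCommand defined in the file above.

    package main

    import (
            "log"
            "time"

            "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
            "git.curoverse.com/arvados.git/sdk/go/dispatch"
    )

    func exampleDispatch() {
            arv, err := arvadosclient.MakeArvadosClient()
            if err != nil {
                    log.Fatalf("Error making Arvados client: %v", err)
            }

            crunch := "crunch-run"
            crunchRunCommand = &crunch // normally supplied as a command-line flag

            // Keep squeueContents fresh so CheckSqueue() has updates to wake up on.
            squeueUpdater.StartMonitor(10 * time.Second)
            defer squeueUpdater.Done()

            dispatcher := dispatch.Dispatcher{
                    Arv:            arv,
                    PollInterval:   10 * time.Second,
                    RunContainer:   run,                 // the function defined above
                    DoneProcessing: make(chan struct{}), // normally closed or fed by a signal handler
            }
            if err := dispatcher.RunDispatcher(); err != nil {
                    log.Fatal(err)
            }
    }
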
index 7355cff9d99cbbc0883a833b8f96a8c31902f271..cddbe8c706e27f2f988f988b97aa0da6c5a1b7de 100644 (file)
@@ -1,17 +1,18 @@
 package main
 
 import (
+       "bytes"
+       "fmt"
        "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
        "git.curoverse.com/arvados.git/sdk/go/arvadostest"
-
-       "io/ioutil"
+       "git.curoverse.com/arvados.git/sdk/go/dispatch"
+       "io"
        "log"
        "net/http"
        "net/http/httptest"
        "os"
        "os/exec"
        "strings"
-       "syscall"
        "testing"
        "time"
 
@@ -33,87 +34,138 @@ var initialArgs []string
 
 func (s *TestSuite) SetUpSuite(c *C) {
        initialArgs = os.Args
-       arvadostest.StartAPI()
 }
 
 func (s *TestSuite) TearDownSuite(c *C) {
-       arvadostest.StopAPI()
 }
 
 func (s *TestSuite) SetUpTest(c *C) {
        args := []string{"crunch-dispatch-slurm"}
        os.Args = args
 
-       var err error
-       arv, err = arvadosclient.MakeArvadosClient()
-       if err != nil {
-               c.Fatalf("Error making arvados client: %s", err)
-       }
+       arvadostest.StartAPI()
+       os.Setenv("ARVADOS_API_TOKEN", arvadostest.Dispatch1Token)
 }
 
 func (s *TestSuite) TearDownTest(c *C) {
-       arvadostest.ResetEnv()
        os.Args = initialArgs
+       arvadostest.StopAPI()
 }
 
 func (s *MockArvadosServerSuite) TearDownTest(c *C) {
        arvadostest.ResetEnv()
 }
 
-func (s *TestSuite) Test_doMain(c *C) {
-       args := []string{"-poll-interval", "2", "-container-priority-poll-interval", "1", "-crunch-run-command", "echo"}
-       os.Args = append(os.Args, args...)
+func (s *TestSuite) TestIntegrationNormal(c *C) {
+       container := s.integrationTest(c, func() *exec.Cmd { return exec.Command("echo", "zzzzz-dz642-queuedcontainer") },
+               []string(nil),
+               func(dispatcher *dispatch.Dispatcher, container dispatch.Container) {
+                       dispatcher.UpdateState(container.UUID, dispatch.Running)
+                       time.Sleep(3 * time.Second)
+                       dispatcher.UpdateState(container.UUID, dispatch.Complete)
+               })
+       c.Check(container.State, Equals, "Complete")
+}
 
-       var sbatchCmdLine []string
-       var striggerCmdLine []string
+func (s *TestSuite) TestIntegrationCancel(c *C) {
 
        // Override sbatchCmd
-       defer func(orig func(string) *exec.Cmd) {
-               sbatchCmd = orig
-       }(sbatchCmd)
-       sbatchCmd = func(uuid string) *exec.Cmd {
-               sbatchCmdLine = sbatchFunc(uuid).Args
-               return exec.Command("echo", uuid)
+       var scancelCmdLine []string
+       defer func(orig func(dispatch.Container) *exec.Cmd) {
+               scancelCmd = orig
+       }(scancelCmd)
+       scancelCmd = func(container dispatch.Container) *exec.Cmd {
+               scancelCmdLine = scancelFunc(container).Args
+               return exec.Command("echo")
        }
 
-       // Override striggerCmd
-       defer func(orig func(jobid, containerUUID, finishCommand,
-               apiHost, apiToken, apiInsecure string) *exec.Cmd) {
-               striggerCmd = orig
-       }(striggerCmd)
-       striggerCmd = func(jobid, containerUUID, finishCommand, apiHost, apiToken, apiInsecure string) *exec.Cmd {
-               striggerCmdLine = striggerFunc(jobid, containerUUID, finishCommand,
-                       apiHost, apiToken, apiInsecure).Args
-               go func() {
-                       time.Sleep(5 * time.Second)
-                       arv.Update("containers", containerUUID,
+       container := s.integrationTest(c, func() *exec.Cmd { return exec.Command("echo", "zzzzz-dz642-queuedcontainer") },
+               []string(nil),
+               func(dispatcher *dispatch.Dispatcher, container dispatch.Container) {
+                       dispatcher.UpdateState(container.UUID, dispatch.Running)
+                       time.Sleep(1 * time.Second)
+                       dispatcher.Arv.Update("containers", container.UUID,
                                arvadosclient.Dict{
-                                       "container": arvadosclient.Dict{"state": "Complete"}},
+                                       "container": arvadosclient.Dict{"priority": 0}},
                                nil)
-               }()
-               return exec.Command("echo", "strigger")
+               })
+       c.Check(container.State, Equals, "Cancelled")
+       c.Check(scancelCmdLine, DeepEquals, []string{"scancel", "--name=zzzzz-dz642-queuedcontainer"})
+}
+
+func (s *TestSuite) TestIntegrationMissingFromSqueue(c *C) {
+       container := s.integrationTest(c, func() *exec.Cmd { return exec.Command("echo") }, []string{"sbatch", "--share", "--parsable",
+               fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
+               fmt.Sprintf("--mem-per-cpu=%d", 2862),
+               fmt.Sprintf("--cpus-per-task=%d", 4),
+               fmt.Sprintf("--priority=%d", 1)},
+               func(dispatcher *dispatch.Dispatcher, container dispatch.Container) {
+                       dispatcher.UpdateState(container.UUID, dispatch.Running)
+                       time.Sleep(3 * time.Second)
+                       dispatcher.UpdateState(container.UUID, dispatch.Complete)
+               })
+       c.Check(container.State, Equals, "Cancelled")
+}
+
+func (s *TestSuite) integrationTest(c *C,
+       newSqueueCmd func() *exec.Cmd,
+       sbatchCmdComps []string,
+       runContainer func(*dispatch.Dispatcher, dispatch.Container)) dispatch.Container {
+       arvadostest.ResetEnv()
+
+       arv, err := arvadosclient.MakeArvadosClient()
+       c.Assert(err, IsNil)
+
+       var sbatchCmdLine []string
+
+       // Override sbatchCmd
+       defer func(orig func(dispatch.Container) *exec.Cmd) {
+               sbatchCmd = orig
+       }(sbatchCmd)
+       sbatchCmd = func(container dispatch.Container) *exec.Cmd {
+               sbatchCmdLine = sbatchFunc(container).Args
+               return exec.Command("sh")
        }
 
-       go func() {
-               time.Sleep(8 * time.Second)
-               sigChan <- syscall.SIGINT
-       }()
+       // Override squeueCmd
+       defer func(orig func() *exec.Cmd) {
+               squeueCmd = orig
+       }(squeueCmd)
+       squeueCmd = newSqueueCmd
 
        // There should be no queued containers now
        params := arvadosclient.Dict{
                "filters": [][]string{[]string{"state", "=", "Queued"}},
        }
-       var containers ContainerList
-       err := arv.List("containers", params, &containers)
+       var containers dispatch.ContainerList
+       err = arv.List("containers", params, &containers)
        c.Check(err, IsNil)
        c.Check(len(containers.Items), Equals, 1)
 
-       err = doMain()
-       c.Check(err, IsNil)
+       echo := "echo"
+       crunchRunCommand = &echo
+
+       doneProcessing := make(chan struct{})
+       dispatcher := dispatch.Dispatcher{
+               Arv:          arv,
+               PollInterval: time.Duration(1) * time.Second,
+               RunContainer: func(dispatcher *dispatch.Dispatcher,
+                       container dispatch.Container,
+                       status chan dispatch.Container) {
+                       go runContainer(dispatcher, container)
+                       run(dispatcher, container, status)
+                       doneProcessing <- struct{}{}
+               },
+               DoneProcessing: doneProcessing}
+
+       squeueUpdater.StartMonitor(time.Duration(500) * time.Millisecond)
 
-       c.Check(sbatchCmdLine, DeepEquals, []string{"sbatch", "--job-name=zzzzz-dz642-queuedcontainer", "--share", "--parsable"})
-       c.Check(striggerCmdLine, DeepEquals, []string{"strigger", "--set", "--jobid=zzzzz-dz642-queuedcontainer\n", "--fini",
-               "--program=/usr/bin/crunch-finish-slurm.sh " + os.Getenv("ARVADOS_API_HOST") + " 4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h 1 zzzzz-dz642-queuedcontainer"})
+       err = dispatcher.RunDispatcher()
+       c.Assert(err, IsNil)
+
+       squeueUpdater.Done()
+
+       c.Check(sbatchCmdLine, DeepEquals, sbatchCmdComps)
 
        // There should be no queued containers now
        err = arv.List("containers", params, &containers)
@@ -121,17 +173,18 @@ func (s *TestSuite) Test_doMain(c *C) {
        c.Check(len(containers.Items), Equals, 0)
 
        // Previously "Queued" container should now be in "Complete" state
-       var container Container
+       var container dispatch.Container
        err = arv.Get("containers", "zzzzz-dz642-queuedcontainer", nil, &container)
        c.Check(err, IsNil)
-       c.Check(container.State, Equals, "Complete")
+       return container
 }
 
 func (s *MockArvadosServerSuite) Test_APIErrorGettingContainers(c *C) {
        apiStubResponses := make(map[string]arvadostest.StubResponse)
+       apiStubResponses["/arvados/v1/api_client_authorizations/current"] = arvadostest.StubResponse{200, `{"uuid":"` + arvadostest.Dispatch1AuthUUID + `"}`}
        apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
 
-       testWithServerStub(c, apiStubResponses, "echo", "Error getting list of queued containers")
+       testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
 }
 
 func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
@@ -140,7 +193,7 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
        api := httptest.NewServer(&apiStub)
        defer api.Close()
 
-       arv = arvadosclient.ArvadosClient{
+       arv := arvadosclient.ArvadosClient{
                Scheme:    "http",
                ApiServer: api.URL[7:],
                ApiToken:  "abc123",
@@ -148,18 +201,38 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
                Retries:   0,
        }
 
-       tempfile, err := ioutil.TempFile(os.TempDir(), "temp-log-file")
-       c.Check(err, IsNil)
-       defer os.Remove(tempfile.Name())
-       log.SetOutput(tempfile)
+       buf := bytes.NewBuffer(nil)
+       log.SetOutput(io.MultiWriter(buf, os.Stderr))
+       defer log.SetOutput(os.Stderr)
+
+       crunchRunCommand = &crunchCmd
+
+       doneProcessing := make(chan struct{})
+       dispatcher := dispatch.Dispatcher{
+               Arv:          arv,
+               PollInterval: time.Duration(1) * time.Second,
+               RunContainer: func(dispatcher *dispatch.Dispatcher,
+                       container dispatch.Container,
+                       status chan dispatch.Container) {
+                       go func() {
+                               time.Sleep(1 * time.Second)
+                               dispatcher.UpdateState(container.UUID, dispatch.Running)
+                               dispatcher.UpdateState(container.UUID, dispatch.Complete)
+                       }()
+                       run(dispatcher, container, status)
+                       doneProcessing <- struct{}{}
+               },
+               DoneProcessing: doneProcessing}
 
        go func() {
-               time.Sleep(2 * time.Second)
-               sigChan <- syscall.SIGTERM
+               for i := 0; i < 80 && !strings.Contains(buf.String(), expected); i++ {
+                       time.Sleep(100 * time.Millisecond)
+               }
+               dispatcher.DoneProcessing <- struct{}{}
        }()
 
-       runQueuedContainers(2, 1, crunchCmd, crunchCmd)
+       err := dispatcher.RunDispatcher()
+       c.Assert(err, IsNil)
 
-       buf, _ := ioutil.ReadFile(tempfile.Name())
-       c.Check(strings.Contains(string(buf), expected), Equals, true)
+       c.Check(buf.String(), Matches, `(?ms).*`+expected+`.*`)
 }
diff --git a/services/crunch-dispatch-slurm/crunch-finish-slurm.sh b/services/crunch-dispatch-slurm/crunch-finish-slurm.sh
deleted file mode 100755 (executable)
index 95a37ba..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/sh
-
-# Script to be called by strigger when a job finishes.  This ensures the job
-# record has the correct state "Complete" even if the node running the job
-# failed.
-
-ARVADOS_API_HOST=$1
-ARVADOS_API_TOKEN=$2
-ARVADOS_API_HOST_INSECURE=$3
-uuid=$4
-jobid=$5
-
-# If it is possible to attach metadata to job records we could look up the
-# above information instead of getting it on the command line.  For example,
-# this is the recipe for getting the job name (container uuid) from the job id.
-#uuid=$(squeue --jobs=$jobid --states=all --format=%j --noheader)
-
-export ARVADOS_API_HOST ARVADOS_API_TOKEN ARVADOS_API_HOST_INSECURE
-
-exec arv container update --uuid $uuid --container '{"state": "Complete"}'
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
new file mode 100644 (file)
index 0000000..34e6632
--- /dev/null
@@ -0,0 +1,114 @@
+package main
+
+import (
+       "bufio"
+       "log"
+       "os/exec"
+       "sync"
+       "time"
+)
+
+// Squeue implements an asynchronous polling monitor of the SLURM queue using
+// the command 'squeue'.
+type Squeue struct {
+       squeueContents []string
+       squeueDone     chan struct{}
+       squeueCond     *sync.Cond
+       SlurmLock      sync.Mutex
+}
+
+// squeueFunc returns the command used to poll the SLURM queue for job names.
+func squeueFunc() *exec.Cmd {
+       return exec.Command("squeue", "--format=%j")
+}
+
+var squeueCmd = squeueFunc
+
+// RunSqueue runs squeue once and captures the output.  If it succeeds, it sets
+// "squeueContents" and then wakes up any goroutines waiting on squeueCond in
+// CheckSqueue().  On error, it logs the problem and leaves the waiters blocked.
+func (squeue *Squeue) RunSqueue() {
+       var newSqueueContents []string
+
+       // Mutex between squeue sync and running sbatch or scancel.  This
+       // establishes a sequence so that squeue doesn't run concurrently with
+       // sbatch or scancel; the next update of squeue will occur only after
+       // sbatch or scancel has completed.
+       squeue.SlurmLock.Lock()
+       defer squeue.SlurmLock.Unlock()
+
+       // Also ensure unlock on all return paths
+
+       cmd := squeueCmd()
+       sq, err := cmd.StdoutPipe()
+       if err != nil {
+               log.Printf("Error creating stdout pipe for squeue: %v", err)
+               return
+       }
+       cmd.Start()
+       scanner := bufio.NewScanner(sq)
+       for scanner.Scan() {
+               newSqueueContents = append(newSqueueContents, scanner.Text())
+       }
+       if err := scanner.Err(); err != nil {
+               cmd.Wait()
+               log.Printf("Error reading from squeue pipe: %v", err)
+               return
+       }
+
+       err = cmd.Wait()
+       if err != nil {
+               log.Printf("Error running squeue: %v", err)
+               return
+       }
+
+       squeue.squeueCond.L.Lock()
+       squeue.squeueContents = newSqueueContents
+       squeue.squeueCond.Broadcast()
+       squeue.squeueCond.L.Unlock()
+}
+
+// CheckSqueue checks whether a given container UUID is in the slurm queue.  It
+// does not run squeue directly; instead it blocks until woken up by the next
+// successful update of squeue.
+func (squeue *Squeue) CheckSqueue(uuid string) bool {
+       squeue.squeueCond.L.Lock()
+       // block until next squeue broadcast signaling an update.
+       squeue.squeueCond.Wait()
+       contents := squeue.squeueContents
+       squeue.squeueCond.L.Unlock()
+
+       for _, k := range contents {
+               if k == uuid {
+                       return true
+               }
+       }
+       return false
+}
+
+// StartMonitor starts the squeue monitoring goroutine.
+func (squeue *Squeue) StartMonitor(pollInterval time.Duration) {
+       squeue.squeueCond = sync.NewCond(&sync.Mutex{})
+       squeue.squeueDone = make(chan struct{})
+       go squeue.SyncSqueue(pollInterval)
+}
+
+// Done stops the squeue monitoring goroutine.
+func (squeue *Squeue) Done() {
+       squeue.squeueDone <- struct{}{}
+       close(squeue.squeueDone)
+}
+
+// SyncSqueue calls RunSqueue() periodically at the given interval until
+// terminated by calling Done().
+func (squeue *Squeue) SyncSqueue(pollInterval time.Duration) {
+       ticker := time.NewTicker(pollInterval)
+       for {
+               select {
+               case <-squeue.squeueDone:
+                       return
+               case <-ticker.C:
+                       squeue.RunSqueue()
+               }
+       }
+}
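
Note (not part of the patch): as a usage sketch, assuming only the Squeue API shown above plus the surrounding package's log and time imports, this is the lifecycle the dispatcher and its tests follow: start one background monitor, let per-container goroutines block on CheckSqueue() for a fresh answer, and stop the monitor when dispatching ends.

    // Sketch only; squeueUpdater is the package-level Squeue variable used above.
    func exampleSqueueLifecycle(uuid string) {
            squeueUpdater.StartMonitor(500 * time.Millisecond)
            defer squeueUpdater.Done()

            // CheckSqueue blocks until the next successful squeue poll completes,
            // then reports whether the given container UUID was listed in it.
            if squeueUpdater.CheckSqueue(uuid) {
                    log.Printf("container %s is in the slurm queue", uuid)
            } else {
                    log.Printf("container %s is not in the slurm queue", uuid)
            }
    }

The sync.Cond broadcast means every waiting container goroutine sees each squeue snapshot at most once per poll, without each goroutine shelling out to squeue on its own.
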
index 01edb0a516fadd33c175df030c7c4c330b2985ba..f55834566d7f2970ae1986ccf842364d91cb5fb4 100644 (file)
@@ -15,6 +15,7 @@ import (
        "os"
        "os/exec"
        "os/signal"
+       "path"
        "strings"
        "sync"
        "syscall"
@@ -26,6 +27,7 @@ type IArvadosClient interface {
        Create(resourceType string, parameters arvadosclient.Dict, output interface{}) error
        Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
        Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) (err error)
+       Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) (err error)
 }
 
 // ErrCancelled is the error returned when the container is cancelled.
@@ -44,6 +46,7 @@ type Mount struct {
        PortableDataHash string `json:"portable_data_hash"`
        UUID             string `json:"uuid"`
        DeviceType       string `json:"device_type"`
+       Path             string `json:"path"`
 }
 
 // Collection record returned by the API server.
@@ -52,25 +55,35 @@ type CollectionRecord struct {
        PortableDataHash string `json:"portable_data_hash"`
 }
 
+type RuntimeConstraints struct {
+       API *bool
+}
+
 // ContainerRecord is the container record returned by the API server.
 type ContainerRecord struct {
-       UUID               string                 `json:"uuid"`
-       Command            []string               `json:"command"`
-       ContainerImage     string                 `json:"container_image"`
-       Cwd                string                 `json:"cwd"`
-       Environment        map[string]string      `json:"environment"`
-       Mounts             map[string]Mount       `json:"mounts"`
-       OutputPath         string                 `json:"output_path"`
-       Priority           int                    `json:"priority"`
-       RuntimeConstraints map[string]interface{} `json:"runtime_constraints"`
-       State              string                 `json:"state"`
-       Output             string                 `json:"output"`
+       UUID               string             `json:"uuid"`
+       Command            []string           `json:"command"`
+       ContainerImage     string             `json:"container_image"`
+       Cwd                string             `json:"cwd"`
+       Environment        map[string]string  `json:"environment"`
+       Mounts             map[string]Mount   `json:"mounts"`
+       OutputPath         string             `json:"output_path"`
+       Priority           int                `json:"priority"`
+       RuntimeConstraints RuntimeConstraints `json:"runtime_constraints"`
+       State              string             `json:"state"`
+       Output             string             `json:"output"`
+}
+
+// APIClientAuthorization is an arvados#api_client_authorization resource.
+type APIClientAuthorization struct {
+       UUID     string `json:"uuid"`
+       APIToken string `json:"api_token"`
 }
 
 // NewLogWriter is a factory function to create a new log writer.
 type NewLogWriter func(name string) io.WriteCloser
 
-type RunArvMount func([]string) (*exec.Cmd, error)
+type RunArvMount func(args []string, tok string) (*exec.Cmd, error)
 
 type MkTempDir func(string, string) (string, error)
 
@@ -94,12 +107,14 @@ type ContainerRunner struct {
        Kc        IKeepClient
        ContainerRecord
        dockerclient.ContainerConfig
+       dockerclient.HostConfig
+       token       string
        ContainerID string
        ExitCode    *int
        NewLogWriter
        loggingDone   chan bool
        CrunchLog     *ThrottledLogger
-       Stdout        *ThrottledLogger
+       Stdout        io.WriteCloser
        Stderr        *ThrottledLogger
        LogCollection *CollectionWriter
        LogsPDH       *string
@@ -187,8 +202,19 @@ func (runner *ContainerRunner) LoadImage() (err error) {
        return nil
 }
 
-func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string) (c *exec.Cmd, err error) {
+func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string, token string) (c *exec.Cmd, err error) {
        c = exec.Command("arv-mount", arvMountCmd...)
+
+       // Copy our environment, but override ARVADOS_API_TOKEN with
+       // the container auth token.
+       c.Env = nil
+       for _, s := range os.Environ() {
+               if !strings.HasPrefix(s, "ARVADOS_API_TOKEN=") {
+                       c.Env = append(c.Env, s)
+               }
+       }
+       c.Env = append(c.Env, "ARVADOS_API_TOKEN="+token)
+
        nt := NewThrottledLogger(runner.NewLogWriter("arv-mount"))
        c.Stdout = nt
        c.Stderr = nt
@@ -246,6 +272,22 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
        runner.Binds = nil
 
        for bind, mnt := range runner.ContainerRecord.Mounts {
+               if bind == "stdout" {
+                       // Is it a "file" mount kind?
+                       if mnt.Kind != "file" {
+                               return fmt.Errorf("Unsupported mount kind '%s' for stdout. Only 'file' is supported.", mnt.Kind)
+                       }
+
+                       // Does path start with OutputPath?
+                       prefix := runner.ContainerRecord.OutputPath
+                       if !strings.HasSuffix(prefix, "/") {
+                               prefix += "/"
+                       }
+                       if !strings.HasPrefix(mnt.Path, prefix) {
+                               return fmt.Errorf("Stdout path does not start with OutputPath: %s, %s", mnt.Path, prefix)
+                       }
+               }
+
                if mnt.Kind == "collection" {
                        var src string
                        if mnt.UUID != "" && mnt.PortableDataHash != "" {
@@ -296,8 +338,6 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                        } else {
                                runner.Binds = append(runner.Binds, bind)
                        }
-               } else {
-                       return fmt.Errorf("Unknown mount kind '%s'", mnt.Kind)
                }
        }
 
@@ -312,7 +352,12 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
        }
        arvMountCmd = append(arvMountCmd, runner.ArvMountPoint)
 
-       runner.ArvMount, err = runner.RunArvMount(arvMountCmd)
+       token, err := runner.ContainerToken()
+       if err != nil {
+               return fmt.Errorf("could not get container token: %s", err)
+       }
+
+       runner.ArvMount, err = runner.RunArvMount(arvMountCmd, token)
        if err != nil {
                return fmt.Errorf("While trying to start arv-mount: %v", err)
        }
@@ -383,7 +428,31 @@ func (runner *ContainerRunner) AttachStreams() (err error) {
 
        runner.loggingDone = make(chan bool)
 
-       runner.Stdout = NewThrottledLogger(runner.NewLogWriter("stdout"))
+       if stdoutMnt, ok := runner.ContainerRecord.Mounts["stdout"]; ok {
+               stdoutPath := stdoutMnt.Path[len(runner.ContainerRecord.OutputPath):]
+               index := strings.LastIndex(stdoutPath, "/")
+               if index > 0 {
+                       subdirs := stdoutPath[:index]
+                       if subdirs != "" {
+                               st, err := os.Stat(runner.HostOutputDir)
+                               if err != nil {
+                                       return fmt.Errorf("While Stat on temp dir: %v", err)
+                               }
+                               stdoutPath := path.Join(runner.HostOutputDir, subdirs)
+                               err = os.MkdirAll(stdoutPath, st.Mode()|os.ModeSetgid|0777)
+                               if err != nil {
+                                       return fmt.Errorf("While MkdirAll %q: %v", stdoutPath, err)
+                               }
+                       }
+               }
+               stdoutFile, err := os.Create(path.Join(runner.HostOutputDir, stdoutPath))
+               if err != nil {
+                       return fmt.Errorf("While creating stdout file: %v", err)
+               }
+               runner.Stdout = stdoutFile
+       } else {
+               runner.Stdout = NewThrottledLogger(runner.NewLogWriter("stdout"))
+       }
        runner.Stderr = NewThrottledLogger(runner.NewLogWriter("stderr"))
 
        go runner.ProcessDockerAttach(containerReader)
@@ -391,43 +460,51 @@ func (runner *ContainerRunner) AttachStreams() (err error) {
        return nil
 }
 
-// StartContainer creates the container and runs it.
-func (runner *ContainerRunner) StartContainer() (err error) {
+// CreateContainer creates the docker container.
+func (runner *ContainerRunner) CreateContainer() error {
        runner.CrunchLog.Print("Creating Docker container")
 
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-
-       if runner.Cancelled {
-               return ErrCancelled
-       }
-
        runner.ContainerConfig.Cmd = runner.ContainerRecord.Command
        if runner.ContainerRecord.Cwd != "." {
                runner.ContainerConfig.WorkingDir = runner.ContainerRecord.Cwd
        }
+
        for k, v := range runner.ContainerRecord.Environment {
                runner.ContainerConfig.Env = append(runner.ContainerConfig.Env, k+"="+v)
        }
+       if wantAPI := runner.ContainerRecord.RuntimeConstraints.API; wantAPI != nil && *wantAPI {
+               tok, err := runner.ContainerToken()
+               if err != nil {
+                       return err
+               }
+               runner.ContainerConfig.Env = append(runner.ContainerConfig.Env,
+                       "ARVADOS_API_TOKEN="+tok,
+                       "ARVADOS_API_HOST="+os.Getenv("ARVADOS_API_HOST"),
+                       "ARVADOS_API_HOST_INSECURE="+os.Getenv("ARVADOS_API_HOST_INSECURE"),
+               )
+       }
+
        runner.ContainerConfig.NetworkDisabled = true
+
+       var err error
        runner.ContainerID, err = runner.Docker.CreateContainer(&runner.ContainerConfig, "", nil)
        if err != nil {
                return fmt.Errorf("While creating container: %v", err)
        }
-       hostConfig := &dockerclient.HostConfig{Binds: runner.Binds,
+
+       runner.HostConfig = dockerclient.HostConfig{Binds: runner.Binds,
                LogConfig: dockerclient.LogConfig{Type: "none"}}
 
-       err = runner.AttachStreams()
-       if err != nil {
-               return err
-       }
+       return runner.AttachStreams()
+}
 
+// StartContainer starts the docker container created by CreateContainer.
+func (runner *ContainerRunner) StartContainer() error {
        runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
-       err = runner.Docker.StartContainer(runner.ContainerID, hostConfig)
+       err := runner.Docker.StartContainer(runner.ContainerID, &runner.HostConfig)
        if err != nil {
-               return fmt.Errorf("While starting container: %v", err)
+               return fmt.Errorf("could not start container: %v", err)
        }
-
        return nil
 }
 
@@ -541,6 +618,15 @@ func (runner *ContainerRunner) CommitLogs() error {
        runner.CrunchLog = NewThrottledLogger(&ArvLogWriter{runner.ArvClient, runner.ContainerRecord.UUID,
                "crunch-run", nil})
 
+       if runner.LogsPDH != nil {
+               // If we have already assigned something to LogsPDH,
+               // we must be closing the re-opened log, which won't
+               // end up getting attached to the container record and
+               // therefore doesn't need to be saved as a collection
+               // -- it exists only to send logs to other channels.
+               return nil
+       }
+
        mt, err := runner.LogCollection.ManifestText()
        if err != nil {
                return fmt.Errorf("While creating log manifest: %v", err)
@@ -557,37 +643,64 @@ func (runner *ContainerRunner) CommitLogs() error {
                return fmt.Errorf("While creating log collection: %v", err)
        }
 
-       runner.LogsPDH = new(string)
-       *runner.LogsPDH = response.PortableDataHash
+       runner.LogsPDH = &response.PortableDataHash
 
        return nil
 }
 
 // UpdateContainerRecordRunning updates the container state to "Running"
 func (runner *ContainerRunner) UpdateContainerRecordRunning() error {
+       runner.CancelLock.Lock()
+       defer runner.CancelLock.Unlock()
+       if runner.Cancelled {
+               return ErrCancelled
+       }
        return runner.ArvClient.Update("containers", runner.ContainerRecord.UUID,
                arvadosclient.Dict{"container": arvadosclient.Dict{"state": "Running"}}, nil)
 }
 
-// UpdateContainerRecordComplete updates the container record state on API
-// server to "Complete" or "Cancelled"
-func (runner *ContainerRunner) UpdateContainerRecordComplete() error {
-       update := arvadosclient.Dict{}
-       if runner.LogsPDH != nil {
-               update["log"] = *runner.LogsPDH
+// ContainerToken returns the api_token the container (and any
+// arv-mount processes) are allowed to use.
+func (runner *ContainerRunner) ContainerToken() (string, error) {
+       if runner.token != "" {
+               return runner.token, nil
        }
-       if runner.ExitCode != nil {
-               update["exit_code"] = *runner.ExitCode
-       }
-       if runner.OutputPDH != nil {
-               update["output"] = runner.OutputPDH
+
+       var auth APIClientAuthorization
+       err := runner.ArvClient.Call("GET", "containers", runner.ContainerRecord.UUID, "auth", nil, &auth)
+       if err != nil {
+               return "", err
        }
+       runner.token = auth.APIToken
+       return runner.token, nil
+}
 
+// UpdateContainerRecordFinal updates the container record state on the API
+// server to "Complete" or "Cancelled"
+func (runner *ContainerRunner) UpdateContainerRecordFinal() error {
+       update := arvadosclient.Dict{}
        update["state"] = runner.finalState
-
+       if runner.finalState == "Complete" {
+               if runner.LogsPDH != nil {
+                       update["log"] = *runner.LogsPDH
+               }
+               if runner.ExitCode != nil {
+                       update["exit_code"] = *runner.ExitCode
+               }
+               if runner.OutputPDH != nil {
+                       update["output"] = *runner.OutputPDH
+               }
+       }
        return runner.ArvClient.Update("containers", runner.ContainerRecord.UUID, arvadosclient.Dict{"container": update}, nil)
 }
 
+// IsCancelled returns the value of Cancelled, with goroutine safety.
+func (runner *ContainerRunner) IsCancelled() bool {
+       runner.CancelLock.Lock()
+       defer runner.CancelLock.Unlock()
+       return runner.Cancelled
+}
+
 // NewArvLogWriter creates an ArvLogWriter
 func (runner *ContainerRunner) NewArvLogWriter(name string) io.WriteCloser {
        return &ArvLogWriter{runner.ArvClient, runner.ContainerRecord.UUID, name, runner.LogCollection.Open(name + ".txt")}
@@ -604,93 +717,99 @@ func (runner *ContainerRunner) Run() (err error) {
                runner.CrunchLog.Printf("Executing on host '%s'", hostname)
        }
 
-       var runerr, waiterr error
+       // Clean up temporary directories _after_ finalizing
+       // everything (if we've made any by then)
+       defer runner.CleanupDirs()
+
+       runner.finalState = "Queued"
 
        defer func() {
-               if err != nil {
-                       runner.CrunchLog.Print(err)
+               // checkErr prints e (unless it's nil) and sets err to
+               // e (unless err is already non-nil). Thus, if err
+               // hasn't already been assigned when Run() returns,
+               // this cleanup func will cause Run() to return the
+               // first non-nil error that is passed to checkErr().
+               checkErr := func(e error) {
+                       if e == nil {
+                               return
+                       }
+                       runner.CrunchLog.Print(e)
+                       if err == nil {
+                               err = e
+                       }
                }
 
-               if runner.Cancelled {
-                       runner.finalState = "Cancelled"
-               } else {
-                       runner.finalState = "Complete"
-               }
+               // Log the error encountered in Run(), if any
+               checkErr(err)
 
-               // (6) capture output
-               outputerr := runner.CaptureOutput()
-               if outputerr != nil {
-                       runner.CrunchLog.Print(outputerr)
+               if runner.finalState == "Queued" {
+                       runner.UpdateContainerRecordFinal()
+                       return
                }
 
-               // (7) clean up temporary directories
-               runner.CleanupDirs()
-
-               // (8) write logs
-               logerr := runner.CommitLogs()
-               if logerr != nil {
-                       runner.CrunchLog.Print(logerr)
+               if runner.IsCancelled() {
+                       runner.finalState = "Cancelled"
+                       // but don't return yet -- we still want to
+                       // capture partial output and write logs
                }
 
-               // (9) update container record with results
-               updateerr := runner.UpdateContainerRecordComplete()
-               if updateerr != nil {
-                       runner.CrunchLog.Print(updateerr)
-               }
+               checkErr(runner.CaptureOutput())
+               checkErr(runner.CommitLogs())
+               checkErr(runner.UpdateContainerRecordFinal())
 
+               // The real log is already closed, but then we opened
+               // a new one in case we needed to log anything while
+               // finalizing.
                runner.CrunchLog.Close()
-
-               if err == nil {
-                       if runerr != nil {
-                               err = runerr
-                       } else if waiterr != nil {
-                               err = waiterr
-                       } else if logerr != nil {
-                               err = logerr
-                       } else if updateerr != nil {
-                               err = updateerr
-                       }
-               }
        }()
 
        err = runner.ArvClient.Get("containers", runner.ContainerRecord.UUID, nil, &runner.ContainerRecord)
        if err != nil {
-               return fmt.Errorf("While getting container record: %v", err)
+               err = fmt.Errorf("While getting container record: %v", err)
+               return
        }
 
-       // (1) setup signal handling
+       // setup signal handling
        runner.SetupSignals()
 
-       // (2) check for and/or load image
+       // check for and/or load image
        err = runner.LoadImage()
        if err != nil {
-               return fmt.Errorf("While loading container image: %v", err)
+               err = fmt.Errorf("While loading container image: %v", err)
+               return
        }
 
-       // (3) set up FUSE mount and binds
+       // set up FUSE mount and binds
        err = runner.SetupMounts()
        if err != nil {
-               return fmt.Errorf("While setting up mounts: %v", err)
+               err = fmt.Errorf("While setting up mounts: %v", err)
+               return
        }
 
-       // (3) create and start container
-       err = runner.StartContainer()
+       err = runner.CreateContainer()
        if err != nil {
-               if err == ErrCancelled {
-                       err = nil
-               }
                return
        }
 
-       // (4) update container record state
+       if runner.IsCancelled() {
+               return
+       }
+
        err = runner.UpdateContainerRecordRunning()
        if err != nil {
-               runner.CrunchLog.Print(err)
+               return
        }
+       runner.finalState = "Cancelled"
 
-       // (5) wait for container to finish
-       waiterr = runner.WaitFinish()
+       err = runner.StartContainer()
+       if err != nil {
+               return
+       }
 
+       err = runner.WaitFinish()
+       if err == nil {
+               runner.finalState = "Complete"
+       }
        return
 }
 
index 659b3c0ede524a31af3ada93369fcc6cab808e2a..998c4bc17a16f8f932c4ae7adf2469222a0eab4e 100644 (file)
@@ -18,6 +18,7 @@ import (
        "os/exec"
        "sort"
        "strings"
+       "sync"
        "syscall"
        "testing"
        "time"
@@ -40,6 +41,7 @@ type ArvTestClient struct {
        ContainerRecord
        Logs          map[string]*bytes.Buffer
        WasSetRunning bool
+       sync.Mutex
 }
 
 type KeepTestClient struct {
@@ -54,6 +56,9 @@ var hwImageId = "9c31ee32b3d15268a0754e8edc74d4f815ee014b693bc5109058e431dd5caea
 var otherManifest = ". 68a84f561b1d1708c6baff5e019a9ab3+46+Ae5d0af96944a3690becb1decdf60cc1c937f556d@5693216f 0:46:md5sum.txt\n"
 var otherPDH = "a3e8f74c6f101eae01fa08bfb4e49b3a+54"
 
+var fakeAuthUUID = "zzzzz-gj3su-55pqoyepgi2glem"
+var fakeAuthToken = "a3ltuwzqcu2u4sc0q7yhpc2w7s00fdcqecg5d6e0u3pfohmbjt"
+
 type TestDockerClient struct {
        imageLoaded string
        logReader   io.ReadCloser
@@ -130,6 +135,9 @@ func (this *ArvTestClient) Create(resourceType string,
        parameters arvadosclient.Dict,
        output interface{}) error {
 
+       this.Mutex.Lock()
+       defer this.Mutex.Unlock()
+
        this.Calls += 1
        this.Content = append(this.Content, parameters)
 
@@ -153,6 +161,19 @@ func (this *ArvTestClient) Create(resourceType string,
        return nil
 }
 
+func (this *ArvTestClient) Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) error {
+       switch {
+       case method == "GET" && resourceType == "containers" && action == "auth":
+               return json.Unmarshal([]byte(`{
+                       "kind": "arvados#api_client_authorization",
+                       "uuid": "`+fakeAuthUUID+`",
+                       "api_token": "`+fakeAuthToken+`"
+                       }`), output)
+       default:
+               return fmt.Errorf("Not found")
+       }
+}
+
 func (this *ArvTestClient) Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error {
        if resourceType == "collections" {
                if uuid == hwPDH {
@@ -168,13 +189,35 @@ func (this *ArvTestClient) Get(resourceType string, uuid string, parameters arva
 }
 
 func (this *ArvTestClient) Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) (err error) {
+       this.Mutex.Lock()
+       defer this.Mutex.Unlock()
        this.Calls += 1
        this.Content = append(this.Content, parameters)
        if resourceType == "containers" {
                if parameters["container"].(arvadosclient.Dict)["state"] == "Running" {
                        this.WasSetRunning = true
                }
+       }
+       return nil
+}
 
+// CalledWith returns the parameters from the first API call whose
+// parameters match the given jpath and value. E.g., CalledWith("foo.bar",
+// "baz") returns the parameters where parameters["foo"]["bar"]=="baz". If
+// no call matches, it returns nil.
+func (this *ArvTestClient) CalledWith(jpath, expect string) arvadosclient.Dict {
+       call: for _, content := range this.Content {
+               var v interface{} = content
+               for _, k := range strings.Split(jpath, ".") {
+                       if dict, ok := v.(arvadosclient.Dict); !ok {
+                               continue call
+                       } else {
+                               v = dict[k]
+                       }
+               }
+               if v, ok := v.(string); ok && v == expect {
+                       return content
+               }
        }
        return nil
 }
@@ -252,6 +295,10 @@ func (this ArvErrorTestClient) Create(resourceType string,
        return nil
 }
 
+func (this ArvErrorTestClient) Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) error {
+       return errors.New("ArvError")
+}
+
 func (this ArvErrorTestClient) Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error {
        return errors.New("ArvError")
 }
@@ -376,6 +423,9 @@ func (s *TestSuite) TestRunContainer(c *C) {
        err := cr.LoadImage()
        c.Check(err, IsNil)
 
+       err = cr.CreateContainer()
+       c.Check(err, IsNil)
+
        err = cr.StartContainer()
        c.Check(err, IsNil)
 
@@ -428,7 +478,7 @@ func (s *TestSuite) TestUpdateContainerRecordComplete(c *C) {
        *cr.ExitCode = 42
        cr.finalState = "Complete"
 
-       err := cr.UpdateContainerRecordComplete()
+       err := cr.UpdateContainerRecordFinal()
        c.Check(err, IsNil)
 
        c.Check(api.Content[0]["container"].(arvadosclient.Dict)["log"], Equals, *cr.LogsPDH)
@@ -443,7 +493,7 @@ func (s *TestSuite) TestUpdateContainerRecordCancelled(c *C) {
        cr.Cancelled = true
        cr.finalState = "Cancelled"
 
-       err := cr.UpdateContainerRecordComplete()
+       err := cr.UpdateContainerRecordFinal()
        c.Check(err, IsNil)
 
        c.Check(api.Content[0]["container"].(arvadosclient.Dict)["log"], IsNil)
@@ -452,10 +502,10 @@ func (s *TestSuite) TestUpdateContainerRecordCancelled(c *C) {
 }
 
 // Used by the TestFullRun*() test below to DRY up boilerplate setup to do full
-// dress rehersal of the Run() function, starting from a JSON container record.
+// dress rehearsal of the Run() function, starting from a JSON container record.
 func FullRunHelper(c *C, record string, fn func(t *TestDockerClient)) (api *ArvTestClient, cr *ContainerRunner) {
        rec := ContainerRecord{}
-       err := json.NewDecoder(strings.NewReader(record)).Decode(&rec)
+       err := json.Unmarshal([]byte(record), &rec)
        c.Check(err, IsNil)
 
        docker := NewTestDockerClient()
@@ -524,7 +574,7 @@ func (s *TestSuite) TestFullRunStderr(c *C) {
                t.finish <- dockerclient.WaitResult{ExitCode: 1}
        })
 
-       c.Check(api.Calls, Equals, 8)
+       c.Assert(api.Calls, Equals, 8)
        c.Check(api.Content[7]["container"].(arvadosclient.Dict)["log"], NotNil)
        c.Check(api.Content[7]["container"].(arvadosclient.Dict)["exit_code"], Equals, 1)
        c.Check(api.Content[7]["container"].(arvadosclient.Dict)["state"], Equals, "Complete")
@@ -594,7 +644,7 @@ func (s *TestSuite) TestCancel(c *C) {
 }`
 
        rec := ContainerRecord{}
-       err := json.NewDecoder(strings.NewReader(record)).Decode(&rec)
+       err := json.Unmarshal([]byte(record), &rec)
        c.Check(err, IsNil)
 
        docker := NewTestDockerClient()
@@ -613,7 +663,7 @@ func (s *TestSuite) TestCancel(c *C) {
 
        go func() {
                for cr.ContainerID == "" {
-                       time.Sleep(1 * time.Second)
+                       time.Sleep(time.Millisecond)
                }
                cr.SigChan <- syscall.SIGINT
        }()
@@ -621,10 +671,6 @@ func (s *TestSuite) TestCancel(c *C) {
        err = cr.Run()
 
        c.Check(err, IsNil)
-
-       c.Check(api.Calls, Equals, 6)
-       c.Check(api.Content[5]["container"].(arvadosclient.Dict)["log"], NotNil)
-
        if err != nil {
                for k, v := range api.Logs {
                        c.Log(k)
@@ -632,8 +678,9 @@ func (s *TestSuite) TestCancel(c *C) {
                }
        }
 
+       c.Assert(api.Calls, Equals, 6)
+       c.Check(api.Content[5]["container"].(arvadosclient.Dict)["log"], IsNil)
        c.Check(api.Content[5]["container"].(arvadosclient.Dict)["state"], Equals, "Cancelled")
-
        c.Check(strings.HasSuffix(api.Logs["stdout"].String(), "foo\n"), Equals, true)
 
 }
@@ -662,11 +709,13 @@ func (s *TestSuite) TestFullRunSetEnv(c *C) {
 }
 
 type ArvMountCmdLine struct {
-       Cmd []string
+       Cmd   []string
+       token string
 }
 
-func (am *ArvMountCmdLine) ArvMountTest(c []string) (*exec.Cmd, error) {
+func (am *ArvMountCmdLine) ArvMountTest(c []string, token string) (*exec.Cmd, error) {
        am.Cmd = c
+       am.token = token
        return nil, nil
 }
 
@@ -732,3 +781,76 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                cr.CleanupDirs()
        }
 }
+
+func (s *TestSuite) TestStdout(c *C) {
+       helperRecord := `{`
+       helperRecord += `"command": ["/bin/sh", "-c", "echo $FROBIZ"],`
+       helperRecord += `"container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",`
+       helperRecord += `"cwd": "/bin",`
+       helperRecord += `"environment": {"FROBIZ": "bilbo"},`
+       helperRecord += `"mounts": {"/tmp": {"kind": "tmp"}, "stdout": {"kind": "file", "path": "/tmp/a/b/c.out"} },`
+       helperRecord += `"output_path": "/tmp",`
+       helperRecord += `"priority": 1,`
+       helperRecord += `"runtime_constraints": {}`
+       helperRecord += `}`
+
+       api, _ := FullRunHelper(c, helperRecord, func(t *TestDockerClient) {
+               t.logWriter.Write(dockerLog(1, t.env[0][7:]+"\n"))
+               t.logWriter.Close()
+               t.finish <- dockerclient.WaitResult{ExitCode: 0}
+       })
+
+       c.Assert(api.Calls, Equals, 6)
+       c.Check(api.Content[5]["container"].(arvadosclient.Dict)["exit_code"], Equals, 0)
+       c.Check(api.Content[5]["container"].(arvadosclient.Dict)["state"], Equals, "Complete")
+       c.Check(api.CalledWith("collection.manifest_text", "./a/b 307372fa8fd5c146b22ae7a45b49bc31+6 0:6:c.out\n"), Not(IsNil))
+}
+
+// Used by the TestStdoutWithWrongPath*()
+func StdoutErrorRunHelper(c *C, record string, fn func(t *TestDockerClient)) (api *ArvTestClient, cr *ContainerRunner, err error) {
+       rec := ContainerRecord{}
+       err = json.Unmarshal([]byte(record), &rec)
+       c.Check(err, IsNil)
+
+       docker := NewTestDockerClient()
+       docker.fn = fn
+       docker.RemoveImage(hwImageId, true)
+
+       api = &ArvTestClient{ContainerRecord: rec}
+       cr = NewContainerRunner(api, &KeepTestClient{}, docker, "zzzzz-zzzzz-zzzzzzzzzzzzzzz")
+       am := &ArvMountCmdLine{}
+       cr.RunArvMount = am.ArvMountTest
+
+       err = cr.Run()
+       return
+}
+
+func (s *TestSuite) TestStdoutWithWrongPath(c *C) {
+       _, _, err := StdoutErrorRunHelper(c, `{
+    "mounts": {"/tmp": {"kind": "tmp"}, "stdout": {"kind": "file", "path":"/tmpa.out"} },
+    "output_path": "/tmp"
+}`, func(t *TestDockerClient) {})
+
+       c.Check(err, NotNil)
+       c.Check(strings.Contains(err.Error(), "Stdout path does not start with OutputPath"), Equals, true)
+}
+
+func (s *TestSuite) TestStdoutWithWrongKindTmp(c *C) {
+       _, _, err := StdoutErrorRunHelper(c, `{
+    "mounts": {"/tmp": {"kind": "tmp"}, "stdout": {"kind": "tmp", "path":"/tmp/a.out"} },
+    "output_path": "/tmp"
+}`, func(t *TestDockerClient) {})
+
+       c.Check(err, NotNil)
+       c.Check(strings.Contains(err.Error(), "Unsupported mount kind 'tmp' for stdout"), Equals, true)
+}
+
+func (s *TestSuite) TestStdoutWithWrongKindCollection(c *C) {
+       _, _, err := StdoutErrorRunHelper(c, `{
+    "mounts": {"/tmp": {"kind": "tmp"}, "stdout": {"kind": "collection", "path":"/tmp/a.out"} },
+    "output_path": "/tmp"
+}`, func(t *TestDockerClient) {})
+
+       c.Check(err, NotNil)
+       c.Check(strings.Contains(err.Error(), "Unsupported mount kind 'collection' for stdout"), Equals, true)
+}
index 79214fca7dc4d3f5e0ac0b77475c6a0a5b27138d..bb3123a1025a810f0165219967161b6977d8f889 100644 (file)
@@ -4,6 +4,7 @@ import (
        "fmt"
        "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
        . "gopkg.in/check.v1"
+       "testing"
        "time"
 )
 
@@ -46,13 +47,16 @@ func (s *LoggingTestSuite) TestWriteLogs(c *C) {
 }
 
 func (s *LoggingTestSuite) TestWriteLogsLarge(c *C) {
+       if testing.Short() {
+               return
+       }
        api := &ArvTestClient{}
        kc := &KeepTestClient{}
        cr := NewContainerRunner(api, kc, nil, "zzzzz-zzzzzzzzzzzzzzz")
        cr.CrunchLog.Timestamper = (&TestTimestamper{}).Timestamp
        cr.CrunchLog.Immediate = nil
 
-       for i := 0; i < 2000000; i += 1 {
+       for i := 0; i < 2000000; i++ {
                cr.CrunchLog.Printf("Hello %d", i)
        }
        cr.CrunchLog.Print("Goodbye")
@@ -79,18 +83,21 @@ func (s *LoggingTestSuite) TestWriteMultipleLogs(c *C) {
        stdout.Print("Doing stuff")
        cr.CrunchLog.Print("Goodbye")
        stdout.Print("Blurb")
-
        cr.CrunchLog.Close()
-       logtext1 := "2015-12-29T15:51:45.000000001Z Hello world!\n" +
-               "2015-12-29T15:51:45.000000003Z Goodbye\n"
-       c.Check(api.Content[0]["log"].(arvadosclient.Dict)["event_type"], Equals, "crunch-run")
-       c.Check(api.Content[0]["log"].(arvadosclient.Dict)["properties"].(map[string]string)["text"], Equals, logtext1)
-
        stdout.Close()
-       logtext2 := "2015-12-29T15:51:45.000000002Z Doing stuff\n" +
-               "2015-12-29T15:51:45.000000004Z Blurb\n"
-       c.Check(api.Content[1]["log"].(arvadosclient.Dict)["event_type"], Equals, "stdout")
-       c.Check(api.Content[1]["log"].(arvadosclient.Dict)["properties"].(map[string]string)["text"], Equals, logtext2)
+
+       logText := make(map[string]string)
+       for _, content := range api.Content {
+               log := content["log"].(arvadosclient.Dict)
+               logText[log["event_type"].(string)] += log["properties"].(map[string]string)["text"]
+       }
+
+       c.Check(logText["crunch-run"], Equals, `2015-12-29T15:51:45.000000001Z Hello world!
+2015-12-29T15:51:45.000000003Z Goodbye
+`)
+       c.Check(logText["stdout"], Equals, `2015-12-29T15:51:45.000000002Z Doing stuff
+2015-12-29T15:51:45.000000004Z Blurb
+`)
 
        mt, err := cr.LogCollection.ManifestText()
        c.Check(err, IsNil)
index 9b7eb7543a4ebed086aba2d409f44fcc789ef222..55b3f61c4e5ee32bcff3fab7082fda7334f08be4 100644 (file)
@@ -49,11 +49,11 @@ type GetCollectionsParams struct {
 
 // SdkCollectionInfo holds collection info from api
 type SdkCollectionInfo struct {
-       UUID         string    `json:"uuid"`
-       OwnerUUID    string    `json:"owner_uuid"`
-       Redundancy   int       `json:"redundancy"`
-       ModifiedAt   time.Time `json:"modified_at"`
-       ManifestText string    `json:"manifest_text"`
+       UUID                 string    `json:"uuid"`
+       OwnerUUID            string    `json:"owner_uuid"`
+       ReplicationDesired   int       `json:"replication_desired"`
+       ModifiedAt           time.Time `json:"modified_at"`
+       ManifestText         string    `json:"manifest_text"`
 }
 
 // SdkCollectionList lists collections from api
@@ -125,14 +125,14 @@ func GetCollections(params GetCollectionsParams) (results ReadCollections, err e
        fieldsWanted := []string{"manifest_text",
                "owner_uuid",
                "uuid",
-               "redundancy",
+               "replication_desired",
                "modified_at"}
 
        sdkParams := arvadosclient.Dict{
                "select":  fieldsWanted,
                "order":   []string{"modified_at ASC", "uuid ASC"},
                "filters": [][]string{[]string{"modified_at", ">=", "1900-01-01T00:00:00Z"}},
-               "offset": 0}
+               "offset":  0}
 
        if params.BatchSize > 0 {
                sdkParams["limit"] = params.BatchSize
@@ -262,12 +262,12 @@ func GetCollections(params GetCollectionsParams) (results ReadCollections, err e
        }
        if totalCollections < finalNumberOfCollectionsAvailable {
                err = fmt.Errorf("API server indicates a total of %d collections "+
-                               "available up to %v, but we only retrieved %d. "+
-                               "Refusing to continue as this could indicate an "+
-                               "otherwise undetected failure.",
-                               finalNumberOfCollectionsAvailable, 
-                               sdkParams["filters"].([][]string)[0][2],
-                               totalCollections)
+                       "available up to %v, but we only retrieved %d. "+
+                       "Refusing to continue as this could indicate an "+
+                       "otherwise undetected failure.",
+                       finalNumberOfCollectionsAvailable,
+                       sdkParams["filters"].([][]string)[0][2],
+                       totalCollections)
                return
        }
 
@@ -297,7 +297,7 @@ func ProcessCollections(arvLogger *logger.Logger,
        for _, sdkCollection := range receivedCollections {
                collection := Collection{UUID: StrCopy(sdkCollection.UUID),
                        OwnerUUID:         StrCopy(sdkCollection.OwnerUUID),
-                       ReplicationLevel:  sdkCollection.Redundancy,
+                       ReplicationLevel:  sdkCollection.ReplicationDesired,
                        BlockDigestToSize: make(map[blockdigest.BlockDigest]int)}
 
                if sdkCollection.ModifiedAt.IsZero() {
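
The hunks above track the API rename from the old redundancy field to replication_desired: the struct field, its json tag, the select list, and the value copied into Collection.ReplicationLevel all change together. A small self-contained sketch of why the json tag matters when decoding the selected fields (collectionInfo is an illustrative struct, not the datamanager's SdkCollectionInfo; the field names are taken from the diff):

package main

import (
	"encoding/json"
	"fmt"
)

// The json tag is what ties the "replication_desired" field named in the
// API select list to the Go struct field, so the rename has to happen in
// both places at once.
type collectionInfo struct {
	UUID               string `json:"uuid"`
	ReplicationDesired int    `json:"replication_desired"`
}

func main() {
	rec := []byte(`{"uuid":"zzzzz-4zz18-ehbhgtheo8909or","replication_desired":2}`)
	var c collectionInfo
	if err := json.Unmarshal(rec, &c); err != nil {
		panic(err)
	}
	fmt.Println(c.UUID, c.ReplicationDesired)
}

If the select list asked for replication_desired but the tag still said redundancy, the field would silently decode to zero, which is why both sides of the mapping change in the same commit.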
index 8c655cd5ff68a981146493bfa21fd71693ba12c0..8111425d7af76a9fa55ca71a3b3620a1d0a095a1 100644 (file)
@@ -10,7 +10,7 @@ import (
        "time"
 )
 
-// Useful to call at the begining of execution to log info about the
+// Useful to call at the beginning of execution to log info about the
 // current run.
 func LogRunInfo(arvLogger *logger.Logger) {
        if arvLogger != nil {
index aac9aec9716a74c33f5c18a4ec23e2f9b0b8e212..d7fb3eb8f7cb3953d0f40e0e67626376f0d2941a 100644 (file)
@@ -71,7 +71,7 @@ func ComputePullServers(kc *keepclient.KeepClient,
        blockToDesiredReplication map[blockdigest.DigestWithSize]int,
        underReplicated BlockSet) (m map[Locator]PullServers) {
        m = map[Locator]PullServers{}
-       // We use CanonicalString to avoid filling memory with dupicate
+       // We use CanonicalString to avoid filling memory with duplicate
        // copies of the same string.
        var cs CanonicalString
 
index 1a9d0fd62eb70d667bc82b7e5a85a6f720ef1bb5..3ca9714066e23a1e3ad2c58d40691686da2b36e8 100644 (file)
@@ -25,7 +25,7 @@ setup(name="arvados-docker-cleaner",
           ('share/doc/arvados-docker-cleaner', ['agpl-3.0.txt']),
       ],
       install_requires=[
-        'docker-py',
+        'docker-py==1.7.2',
         ],
       tests_require=[
         'pbr<1.7.0',
index 196bb221e901e132d10db4f2bdbd7ed060f794e3..3f2bcd5ec2464fb5351a42be205b17f1fe93e49c 100644 (file)
@@ -320,6 +320,11 @@ class CollectionDirectoryBase(Directory):
         self.flush()
         src.flush()
 
+    def clear(self, force=False):
+        r = super(CollectionDirectoryBase, self).clear(force)
+        self.collection = None
+        return r
+
 
 class CollectionDirectory(CollectionDirectoryBase):
     """Represents the root of a directory tree representing a collection."""
index e731327dec5524432d0eb12c7d9dfc2b900fafd4..3f0e4932fddb181d84a17def278e21bd3035b6db 100644 (file)
@@ -95,6 +95,12 @@ class ObjectFile(StringFile):
         return self.object_uuid
 
     def update(self, obj=None):
+        if obj is None:
+            # TODO: retrieve the current record for self.object_uuid
+            # from the server. For now, at least don't crash when
+            # someone tells us it's a good time to update but doesn't
+            # pass us a fresh obj. See #8345
+            return
         self._mtime = convertTime(obj['modified_at']) if 'modified_at' in obj else 0
         self.contents = json.dumps(obj, indent=4, sort_keys=True) + "\n"
 
index fca1edf6bc25603beac62e6b88041cf481da43ee..d7e1a8afb302b26ae582bc5a3a5aaecc9514ae7c 100644 (file)
@@ -15,6 +15,11 @@ try:
 except ImportError:
     tagger = egg_info_cmd.egg_info
 
+short_tests_only = False
+if '--short-tests-only' in sys.argv:
+    short_tests_only = True
+    sys.argv.remove('--short-tests-only')
+
 setup(name='arvados_fuse',
       version='0.1',
       description='Arvados FUSE driver',
index c79daf80f54156b6e304839c01a66221217ae3c9..12395d7f951422c90a76bc1e474172524c598356 100644 (file)
@@ -36,6 +36,7 @@ class MountTestBase(unittest.TestCase):
         run_test_server.run()
         run_test_server.authorize_with("admin")
         self.api = api if api else arvados.safeapi.ThreadSafeApiCache(arvados.config.settings())
+        self.llfuse_thread = None
 
     # This is a copy of Mount's method.  TODO: Refactor MountTestBase
     # to use a Mount instead of copying its code.
@@ -67,12 +68,13 @@ class MountTestBase(unittest.TestCase):
         self.pool.join()
         del self.pool
 
-        subprocess.call(["fusermount", "-u", "-z", self.mounttmp])
-        self.llfuse_thread.join(timeout=1)
-        if self.llfuse_thread.is_alive():
-            logger.warning("MountTestBase.tearDown():"
-                           " llfuse thread still alive 1s after umount"
-                           " -- abandoning and exiting anyway")
+        if self.llfuse_thread:
+            subprocess.call(["fusermount", "-u", "-z", self.mounttmp])
+            self.llfuse_thread.join(timeout=1)
+            if self.llfuse_thread.is_alive():
+                logger.warning("MountTestBase.tearDown():"
+                               " llfuse thread still alive 1s after umount"
+                               " -- abandoning and exiting anyway")
 
         os.rmdir(self.mounttmp)
         if self.keeptmp:
index c4eadca0f8dc3d418f700733432dc8f6e0c1796e..a975db52b0361ac92c2c42945fa15cd2b30b7a35 100644 (file)
@@ -7,6 +7,7 @@ import sys
 import unittest
 from .. import run_test_server
 from ..mount_test_base import MountTestBase
+from ..slow_test import slow_test
 
 logger = logging.getLogger('arvados.arv-mount')
 
@@ -80,6 +81,7 @@ class CreateCollectionWithMultipleBlocksAndMoveAndDeleteFile(MountTestBase):
     def setUp(self):
         super(CreateCollectionWithMultipleBlocksAndMoveAndDeleteFile, self).setUp()
 
+    @slow_test
     def test_CreateCollectionWithManyBlocksAndMoveAndDeleteFile(self):
         collection = arvados.collection.Collection(api_client=self.api)
         collection.save_new()
@@ -215,6 +217,7 @@ class CreateCollectionWithManyFilesAndMoveAndDeleteFile(MountTestBase):
     def setUp(self):
         super(CreateCollectionWithManyFilesAndMoveAndDeleteFile, self).setUp()
 
+    @slow_test
     def test_CreateCollectionWithManyFilesAndMoveAndDeleteFile(self):
         collection = arvados.collection.Collection(api_client=self.api)
         collection.save_new()
@@ -327,6 +330,7 @@ class UsingMagicDir_CreateCollectionWithManyFilesAndMoveAndDeleteFile(MountTestB
             with open(os.path.join(self.mounttmp, collection, k)) as f:
                 self.assertEqual(v, f.read())
 
+    @slow_test
     def test_UsingMagicDirCreateCollectionWithManyFilesAndMoveAndDeleteFile(self):
         streams = 2
         files_per_stream = 200
@@ -382,6 +386,7 @@ class UsingMagicDir_CreateCollectionWithManyFilesAndMoveAllFilesIntoAnother(Moun
         collection.save_new()
         return collection
 
+    @slow_test
     def test_UsingMagicDirCreateCollectionWithManyFilesAndMoveAllFilesIntoAnother(self):
         streams = 2
         files_per_stream = 200
@@ -428,6 +433,7 @@ class UsingMagicDir_CreateCollectionWithManyFilesAndMoveEachFileIntoAnother(Moun
             self.pool.apply(magicDirTest_MoveFileFromCollection, (self.mounttmp, from_collection.manifest_locator(),
                   to_collection.manifest_locator(), 'stream0', 'file'+str(j)+'.txt',))
 
+    @slow_test
     def test_UsingMagicDirCreateCollectionWithManyFilesAndMoveEachFileIntoAnother(self):
         streams = 2
         files_per_stream = 200
@@ -470,6 +476,7 @@ class FuseListLargeProjectContents(MountTestBase):
             collection_contents = llfuse.listdir(os.path.join(self.mounttmp, collection_name))
             self.assertIn('baz', collection_contents)
 
+    @slow_test
     def test_listLargeProjectContents(self):
         self.make_mount(fuse.ProjectDirectory,
                         project_object=run_test_server.fixture('groups')['project_with_201_collections'])
diff --git a/services/fuse/tests/slow_test.py b/services/fuse/tests/slow_test.py
new file mode 120000 (symlink)
index 0000000..c7e1f7f
--- /dev/null
@@ -0,0 +1 @@
+../../../sdk/python/tests/slow_test.py
\ No newline at end of file
diff --git a/services/fuse/tests/test_cache.py b/services/fuse/tests/test_cache.py
new file mode 100644 (file)
index 0000000..7aa0009
--- /dev/null
@@ -0,0 +1,45 @@
+import arvados
+import arvados.collection
+import arvados_fuse
+import arvados_fuse.command
+import json
+import logging
+import os
+import tempfile
+import unittest
+
+from .integration_test import IntegrationTest
+from .mount_test_base import MountTestBase
+
+class TmpCollectionTest(IntegrationTest):
+    mnt_args = ["--directory-cache=0"]
+
+    @IntegrationTest.mount(argv=mnt_args)
+    def test_cache_spill(self):
+        pdh = []
+        for i in range(0, 8):
+            cw = arvados.collection.Collection()
+            f = cw.open("blurg%i" % i, "w")
+            f.write("bloop%i" % i)
+
+            cw.mkdirs("dir%i" % i)
+            f = cw.open("dir%i/blurg" % i, "w")
+            f.write("dirbloop%i" % i)
+
+            cw.save_new()
+            pdh.append(cw.portable_data_hash())
+        self.pool_test(self.mnt, pdh)
+
+    @staticmethod
+    def _test_cache_spill(self, mnt, pdh):
+        for i,v in enumerate(pdh):
+            j = os.path.join(mnt, "by_id", v, "blurg%i" % i)
+            self.assertTrue(os.path.exists(j))
+            j = os.path.join(mnt, "by_id", v, "dir%i/blurg" % i)
+            self.assertTrue(os.path.exists(j))
+
+        for i,v in enumerate(pdh):
+            j = os.path.join(mnt, "by_id", v, "blurg%i" % i)
+            self.assertTrue(os.path.exists(j))
+            j = os.path.join(mnt, "by_id", v, "dir%i/blurg" % i)
+            self.assertTrue(os.path.exists(j))
index fa48849626a5d251c57ba1584b78855d1c73d0a6..e534e3273747372ce0f9ba19d7b08e9a21b3b7a8 100644 (file)
@@ -1142,7 +1142,6 @@ class TokenExpiryTest(MountTestBase):
 
     @mock.patch('arvados.keep.KeepClient.get')
     def runTest(self, mocked_get):
-        logging.getLogger('arvados.arvados_fuse').setLevel(logging.DEBUG)
         self.api._rootDesc = {"blobSignatureTtl": 2}
         mnt = self.make_mount(fuse.CollectionDirectory, collection_record='zzzzz-4zz18-op4e2lbej01tcvu')
         mocked_get.return_value = 'fake data'
diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go
new file mode 100644 (file)
index 0000000..2a2480c
--- /dev/null
@@ -0,0 +1,638 @@
+package main
+
+import (
+       "fmt"
+       "log"
+       "math"
+       "os"
+       "runtime"
+       "strings"
+       "sync"
+       "time"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "git.curoverse.com/arvados.git/sdk/go/keepclient"
+)
+
+// CheckConfig returns an error if anything is wrong with the given
+// config and runOptions.
+func CheckConfig(config Config, runOptions RunOptions) error {
+       if len(config.KeepServiceList.Items) > 0 && config.KeepServiceTypes != nil {
+               return fmt.Errorf("cannot specify both KeepServiceList and KeepServiceTypes in config")
+       }
+       if !runOptions.Once && config.RunPeriod == arvados.Duration(0) {
+               return fmt.Errorf("you must either use the -once flag, or specify RunPeriod in config")
+       }
+       return nil
+}
+
+// Balancer compares the contents of keepstore servers with the
+// collections stored in Arvados, and issues pull/trash requests
+// needed to get (closer to) the optimal data layout.
+//
+// In the optimal data layout: every data block referenced by a
+// collection is replicated at least as many times as desired by the
+// collection; there are no unreferenced data blocks older than
+// BlobSignatureTTL; and all N existing replicas of a given data block
+// are in the N best positions in rendezvous probe order.
+type Balancer struct {
+       *BlockStateMap
+       KeepServices       map[string]*KeepService
+       DefaultReplication int
+       Logger             *log.Logger
+       Dumper             *log.Logger
+       MinMtime           int64
+
+       collScanned  int
+       serviceRoots map[string]string
+       errors       []error
+       mutex        sync.Mutex
+}
+
+// Run performs a balance operation using the given config and
+// runOptions. It should only be called once on a given Balancer
+// object. Typical usage:
+//
+//   err = (&Balancer{}).Run(config, runOptions)
+func (bal *Balancer) Run(config Config, runOptions RunOptions) (err error) {
+       bal.Dumper = runOptions.Dumper
+       bal.Logger = runOptions.Logger
+       if bal.Logger == nil {
+               bal.Logger = log.New(os.Stderr, "", log.LstdFlags)
+       }
+
+       defer timeMe(bal.Logger, "Run")()
+
+       if len(config.KeepServiceList.Items) > 0 {
+               err = bal.SetKeepServices(config.KeepServiceList)
+       } else {
+               err = bal.DiscoverKeepServices(&config.Client, config.KeepServiceTypes)
+       }
+       if err != nil {
+               return
+       }
+
+       if err = bal.CheckSanityEarly(&config.Client); err != nil {
+               return
+       }
+       if runOptions.CommitTrash {
+               if err = bal.ClearTrashLists(&config.Client); err != nil {
+                       return
+               }
+       }
+       if err = bal.GetCurrentState(&config.Client); err != nil {
+               return
+       }
+       bal.ComputeChangeSets()
+       bal.PrintStatistics()
+       if err = bal.CheckSanityLate(); err != nil {
+               return
+       }
+       if runOptions.CommitPulls {
+               err = bal.CommitPulls(&config.Client)
+               if err != nil {
+                       // Skip trash if we can't pull. (Too cautious?)
+                       return
+               }
+       }
+       if runOptions.CommitTrash {
+               err = bal.CommitTrash(&config.Client)
+       }
+       return
+}
+
+// SetKeepServices sets the list of KeepServices to operate on.
+func (bal *Balancer) SetKeepServices(srvList arvados.KeepServiceList) error {
+       bal.KeepServices = make(map[string]*KeepService)
+       for _, srv := range srvList.Items {
+               bal.KeepServices[srv.UUID] = &KeepService{
+                       KeepService: srv,
+                       ChangeSet:   &ChangeSet{},
+               }
+       }
+       return nil
+}
+
+// DiscoverKeepServices sets the list of KeepServices by calling the
+// API to get a list of all services, and selecting the ones whose
+// ServiceType is in okTypes.
+func (bal *Balancer) DiscoverKeepServices(c *arvados.Client, okTypes []string) error {
+       bal.KeepServices = make(map[string]*KeepService)
+       ok := make(map[string]bool)
+       for _, t := range okTypes {
+               ok[t] = true
+       }
+       return c.EachKeepService(func(srv arvados.KeepService) error {
+               if ok[srv.ServiceType] {
+                       bal.KeepServices[srv.UUID] = &KeepService{
+                               KeepService: srv,
+                               ChangeSet:   &ChangeSet{},
+                       }
+               } else {
+                       bal.logf("skipping %v with service type %q", srv.UUID, srv.ServiceType)
+               }
+               return nil
+       })
+}
+
+// CheckSanityEarly checks for configuration and runtime errors that
+// can be detected before GetCurrentState() and ComputeChangeSets()
+// are called.
+//
+// If it returns an error, it is pointless to run GetCurrentState or
+// ComputeChangeSets: after doing so, the statistics would be
+// meaningless and it would be dangerous to run any Commit methods.
+func (bal *Balancer) CheckSanityEarly(c *arvados.Client) error {
+       u, err := c.CurrentUser()
+       if err != nil {
+               return fmt.Errorf("CurrentUser(): %v", err)
+       }
+       if !u.IsActive || !u.IsAdmin {
+               return fmt.Errorf("current user (%s) is not an active admin user", u.UUID)
+       }
+       for _, srv := range bal.KeepServices {
+               if srv.ServiceType == "proxy" {
+                       return fmt.Errorf("config error: %s: proxy servers cannot be balanced", srv)
+               }
+       }
+       return nil
+}
+
+// ClearTrashLists sends an empty trash list to each keep
+// service. Calling this before GetCurrentState avoids races.
+//
+// When a block appears in an index, we assume that replica will still
+// exist after we delete other replicas on other servers. However,
+// it's possible that a previous rebalancing operation made different
+// decisions (e.g., servers were added/removed, and rendezvous order
+// changed). In this case, the replica might already be on that
+// server's trash list, and it might be deleted before we send a
+// replacement trash list.
+//
+// We avoid this problem if we clear all trash lists before getting
+// indexes. (We also assume there is only one rebalancing process
+// running at a time.)
+func (bal *Balancer) ClearTrashLists(c *arvados.Client) error {
+       for _, srv := range bal.KeepServices {
+               srv.ChangeSet = &ChangeSet{}
+       }
+       return bal.CommitTrash(c)
+}
+
+// GetCurrentState determines the current replication state, and the
+// desired replication level, for every block that is either
+// retrievable or referenced.
+//
+// It determines the current replication state by reading the block index
+// from every known Keep service.
+//
+// It determines the desired replication level by retrieving all
+// collection manifests in the database (API server).
+//
+// It encodes the resulting information in BlockStateMap.
+func (bal *Balancer) GetCurrentState(c *arvados.Client) error {
+       defer timeMe(bal.Logger, "GetCurrentState")()
+       bal.BlockStateMap = NewBlockStateMap()
+
+       dd, err := c.DiscoveryDocument()
+       if err != nil {
+               return err
+       }
+       bal.DefaultReplication = dd.DefaultCollectionReplication
+       bal.MinMtime = time.Now().Unix() - dd.BlobSignatureTTL
+
+       errs := make(chan error, 2+len(bal.KeepServices))
+       wg := sync.WaitGroup{}
+
+       // Start one goroutine for each KeepService: retrieve the
+       // index, and add the returned blocks to BlockStateMap.
+       for _, srv := range bal.KeepServices {
+               wg.Add(1)
+               go func(srv *KeepService) {
+                       defer wg.Done()
+                       bal.logf("%s: retrieve index", srv)
+                       idx, err := srv.Index(c, "")
+                       if err != nil {
+                               errs <- fmt.Errorf("%s: %v", srv, err)
+                               return
+                       }
+                       bal.logf("%s: add %d replicas to map", srv, len(idx))
+                       bal.BlockStateMap.AddReplicas(srv, idx)
+                       bal.logf("%s: done", srv)
+               }(srv)
+       }
+
+       // collQ buffers incoming collections so we can start fetching
+       // the next page without waiting for the current page to
+       // finish processing. (1000 happens to match the page size
+       // used by (*arvados.Client)EachCollection(), but it's OK if
+       // they don't match.)
+       collQ := make(chan arvados.Collection, 1000)
+
+       // Start a goroutine to process collections. (We could use a
+       // worker pool here, but even with a single worker we already
+       // process collections much faster than we can retrieve them.)
+       wg.Add(1)
+       go func() {
+               defer wg.Done()
+               for coll := range collQ {
+                       err := bal.addCollection(coll)
+                       if err != nil {
+                               errs <- err
+                               for range collQ {
+                               }
+                               return
+                       }
+                       bal.collScanned++
+               }
+       }()
+
+       // Start a goroutine to retrieve all collections from the
+       // Arvados database and send them to collQ for processing.
+       wg.Add(1)
+       go func() {
+               defer wg.Done()
+               err = EachCollection(c,
+                       func(coll arvados.Collection) error {
+                               collQ <- coll
+                               if len(errs) > 0 {
+                                       // some other GetCurrentState
+                                       // error happened: no point
+                                       // getting any more
+                                       // collections.
+                                       return fmt.Errorf("")
+                               }
+                               return nil
+                       }, func(done, total int) {
+                               bal.logf("collections: %d/%d", done, total)
+                       })
+               close(collQ)
+               if err != nil {
+                       errs <- err
+               }
+       }()
+
+       go func() {
+               // Send a nil error when all goroutines finish. If
+               // this is the first error sent to errs, then
+               // everything worked.
+               wg.Wait()
+               errs <- nil
+       }()
+       return <-errs
+}
+
+func (bal *Balancer) addCollection(coll arvados.Collection) error {
+       blkids, err := coll.SizedDigests()
+       if err != nil {
+               bal.mutex.Lock()
+               bal.errors = append(bal.errors, fmt.Errorf("%v: %v", coll.UUID, err))
+               bal.mutex.Unlock()
+               return nil
+       }
+       repl := bal.DefaultReplication
+       if coll.ReplicationDesired != nil {
+               repl = *coll.ReplicationDesired
+       }
+       debugf("%v: %d block x%d", coll.UUID, len(blkids), repl)
+       bal.BlockStateMap.IncreaseDesired(repl, blkids)
+       return nil
+}
+
+// ComputeChangeSets compares, for each known block, the current and
+// desired replication states. If it is possible to get closer to the
+// desired state by copying or deleting blocks, it adds those changes
+// to the relevant KeepServices' ChangeSets.
+//
+// It does not actually apply any of the computed changes.
+func (bal *Balancer) ComputeChangeSets() {
+       // This just calls balanceBlock() once for each block, using a
+       // pool of worker goroutines.
+       defer timeMe(bal.Logger, "ComputeChangeSets")()
+       bal.setupServiceRoots()
+
+       type balanceTask struct {
+               blkid arvados.SizedDigest
+               blk   *BlockState
+       }
+       nWorkers := 1 + runtime.NumCPU()
+       todo := make(chan balanceTask, nWorkers)
+       var wg sync.WaitGroup
+       for i := 0; i < nWorkers; i++ {
+               wg.Add(1)
+               go func() {
+                       for work := range todo {
+                               bal.balanceBlock(work.blkid, work.blk)
+                       }
+                       wg.Done()
+               }()
+       }
+       bal.BlockStateMap.Apply(func(blkid arvados.SizedDigest, blk *BlockState) {
+               todo <- balanceTask{
+                       blkid: blkid,
+                       blk:   blk,
+               }
+       })
+       close(todo)
+       wg.Wait()
+}
+
+func (bal *Balancer) setupServiceRoots() {
+       bal.serviceRoots = make(map[string]string)
+       for _, srv := range bal.KeepServices {
+               bal.serviceRoots[srv.UUID] = srv.UUID
+       }
+}
+
+const (
+       changeStay = iota
+       changePull
+       changeTrash
+       changeNone
+)
+
+var changeName = map[int]string{
+       changeStay:  "stay",
+       changePull:  "pull",
+       changeTrash: "trash",
+       changeNone:  "none",
+}
+
+// balanceBlock compares current state to desired state for a single
+// block, and makes the appropriate ChangeSet calls.
+func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) {
+       debugf("balanceBlock: %v %+v", blkid, blk)
+       uuids := keepclient.NewRootSorter(bal.serviceRoots, string(blkid[:32])).GetSortedRoots()
+       hasRepl := make(map[string]Replica, len(bal.serviceRoots))
+       for _, repl := range blk.Replicas {
+               hasRepl[repl.UUID] = repl
+               // TODO: when multiple copies are on one server, use
+               // the oldest one that doesn't have a timestamp
+               // collision with other replicas.
+       }
+       // number of replicas already found in positions better than
+       // the position we're contemplating now.
+       reportedBestRepl := 0
+       // To be safe we assume two replicas with the same Mtime are
+       // in fact the same replica being reported more than
+       // once. len(uniqueBestRepl) is the number of distinct
+       // replicas in the best rendezvous positions we've considered
+       // so far.
+       uniqueBestRepl := make(map[int64]bool, len(bal.serviceRoots))
+       // pulls is the number of Pull changes we have already
+       // requested. (For purposes of deciding whether to Pull to
+       // rendezvous position N, we should assume all pulls we have
+       // requested on rendezvous positions M<N will be successful.)
+       pulls := 0
+       var changes []string
+       for _, uuid := range uuids {
+               change := changeNone
+               srv := bal.KeepServices[uuid]
+               // TODO: request a Touch if Mtime is duplicated.
+               repl, ok := hasRepl[srv.UUID]
+               if ok {
+                       // This service has a replica. We should
+                       // delete it if [1] we already have enough
+                       // distinct replicas in better rendezvous
+                       // positions and [2] this replica's Mtime is
+                       // distinct from all of the better replicas'
+                       // Mtimes.
+                       if !srv.ReadOnly &&
+                               repl.Mtime < bal.MinMtime &&
+                               len(uniqueBestRepl) >= blk.Desired &&
+                               !uniqueBestRepl[repl.Mtime] {
+                               srv.AddTrash(Trash{
+                                       SizedDigest: blkid,
+                                       Mtime:       repl.Mtime,
+                               })
+                               change = changeTrash
+                       } else {
+                               change = changeStay
+                       }
+                       uniqueBestRepl[repl.Mtime] = true
+                       reportedBestRepl++
+               } else if pulls+reportedBestRepl < blk.Desired &&
+                       len(blk.Replicas) > 0 &&
+                       !srv.ReadOnly {
+                       // This service doesn't have a replica. We
+                       // should pull one to this server if we don't
+                       // already have enough (existing+requested)
+                       // replicas in better rendezvous positions.
+                       srv.AddPull(Pull{
+                               SizedDigest: blkid,
+                               Source:      blk.Replicas[0].KeepService,
+                       })
+                       pulls++
+                       change = changePull
+               }
+               if bal.Dumper != nil {
+                       changes = append(changes, fmt.Sprintf("%s:%d=%s,%d", srv.ServiceHost, srv.ServicePort, changeName[change], repl.Mtime))
+               }
+       }
+       if bal.Dumper != nil {
+               bal.Dumper.Printf("%s have=%d want=%d %s", blkid, len(blk.Replicas), blk.Desired, strings.Join(changes, " "))
+       }
+}
+
+type blocksNBytes struct {
+       replicas int
+       blocks   int
+       bytes    int64
+}
+
+func (bb blocksNBytes) String() string {
+       return fmt.Sprintf("%d replicas (%d blocks, %d bytes)", bb.replicas, bb.blocks, bb.bytes)
+}
+
+type balancerStats struct {
+       lost, overrep, unref, garbage, underrep, justright blocksNBytes
+       desired, current                                   blocksNBytes
+       pulls, trashes                                     int
+       replHistogram                                      []int
+}
+
+func (bal *Balancer) getStatistics() (s balancerStats) {
+       s.replHistogram = make([]int, 2)
+       bal.BlockStateMap.Apply(func(blkid arvados.SizedDigest, blk *BlockState) {
+               surplus := len(blk.Replicas) - blk.Desired
+               bytes := blkid.Size()
+               switch {
+               case len(blk.Replicas) == 0 && blk.Desired > 0:
+                       s.lost.replicas -= surplus
+                       s.lost.blocks++
+                       s.lost.bytes += bytes * int64(-surplus)
+               case len(blk.Replicas) < blk.Desired:
+                       s.underrep.replicas -= surplus
+                       s.underrep.blocks++
+                       s.underrep.bytes += bytes * int64(-surplus)
+               case len(blk.Replicas) > 0 && blk.Desired == 0:
+                       counter := &s.garbage
+                       for _, r := range blk.Replicas {
+                               if r.Mtime >= bal.MinMtime {
+                                       counter = &s.unref
+                                       break
+                               }
+                       }
+                       counter.replicas += surplus
+                       counter.blocks++
+                       counter.bytes += bytes * int64(surplus)
+               case len(blk.Replicas) > blk.Desired:
+                       s.overrep.replicas += surplus
+                       s.overrep.blocks++
+                       s.overrep.bytes += bytes * int64(len(blk.Replicas)-blk.Desired)
+               default:
+                       s.justright.replicas += blk.Desired
+                       s.justright.blocks++
+                       s.justright.bytes += bytes * int64(blk.Desired)
+               }
+
+               if blk.Desired > 0 {
+                       s.desired.replicas += blk.Desired
+                       s.desired.blocks++
+                       s.desired.bytes += bytes * int64(blk.Desired)
+               }
+               if len(blk.Replicas) > 0 {
+                       s.current.replicas += len(blk.Replicas)
+                       s.current.blocks++
+                       s.current.bytes += bytes * int64(len(blk.Replicas))
+               }
+
+               for len(s.replHistogram) <= len(blk.Replicas) {
+                       s.replHistogram = append(s.replHistogram, 0)
+               }
+               s.replHistogram[len(blk.Replicas)]++
+       })
+       for _, srv := range bal.KeepServices {
+               s.pulls += len(srv.ChangeSet.Pulls)
+               s.trashes += len(srv.ChangeSet.Trashes)
+       }
+       return
+}
+
+// PrintStatistics writes statistics about the computed changes to
+// bal.Logger. It should not be called until ComputeChangeSets has
+// finished.
+func (bal *Balancer) PrintStatistics() {
+       s := bal.getStatistics()
+       bal.logf("===")
+       bal.logf("%s lost (0=have<want)", s.lost)
+       bal.logf("%s underreplicated (0<have<want)", s.underrep)
+       bal.logf("%s just right (have=want)", s.justright)
+       bal.logf("%s overreplicated (have>want>0)", s.overrep)
+       bal.logf("%s unreferenced (have>want=0, new)", s.unref)
+       bal.logf("%s garbage (have>want=0, old)", s.garbage)
+       bal.logf("===")
+       bal.logf("%s total commitment (excluding unreferenced)", s.desired)
+       bal.logf("%s total usage", s.current)
+       bal.logf("===")
+       for _, srv := range bal.KeepServices {
+               bal.logf("%s: %v\n", srv, srv.ChangeSet)
+       }
+       bal.logf("===")
+       bal.printHistogram(s, 60)
+       bal.logf("===")
+}
+
+func (bal *Balancer) printHistogram(s balancerStats, hashColumns int) {
+       bal.logf("Replication level distribution (counting N replicas on a single server as N):")
+       maxCount := 0
+       for _, count := range s.replHistogram {
+               if maxCount < count {
+                       maxCount = count
+               }
+       }
+       hashes := strings.Repeat("#", hashColumns)
+       countWidth := 1 + int(math.Log10(float64(maxCount+1)))
+       scaleCount := 10 * float64(hashColumns) / math.Floor(1+10*math.Log10(float64(maxCount+1)))
+       for repl, count := range s.replHistogram {
+               nHashes := int(scaleCount * math.Log10(float64(count+1)))
+               bal.logf("%2d: %*d %s", repl, countWidth, count, hashes[:nHashes])
+       }
+}
+
+// CheckSanityLate checks for configuration and runtime errors after
+// GetCurrentState() and ComputeChangeSets() have finished.
+//
+// If it returns an error, it is dangerous to run any Commit methods.
+func (bal *Balancer) CheckSanityLate() error {
+       if bal.errors != nil {
+               for _, err := range bal.errors {
+                       bal.logf("deferred error: %v", err)
+               }
+               return fmt.Errorf("cannot proceed safely after deferred errors")
+       }
+
+       if bal.collScanned == 0 {
+               return fmt.Errorf("received zero collections")
+       }
+
+       anyDesired := false
+       bal.BlockStateMap.Apply(func(_ arvados.SizedDigest, blk *BlockState) {
+               if blk.Desired > 0 {
+                       anyDesired = true
+               }
+       })
+       if !anyDesired {
+               return fmt.Errorf("zero blocks have desired replication>0")
+       }
+
+       if dr := bal.DefaultReplication; dr < 1 {
+               return fmt.Errorf("Default replication (%d) is less than 1", dr)
+       }
+
+       // TODO: no two services have identical indexes
+       // TODO: no collisions (same md5, different size)
+       return nil
+}
+
+// CommitPulls sends the computed lists of pull requests to the
+// keepstore servers. This has the effect of increasing replication of
+// existing blocks that are either underreplicated or poorly
+// distributed according to rendezvous hashing.
+func (bal *Balancer) CommitPulls(c *arvados.Client) error {
+       return bal.commitAsync(c, "send pull list",
+               func(srv *KeepService) error {
+                       return srv.CommitPulls(c)
+               })
+}
+
+// CommitTrash sends the computed lists of trash requests to the
+// keepstore servers. This has the effect of deleting blocks that are
+// overreplicated or unreferenced.
+func (bal *Balancer) CommitTrash(c *arvados.Client) error {
+       return bal.commitAsync(c, "send trash list",
+               func(srv *KeepService) error {
+                       return srv.CommitTrash(c)
+               })
+}
+
+func (bal *Balancer) commitAsync(c *arvados.Client, label string, f func(srv *KeepService) error) error {
+       errs := make(chan error)
+       for _, srv := range bal.KeepServices {
+               go func(srv *KeepService) {
+                       var err error
+                       defer func() { errs <- err }()
+                       label := fmt.Sprintf("%s: %v", srv, label)
+                       defer timeMe(bal.Logger, label)()
+                       err = f(srv)
+                       if err != nil {
+                               err = fmt.Errorf("%s: %v", label, err)
+                       }
+               }(srv)
+       }
+       var lastErr error
+       for _ = range bal.KeepServices {
+               if err := <-errs; err != nil {
+                       bal.logf("%v", err)
+                       lastErr = err
+               }
+       }
+       close(errs)
+       return lastErr
+}
+
+func (bal *Balancer) logf(f string, args ...interface{}) {
+       if bal.Logger != nil {
+               bal.Logger.Printf(f, args...)
+       }
+}
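
balanceBlock above decides pulls and trashes by walking the servers in rendezvous order for each block, using keepclient.NewRootSorter over serviceRoots, which setupServiceRoots populates with each server's own UUID. A small sketch of that ordering step on its own; the UUIDs here are made up in the same zzzzz-bi6l4-... shape the tests use.

package main

import (
	"crypto/md5"
	"fmt"

	"git.curoverse.com/arvados.git/sdk/go/keepclient"
)

// serviceRoots maps each server UUID to itself (as setupServiceRoots does),
// and the sorter returns the same server order for a given block hash on
// every run, so independent balancer runs agree on which replicas sit in
// the "best" rendezvous positions.
func main() {
	serviceRoots := map[string]string{}
	for i := 0; i < 4; i++ {
		uuid := fmt.Sprintf("zzzzz-bi6l4-%015x", i)
		serviceRoots[uuid] = uuid
	}
	blkid := fmt.Sprintf("%x", md5.Sum([]byte("foo")))
	fmt.Println(keepclient.NewRootSorter(serviceRoots, blkid).GetSortedRoots())
}

Because the order is a pure function of the block hash and the set of server identifiers, every run ranks the same replicas as best, which is what makes it safe to trash only the surplus ones.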
diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go
new file mode 100644 (file)
index 0000000..a138d91
--- /dev/null
@@ -0,0 +1,374 @@
+package main
+
+import (
+       _ "encoding/json"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "log"
+       "net/http"
+       "net/http/httptest"
+       "strings"
+       "sync"
+       "time"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+
+       check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&runSuite{})
+
+type reqTracker struct {
+       reqs []http.Request
+       sync.Mutex
+}
+
+func (rt *reqTracker) Count() int {
+       rt.Lock()
+       defer rt.Unlock()
+       return len(rt.reqs)
+}
+
+func (rt *reqTracker) Add(req *http.Request) int {
+       rt.Lock()
+       defer rt.Unlock()
+       rt.reqs = append(rt.reqs, *req)
+       return len(rt.reqs)
+}
+
+// stubServer is an HTTP transport that intercepts and processes all
+// requests using its own handlers.
+type stubServer struct {
+       mux      *http.ServeMux
+       srv      *httptest.Server
+       mutex    sync.Mutex
+       Requests reqTracker
+       logf     func(string, ...interface{})
+}
+
+// Start initializes the stub server and returns an *http.Client that
+// uses the stub server to handle all requests.
+//
+// A stubServer that has been started should eventually be shut down
+// with Close().
+func (s *stubServer) Start() *http.Client {
+       // Set up a config.Client that forwards all requests to s.mux
+       // via s.srv. Test cases will attach handlers to s.mux to get
+       // the desired responses.
+       s.mux = http.NewServeMux()
+       s.srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+               s.mutex.Lock()
+               s.Requests.Add(r)
+               s.mutex.Unlock()
+               w.Header().Set("Content-Type", "application/json")
+               s.mux.ServeHTTP(w, r)
+       }))
+       return &http.Client{Transport: s}
+}
+
+func (s *stubServer) RoundTrip(req *http.Request) (*http.Response, error) {
+       w := httptest.NewRecorder()
+       s.mux.ServeHTTP(w, req)
+       return &http.Response{
+               StatusCode: w.Code,
+               Status:     fmt.Sprintf("%d %s", w.Code, http.StatusText(w.Code)),
+               Header:     w.HeaderMap,
+               Body:       ioutil.NopCloser(w.Body)}, nil
+}
+
+// Close releases resources used by the server.
+func (s *stubServer) Close() {
+       s.srv.Close()
+}
+
+func (s *stubServer) serveStatic(path, data string) *reqTracker {
+       rt := &reqTracker{}
+       s.mux.HandleFunc(path, func(w http.ResponseWriter, r *http.Request) {
+               rt.Add(r)
+               if r.Body != nil {
+                       ioutil.ReadAll(r.Body)
+                       r.Body.Close()
+               }
+               io.WriteString(w, data)
+       })
+       return rt
+}
+
+func (s *stubServer) serveCurrentUserAdmin() *reqTracker {
+       return s.serveStatic("/arvados/v1/users/current",
+               `{"uuid":"zzzzz-tpzed-000000000000000","is_admin":true,"is_active":true}`)
+}
+
+func (s *stubServer) serveCurrentUserNotAdmin() *reqTracker {
+       return s.serveStatic("/arvados/v1/users/current",
+               `{"uuid":"zzzzz-tpzed-000000000000000","is_admin":false,"is_active":true}`)
+}
+
+func (s *stubServer) serveDiscoveryDoc() *reqTracker {
+       return s.serveStatic("/discovery/v1/apis/arvados/v1/rest",
+               `{"defaultCollectionReplication":2}`)
+}
+
+func (s *stubServer) serveZeroCollections() *reqTracker {
+       return s.serveStatic("/arvados/v1/collections",
+               `{"items":[],"items_available":0}`)
+}
+
+func (s *stubServer) serveFooBarFileCollections() *reqTracker {
+       rt := &reqTracker{}
+       s.mux.HandleFunc("/arvados/v1/collections", func(w http.ResponseWriter, r *http.Request) {
+               r.ParseForm()
+               rt.Add(r)
+               if strings.Contains(r.Form.Get("filters"), `modified_at`) {
+                       io.WriteString(w, `{"items_available":0,"items":[]}`)
+               } else {
+                       io.WriteString(w, `{"items_available":2,"items":[
+                               {"uuid":"zzzzz-4zz18-ehbhgtheo8909or","portable_data_hash":"fa7aeb5140e2848d39b416daeef4ffc5+45","manifest_text":". 37b51d194a7513e45b56f6524f2d51f2+3 0:3:bar\n","modified_at":"2014-02-03T17:22:54Z"},
+                               {"uuid":"zzzzz-4zz18-znfnqtbbv4spc3w","portable_data_hash":"1f4b0bc7583c2a7f9102c395f4ffc5e3+45","manifest_text":". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n","modified_at":"2014-02-03T17:22:54Z"}]}`)
+               }
+       })
+       return rt
+}
+
+func (s *stubServer) serveCollectionsButSkipOne() *reqTracker {
+       rt := &reqTracker{}
+       s.mux.HandleFunc("/arvados/v1/collections", func(w http.ResponseWriter, r *http.Request) {
+               r.ParseForm()
+               rt.Add(r)
+               if strings.Contains(r.Form.Get("filters"), `"modified_at","\u003c="`) {
+                       io.WriteString(w, `{"items_available":3,"items":[]}`)
+               } else if strings.Contains(r.Form.Get("filters"), `"modified_at","\u003e="`) {
+                       io.WriteString(w, `{"items_available":0,"items":[]}`)
+               } else {
+                       io.WriteString(w, `{"items_available":2,"items":[
+                               {"uuid":"zzzzz-4zz18-ehbhgtheo8909or","portable_data_hash":"fa7aeb5140e2848d39b416daeef4ffc5+45","manifest_text":". 37b51d194a7513e45b56f6524f2d51f2+3 0:3:bar\n","modified_at":"2014-02-03T17:22:54Z"},
+                               {"uuid":"zzzzz-4zz18-znfnqtbbv4spc3w","portable_data_hash":"1f4b0bc7583c2a7f9102c395f4ffc5e3+45","manifest_text":". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n","modified_at":"2014-02-03T17:22:54Z"}]}`)
+               }
+       })
+       return rt
+}
+
+func (s *stubServer) serveZeroKeepServices() *reqTracker {
+       return s.serveStatic("/arvados/v1/keep_services",
+               `{"items":[],"items_available":0}`)
+}
+
+func (s *stubServer) serveFourDiskKeepServices() *reqTracker {
+       return s.serveStatic("/arvados/v1/keep_services", `{"items_available":5,"items":[
+               {"uuid":"zzzzz-bi6l4-000000000000000","service_host":"keep0.zzzzz.arvadosapi.com","service_port":25107,"service_ssl_flag":false,"service_type":"disk"},
+               {"uuid":"zzzzz-bi6l4-000000000000001","service_host":"keep1.zzzzz.arvadosapi.com","service_port":25107,"service_ssl_flag":false,"service_type":"disk"},
+               {"uuid":"zzzzz-bi6l4-000000000000002","service_host":"keep2.zzzzz.arvadosapi.com","service_port":25107,"service_ssl_flag":false,"service_type":"disk"},
+               {"uuid":"zzzzz-bi6l4-000000000000003","service_host":"keep3.zzzzz.arvadosapi.com","service_port":25107,"service_ssl_flag":false,"service_type":"disk"},
+               {"uuid":"zzzzz-bi6l4-h0a0xwut9qa6g3a","service_host":"keep.zzzzz.arvadosapi.com","service_port":25333,"service_ssl_flag":true,"service_type":"proxy"}]}`)
+}
+
+func (s *stubServer) serveKeepstoreIndexFoo4Bar1() *reqTracker {
+       rt := &reqTracker{}
+       s.mux.HandleFunc("/index/", func(w http.ResponseWriter, r *http.Request) {
+               count := rt.Add(r)
+               if r.Host == "keep0.zzzzz.arvadosapi.com:25107" {
+                       io.WriteString(w, "37b51d194a7513e45b56f6524f2d51f2+3 12345678\n")
+               }
+               fmt.Fprintf(w, "acbd18db4cc2f85cedef654fccc4a4d8+3 %d\n\n", 12345678+count)
+       })
+       return rt
+}
+
+func (s *stubServer) serveKeepstoreTrash() *reqTracker {
+       return s.serveStatic("/trash", `{}`)
+}
+
+func (s *stubServer) serveKeepstorePull() *reqTracker {
+       return s.serveStatic("/pull", `{}`)
+}
+
+type runSuite struct {
+       stub   stubServer
+       config Config
+}
+
+// make a log.Logger that writes to the current test's c.Log().
+func (s *runSuite) logger(c *check.C) *log.Logger {
+       r, w := io.Pipe()
+       go func() {
+               buf := make([]byte, 10000)
+               for {
+                       n, err := r.Read(buf)
+                       if n > 0 {
+                               if buf[n-1] == '\n' {
+                                       n--
+                               }
+                               c.Log(string(buf[:n]))
+                       }
+                       if err != nil {
+                               break
+                       }
+               }
+       }()
+       return log.New(w, "", log.LstdFlags)
+}
+
+func (s *runSuite) SetUpTest(c *check.C) {
+       s.config = Config{
+               Client: arvados.Client{
+                       AuthToken: "xyzzy",
+                       APIHost:   "zzzzz.arvadosapi.com",
+                       Client:    s.stub.Start()},
+               KeepServiceTypes: []string{"disk"}}
+       s.stub.serveDiscoveryDoc()
+       s.stub.logf = c.Logf
+}
+
+func (s *runSuite) TearDownTest(c *check.C) {
+       s.stub.Close()
+}
+
+func (s *runSuite) TestRefuseZeroCollections(c *check.C) {
+       opts := RunOptions{
+               CommitPulls: true,
+               CommitTrash: true,
+               Logger:      s.logger(c),
+       }
+       s.stub.serveCurrentUserAdmin()
+       s.stub.serveZeroCollections()
+       s.stub.serveFourDiskKeepServices()
+       s.stub.serveKeepstoreIndexFoo4Bar1()
+       trashReqs := s.stub.serveKeepstoreTrash()
+       pullReqs := s.stub.serveKeepstorePull()
+       err := (&Balancer{}).Run(s.config, opts)
+       c.Check(err, check.ErrorMatches, "received zero collections")
+       c.Check(trashReqs.Count(), check.Equals, 4)
+       c.Check(pullReqs.Count(), check.Equals, 0)
+}
+
+func (s *runSuite) TestServiceTypes(c *check.C) {
+       opts := RunOptions{
+               CommitPulls: true,
+               CommitTrash: true,
+               Logger:      s.logger(c),
+       }
+       s.config.KeepServiceTypes = []string{"unlisted-type"}
+       s.stub.serveCurrentUserAdmin()
+       s.stub.serveFooBarFileCollections()
+       s.stub.serveFourDiskKeepServices()
+       indexReqs := s.stub.serveKeepstoreIndexFoo4Bar1()
+       trashReqs := s.stub.serveKeepstoreTrash()
+       err := (&Balancer{}).Run(s.config, opts)
+       c.Check(err, check.IsNil)
+       c.Check(indexReqs.Count(), check.Equals, 0)
+       c.Check(trashReqs.Count(), check.Equals, 0)
+}
+
+func (s *runSuite) TestRefuseNonAdmin(c *check.C) {
+       opts := RunOptions{
+               CommitPulls: true,
+               CommitTrash: true,
+               Logger:      s.logger(c),
+       }
+       s.stub.serveCurrentUserNotAdmin()
+       s.stub.serveZeroCollections()
+       s.stub.serveFourDiskKeepServices()
+       trashReqs := s.stub.serveKeepstoreTrash()
+       pullReqs := s.stub.serveKeepstorePull()
+       err := (&Balancer{}).Run(s.config, opts)
+       c.Check(err, check.ErrorMatches, "current user .* is not .* admin user")
+       c.Check(trashReqs.Count(), check.Equals, 0)
+       c.Check(pullReqs.Count(), check.Equals, 0)
+}
+
+func (s *runSuite) TestDetectSkippedCollections(c *check.C) {
+       opts := RunOptions{
+               CommitPulls: true,
+               CommitTrash: true,
+               Logger:      s.logger(c),
+       }
+       s.stub.serveCurrentUserAdmin()
+       s.stub.serveCollectionsButSkipOne()
+       s.stub.serveFourDiskKeepServices()
+       s.stub.serveKeepstoreIndexFoo4Bar1()
+       trashReqs := s.stub.serveKeepstoreTrash()
+       pullReqs := s.stub.serveKeepstorePull()
+       err := (&Balancer{}).Run(s.config, opts)
+       c.Check(err, check.ErrorMatches, `Retrieved 2 collections with modtime <= .* but server now reports there are 3 collections.*`)
+       c.Check(trashReqs.Count(), check.Equals, 4)
+       c.Check(pullReqs.Count(), check.Equals, 0)
+}
+
+func (s *runSuite) TestDryRun(c *check.C) {
+       opts := RunOptions{
+               CommitPulls: false,
+               CommitTrash: false,
+               Logger:      s.logger(c),
+       }
+       s.stub.serveCurrentUserAdmin()
+       s.stub.serveFooBarFileCollections()
+       s.stub.serveFourDiskKeepServices()
+       s.stub.serveKeepstoreIndexFoo4Bar1()
+       trashReqs := s.stub.serveKeepstoreTrash()
+       pullReqs := s.stub.serveKeepstorePull()
+       var bal Balancer
+       err := bal.Run(s.config, opts)
+       c.Check(err, check.IsNil)
+       c.Check(trashReqs.Count(), check.Equals, 0)
+       c.Check(pullReqs.Count(), check.Equals, 0)
+       stats := bal.getStatistics()
+       c.Check(stats.pulls, check.Not(check.Equals), 0)
+       c.Check(stats.underrep.replicas, check.Not(check.Equals), 0)
+       c.Check(stats.overrep.replicas, check.Not(check.Equals), 0)
+}
+
+func (s *runSuite) TestCommit(c *check.C) {
+       opts := RunOptions{
+               CommitPulls: true,
+               CommitTrash: true,
+               Logger:      s.logger(c),
+               Dumper:      s.logger(c),
+       }
+       s.stub.serveCurrentUserAdmin()
+       s.stub.serveFooBarFileCollections()
+       s.stub.serveFourDiskKeepServices()
+       s.stub.serveKeepstoreIndexFoo4Bar1()
+       trashReqs := s.stub.serveKeepstoreTrash()
+       pullReqs := s.stub.serveKeepstorePull()
+       var bal Balancer
+       err := bal.Run(s.config, opts)
+       c.Check(err, check.IsNil)
+       c.Check(trashReqs.Count(), check.Equals, 8)
+       c.Check(pullReqs.Count(), check.Equals, 4)
+       stats := bal.getStatistics()
+       // "foo" block is overreplicated by 2
+       c.Check(stats.trashes, check.Equals, 2)
+       // "bar" block is underreplicated by 1, and its only copy is
+       // in a poor rendezvous position
+       c.Check(stats.pulls, check.Equals, 2)
+}
+
+func (s *runSuite) TestRunForever(c *check.C) {
+       opts := RunOptions{
+               CommitPulls: true,
+               CommitTrash: true,
+               Logger:      s.logger(c),
+               Dumper:      s.logger(c),
+       }
+       s.stub.serveCurrentUserAdmin()
+       s.stub.serveFooBarFileCollections()
+       s.stub.serveFourDiskKeepServices()
+       s.stub.serveKeepstoreIndexFoo4Bar1()
+       trashReqs := s.stub.serveKeepstoreTrash()
+       pullReqs := s.stub.serveKeepstorePull()
+
+       stop := make(chan interface{})
+       s.config.RunPeriod = arvados.Duration(time.Millisecond)
+       go RunForever(s.config, opts, stop)
+
+       // Each run should send 4 clear trash lists + 4 pull lists + 4
+       // trash lists. We should complete four runs in much less than
+       // a second.
+       for t0 := time.Now(); pullReqs.Count() < 16 && time.Since(t0) < 10*time.Second; {
+               time.Sleep(time.Millisecond)
+       }
+       stop <- true
+       c.Check(pullReqs.Count() >= 16, check.Equals, true)
+       c.Check(trashReqs.Count(), check.Equals, 2*pullReqs.Count())
+}
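
The stubServer above works by implementing http.RoundTripper, so the Balancer under test talks to an ordinary *http.Client while every request is answered in-process by a ServeMux. A trimmed-down, self-contained sketch of that pattern follows; stubTransport is an illustrative name, the handler path and JSON body are copied from serveCurrentUserAdmin, and the httptest.Server that the original also starts is omitted.

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/httptest"
)

// A custom RoundTripper answers every request from an in-process ServeMux,
// so the code under test can be handed a normal *http.Client with no real
// network involved.
type stubTransport struct{ mux *http.ServeMux }

func (s *stubTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	w := httptest.NewRecorder()
	s.mux.ServeHTTP(w, req)
	return &http.Response{
		StatusCode: w.Code,
		Status:     fmt.Sprintf("%d %s", w.Code, http.StatusText(w.Code)),
		Header:     w.HeaderMap,
		Body:       ioutil.NopCloser(w.Body),
	}, nil
}

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/arvados/v1/users/current", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, `{"uuid":"zzzzz-tpzed-000000000000000","is_admin":true,"is_active":true}`)
	})
	client := &http.Client{Transport: &stubTransport{mux: mux}}
	resp, err := client.Get("http://stub.invalid/arvados/v1/users/current")
	if err != nil {
		panic(err)
	}
	body, _ := ioutil.ReadAll(resp.Body)
	fmt.Println(resp.StatusCode, string(body))
}

Building the http.Response by hand from the ResponseRecorder, as the test does, means no listener or port is needed for the request path the client exercises.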
diff --git a/services/keep-balance/balance_test.go b/services/keep-balance/balance_test.go
new file mode 100644 (file)
index 0000000..682a5fb
--- /dev/null
@@ -0,0 +1,255 @@
+package main
+
+import (
+       "crypto/md5"
+       "fmt"
+       "sort"
+       "strconv"
+       "testing"
+       "time"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+
+       check "gopkg.in/check.v1"
+)
+
+// Test with Gocheck
+func Test(t *testing.T) {
+       check.TestingT(t)
+}
+
+var _ = check.Suite(&balancerSuite{})
+
+type balancerSuite struct {
+       Balancer
+       srvs            []*KeepService
+       blks            map[string]tester
+       knownRendezvous [][]int
+       signatureTTL    int64
+}
+
+const (
+       // index into knownRendezvous
+       known0 = 0
+)
+
+type slots []int
+
+type tester struct {
+       known       int
+       desired     int
+       current     slots
+       timestamps  []int64
+       shouldPull  slots
+       shouldTrash slots
+}
+
+func (bal *balancerSuite) SetUpSuite(c *check.C) {
+       bal.knownRendezvous = nil
+       for _, str := range []string{
+               "3eab2d5fc9681074",
+               "097dba52e648f1c3",
+               "c5b4e023f8a7d691",
+               "9d81c02e76a3bf54",
+       } {
+               var slots []int
+               for _, c := range []byte(str) {
+                       pos, _ := strconv.ParseUint(string(c), 16, 4)
+                       slots = append(slots, int(pos))
+               }
+               bal.knownRendezvous = append(bal.knownRendezvous, slots)
+       }
+
+       bal.signatureTTL = 3600
+}
+
+func (bal *balancerSuite) SetUpTest(c *check.C) {
+       bal.srvs = make([]*KeepService, 16)
+       bal.KeepServices = make(map[string]*KeepService)
+       for i := range bal.srvs {
+               srv := &KeepService{
+                       KeepService: arvados.KeepService{
+                               UUID: fmt.Sprintf("zzzzz-bi6l4-%015x", i),
+                       },
+               }
+               bal.srvs[i] = srv
+               bal.KeepServices[srv.UUID] = srv
+       }
+
+       bal.MinMtime = time.Now().Unix() - bal.signatureTTL
+}
+
+func (bal *balancerSuite) TestPerfect(c *check.C) {
+       bal.try(c, tester{
+               desired:     2,
+               current:     slots{0, 1},
+               shouldPull:  nil,
+               shouldTrash: nil})
+}
+
+func (bal *balancerSuite) TestDecreaseRepl(c *check.C) {
+       bal.try(c, tester{
+               desired:     2,
+               current:     slots{0, 2, 1},
+               shouldTrash: slots{2}})
+}
+
+func (bal *balancerSuite) TestDecreaseReplToZero(c *check.C) {
+       bal.try(c, tester{
+               desired:     0,
+               current:     slots{0, 1, 3},
+               shouldTrash: slots{0, 1, 3}})
+}
+
+func (bal *balancerSuite) TestIncreaseRepl(c *check.C) {
+       bal.try(c, tester{
+               desired:    4,
+               current:    slots{0, 1},
+               shouldPull: slots{2, 3}})
+}
+
+func (bal *balancerSuite) TestSkipReadonly(c *check.C) {
+       bal.srvList(0, slots{3})[0].ReadOnly = true
+       bal.try(c, tester{
+               desired:    4,
+               current:    slots{0, 1},
+               shouldPull: slots{2, 4}})
+}
+
+func (bal *balancerSuite) TestFixUnbalanced(c *check.C) {
+       bal.try(c, tester{
+               desired:    2,
+               current:    slots{2, 0},
+               shouldPull: slots{1}})
+       bal.try(c, tester{
+               desired:    2,
+               current:    slots{2, 7},
+               shouldPull: slots{0, 1}})
+       // if only one of the pulls succeeds, we'll see this next:
+       bal.try(c, tester{
+               desired:     2,
+               current:     slots{2, 1, 7},
+               shouldPull:  slots{0},
+               shouldTrash: slots{7}})
+       // if both pulls succeed, we'll see this next:
+       bal.try(c, tester{
+               desired:     2,
+               current:     slots{2, 0, 1, 7},
+               shouldTrash: slots{2, 7}})
+
+       // unbalanced + excessive replication => pull + trash
+       bal.try(c, tester{
+               desired:     2,
+               current:     slots{2, 5, 7},
+               shouldPull:  slots{0, 1},
+               shouldTrash: slots{7}})
+}
+
+func (bal *balancerSuite) TestIncreaseReplTimestampCollision(c *check.C) {
+       // For purposes of increasing replication, we assume identical
+       // replicas are distinct.
+       bal.try(c, tester{
+               desired:    4,
+               current:    slots{0, 1},
+               timestamps: []int64{12345678, 12345678},
+               shouldPull: slots{2, 3}})
+}
+
+func (bal *balancerSuite) TestDecreaseReplTimestampCollision(c *check.C) {
+       // For purposes of decreasing replication, we assume identical
+       // replicas are NOT distinct.
+       bal.try(c, tester{
+               desired:    2,
+               current:    slots{0, 1, 2},
+               timestamps: []int64{12345678, 12345678, 12345678}})
+       bal.try(c, tester{
+               desired:    2,
+               current:    slots{0, 1, 2},
+               timestamps: []int64{12345678, 10000000, 10000000}})
+}
+
+func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {
+       oldTime := bal.MinMtime - 3600
+       newTime := bal.MinMtime + 3600
+       // The excess replica is too new to delete.
+       bal.try(c, tester{
+               desired:    2,
+               current:    slots{0, 1, 2},
+               timestamps: []int64{oldTime, newTime, newTime + 1}})
+       // The best replicas are too new to delete, but the excess
+       // replica is old enough.
+       bal.try(c, tester{
+               desired:     2,
+               current:     slots{0, 1, 2},
+               timestamps:  []int64{newTime, newTime + 1, oldTime},
+               shouldTrash: slots{2}})
+}
+
+// Clear all servers' changesets, balance a single block, and verify
+// the appropriate changes for that block have been added to the
+// changesets.
+func (bal *balancerSuite) try(c *check.C, t tester) {
+       bal.setupServiceRoots()
+       blk := &BlockState{
+               Desired:  t.desired,
+               Replicas: bal.replList(t.known, t.current)}
+       for i, t := range t.timestamps {
+               blk.Replicas[i].Mtime = t
+       }
+       for _, srv := range bal.srvs {
+               srv.ChangeSet = &ChangeSet{}
+       }
+       bal.balanceBlock(knownBlkid(t.known), blk)
+
+       var didPull, didTrash slots
+       for i, srv := range bal.srvs {
+               var slot int
+               for probeOrder, srvNum := range bal.knownRendezvous[t.known] {
+                       if srvNum == i {
+                               slot = probeOrder
+                       }
+               }
+               for _, pull := range srv.Pulls {
+                       didPull = append(didPull, slot)
+                       c.Check(pull.SizedDigest, check.Equals, knownBlkid(t.known))
+               }
+               for _, trash := range srv.Trashes {
+                       didTrash = append(didTrash, slot)
+                       c.Check(trash.SizedDigest, check.Equals, knownBlkid(t.known))
+               }
+       }
+
+       for _, list := range []slots{didPull, didTrash, t.shouldPull, t.shouldTrash} {
+               sort.Sort(sort.IntSlice(list))
+       }
+       c.Check(didPull, check.DeepEquals, t.shouldPull)
+       c.Check(didTrash, check.DeepEquals, t.shouldTrash)
+}
+
+// srvList returns the KeepServices, sorted in rendezvous order for
+// the given block and selected by the given slots. For example,
+// srvList(3, slots{0, 1, 4}) returns the first-, second-, and
+// fifth-best servers for storing knownBlkid(3).
+func (bal *balancerSuite) srvList(knownBlockID int, order slots) (srvs []*KeepService) {
+       for _, i := range order {
+               srvs = append(srvs, bal.srvs[bal.knownRendezvous[knownBlockID][i]])
+       }
+       return
+}
+
+// replList is like srvList but returns an "existing replicas" slice,
+// suitable for a BlockState test fixture.
+func (bal *balancerSuite) replList(knownBlockID int, order slots) (repls []Replica) {
+       mtime := time.Now().Unix() - bal.signatureTTL - 86400
+       for _, srv := range bal.srvList(knownBlockID, order) {
+               repls = append(repls, Replica{srv, mtime})
+               mtime++
+       }
+       return
+}
+
+// knownBlkid generates the same data hashes that are tested in
+// sdk/go/keepclient/root_sorter_test.go
+func knownBlkid(i int) arvados.SizedDigest {
+       return arvados.SizedDigest(fmt.Sprintf("%x+64", md5.Sum([]byte(fmt.Sprintf("%064x", i)))))
+}
diff --git a/services/keep-balance/block_state.go b/services/keep-balance/block_state.go
new file mode 100644 (file)
index 0000000..d607386
--- /dev/null
@@ -0,0 +1,95 @@
+package main
+
+import (
+       "sync"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// Replica is a file on disk (or object in an S3 bucket, or blob in an
+// Azure storage container, etc.) as reported in a keepstore index
+// response.
+type Replica struct {
+       *KeepService
+       Mtime int64
+}
+
+// BlockState indicates the number of desired replicas (according to
+// the collections we know about) and the replicas actually stored
+// (according to the keepstore indexes we know about).
+type BlockState struct {
+       Replicas []Replica
+       Desired  int
+}
+
+func (bs *BlockState) addReplica(r Replica) {
+       bs.Replicas = append(bs.Replicas, r)
+}
+
+func (bs *BlockState) increaseDesired(n int) {
+       if bs.Desired < n {
+               bs.Desired = n
+       }
+}
+
+// BlockStateMap is a goroutine-safe wrapper around a
+// map[arvados.SizedDigest]*BlockState.
+type BlockStateMap struct {
+       entries map[arvados.SizedDigest]*BlockState
+       mutex   sync.Mutex
+}
+
+// NewBlockStateMap returns a newly allocated BlockStateMap.
+func NewBlockStateMap() *BlockStateMap {
+       return &BlockStateMap{
+               entries: make(map[arvados.SizedDigest]*BlockState),
+       }
+}
+
+// get returns a BlockState entry, allocating a new one if needed.
+// (Not goroutine-safe: the caller must hold bsm.mutex.)
+func (bsm *BlockStateMap) get(blkid arvados.SizedDigest) *BlockState {
+       // TODO? Allocate BlockState structs a slice at a time,
+       // instead of one at a time.
+       blk := bsm.entries[blkid]
+       if blk == nil {
+               blk = &BlockState{}
+               bsm.entries[blkid] = blk
+       }
+       return blk
+}
+
+// Apply runs f on each entry in the map.
+func (bsm *BlockStateMap) Apply(f func(arvados.SizedDigest, *BlockState)) {
+       bsm.mutex.Lock()
+       defer bsm.mutex.Unlock()
+
+       for blkid, blk := range bsm.entries {
+               f(blkid, blk)
+       }
+}
+
+// AddReplicas updates the map to indicate srv has a replica of each
+// block in idx.
+func (bsm *BlockStateMap) AddReplicas(srv *KeepService, idx []arvados.KeepServiceIndexEntry) {
+       bsm.mutex.Lock()
+       defer bsm.mutex.Unlock()
+
+       for _, ent := range idx {
+               bsm.get(ent.SizedDigest).addReplica(Replica{
+                       KeepService: srv,
+                       Mtime:       ent.Mtime,
+               })
+       }
+}
+
+// IncreaseDesired updates the map to indicate the desired replication
+// for the given blocks is at least n.
+func (bsm *BlockStateMap) IncreaseDesired(n int, blocks []arvados.SizedDigest) {
+       bsm.mutex.Lock()
+       defer bsm.mutex.Unlock()
+
+       for _, blkid := range blocks {
+               bsm.get(blkid).increaseDesired(n)
+       }
+}
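
A hedged usage sketch (not part of this commit; same package, using only the methods above): populate a BlockStateMap from one server's index plus a set of desired-replication hints, then walk it with Apply to count blocks that fall short of their target.

    // Illustrative only: srv/idx come from a keepstore index response,
    // blocks/desired from the collections we know about.
    func countUnderreplicated(srv *KeepService, idx []arvados.KeepServiceIndexEntry, blocks []arvados.SizedDigest, desired int) int {
        bsm := NewBlockStateMap()
        bsm.AddReplicas(srv, idx)
        bsm.IncreaseDesired(desired, blocks)
        short := 0
        bsm.Apply(func(blkid arvados.SizedDigest, blk *BlockState) {
            if len(blk.Replicas) < blk.Desired {
                short++
            }
        })
        return short
    }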
diff --git a/services/keep-balance/change_set.go b/services/keep-balance/change_set.go
new file mode 100644 (file)
index 0000000..417ea7f
--- /dev/null
@@ -0,0 +1,75 @@
+package main
+
+import (
+       "encoding/json"
+       "fmt"
+       "sync"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// Pull is a request to retrieve a block from a remote server, and
+// store it locally.
+type Pull struct {
+       arvados.SizedDigest
+       Source *KeepService
+}
+
+// MarshalJSON formats a pull request the way keepstore wants to see
+// it.
+func (p Pull) MarshalJSON() ([]byte, error) {
+       type KeepstorePullRequest struct {
+               Locator string   `json:"locator"`
+               Servers []string `json:"servers"`
+       }
+       return json.Marshal(KeepstorePullRequest{
+               Locator: string(p.SizedDigest[:32]),
+               Servers: []string{p.Source.URLBase()}})
+}
+
+// Trash is a request to delete a block.
+type Trash struct {
+       arvados.SizedDigest
+       Mtime int64
+}
+
+// MarshalJSON formats a trash request the way keepstore wants to see
+// it, i.e., as a bare locator with no +size hint.
+func (t Trash) MarshalJSON() ([]byte, error) {
+       type KeepstoreTrashRequest struct {
+               Locator    string `json:"locator"`
+               BlockMtime int64  `json:"block_mtime"`
+       }
+       return json.Marshal(KeepstoreTrashRequest{
+               Locator:    string(t.SizedDigest[:32]),
+               BlockMtime: t.Mtime})
+}
+
+// ChangeSet is a set of change requests that will be sent to a
+// keepstore server.
+type ChangeSet struct {
+       Pulls   []Pull
+       Trashes []Trash
+       mutex   sync.Mutex
+}
+
+// AddPull adds a Pull operation.
+func (cs *ChangeSet) AddPull(p Pull) {
+       cs.mutex.Lock()
+       cs.Pulls = append(cs.Pulls, p)
+       cs.mutex.Unlock()
+}
+
+// AddTrash adds a Trash operation.
+func (cs *ChangeSet) AddTrash(t Trash) {
+       cs.mutex.Lock()
+       cs.Trashes = append(cs.Trashes, t)
+       cs.mutex.Unlock()
+}
+
+// String implements fmt.Stringer.
+func (cs *ChangeSet) String() string {
+       cs.mutex.Lock()
+       defer cs.mutex.Unlock()
+       return fmt.Sprintf("ChangeSet{Pulls:%d, Trashes:%d}", len(cs.Pulls), len(cs.Trashes))
+}
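
Because AddPull and AddTrash lock the ChangeSet's mutex, one ChangeSet can be shared by concurrent balancing goroutines. A minimal sketch (same package; srv, src, and blkids are whatever the caller has in hand):

    func queuePullsConcurrently(srv *KeepService, src *KeepService, blkids []arvados.SizedDigest) {
        var wg sync.WaitGroup
        for _, blkid := range blkids {
            wg.Add(1)
            go func(blkid arvados.SizedDigest) {
                defer wg.Done()
                // AddPull takes the ChangeSet's mutex, so no extra
                // synchronization is needed here.
                srv.ChangeSet.AddPull(Pull{SizedDigest: blkid, Source: src})
            }(blkid)
        }
        wg.Wait()
    }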
diff --git a/services/keep-balance/change_set_test.go b/services/keep-balance/change_set_test.go
new file mode 100644 (file)
index 0000000..b5dcb5c
--- /dev/null
@@ -0,0 +1,35 @@
+package main
+
+import (
+       "encoding/json"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+
+       check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&changeSetSuite{})
+
+type changeSetSuite struct{}
+
+func (s *changeSetSuite) TestJSONFormat(c *check.C) {
+       srv := &KeepService{
+               KeepService: arvados.KeepService{
+                       UUID:           "zzzzz-bi6l4-000000000000001",
+                       ServiceType:    "disk",
+                       ServiceSSLFlag: false,
+                       ServiceHost:    "keep1.zzzzz.arvadosapi.com",
+                       ServicePort:    25107}}
+
+       buf, err := json.Marshal([]Pull{{
+               SizedDigest: arvados.SizedDigest("acbd18db4cc2f85cedef654fccc4a4d8+3"),
+               Source:      srv}})
+       c.Check(err, check.IsNil)
+       c.Check(string(buf), check.Equals, `[{"locator":"acbd18db4cc2f85cedef654fccc4a4d8","servers":["http://keep1.zzzzz.arvadosapi.com:25107"]}]`)
+
+       buf, err = json.Marshal([]Trash{{
+               SizedDigest: arvados.SizedDigest("acbd18db4cc2f85cedef654fccc4a4d8+3"),
+               Mtime:       123456789}})
+       c.Check(err, check.IsNil)
+       c.Check(string(buf), check.Equals, `[{"locator":"acbd18db4cc2f85cedef654fccc4a4d8","block_mtime":123456789}]`)
+}
diff --git a/services/keep-balance/collection.go b/services/keep-balance/collection.go
new file mode 100644 (file)
index 0000000..e6a1f08
--- /dev/null
@@ -0,0 +1,95 @@
+package main
+
+import (
+       "fmt"
+       "time"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+func countCollections(c *arvados.Client, params arvados.ResourceListParams) (int, error) {
+       var page arvados.CollectionList
+       var zero int
+       params.Limit = &zero
+       err := c.RequestAndDecode(&page, "GET", "arvados/v1/collections", nil, params)
+       return page.ItemsAvailable, err
+}
+
+// EachCollection calls f once for every readable
+// collection. EachCollection stops if it encounters an error, such as
+// f returning a non-nil error.
+//
+// The progress function is called periodically with done (number of
+// times f has been called) and total (number of times f is expected
+// to be called).
+func EachCollection(c *arvados.Client, f func(arvados.Collection) error, progress func(done, total int)) error {
+       if progress == nil {
+               progress = func(_, _ int) {}
+       }
+
+       expectCount, err := countCollections(c, arvados.ResourceListParams{})
+       if err != nil {
+               return err
+       }
+
+       limit := 1000
+       params := arvados.ResourceListParams{
+               Limit:  &limit,
+               Order:  "modified_at, uuid",
+               Select: []string{"uuid", "manifest_text", "modified_at", "portable_data_hash", "replication_desired"},
+       }
+       var last arvados.Collection
+       var filterTime time.Time
+       callCount := 0
+       for {
+               progress(callCount, expectCount)
+               var page arvados.CollectionList
+               err := c.RequestAndDecode(&page, "GET", "arvados/v1/collections", nil, params)
+               if err != nil {
+                       return err
+               }
+               for _, coll := range page.Items {
+                       if last.ModifiedAt != nil && *last.ModifiedAt == *coll.ModifiedAt && last.UUID >= coll.UUID {
+                               continue
+                       }
+                       callCount++
+                       err = f(coll)
+                       if err != nil {
+                               return err
+                       }
+                       last = coll
+               }
+               if last.ModifiedAt == nil || *last.ModifiedAt == filterTime {
+                       if page.ItemsAvailable > len(page.Items) {
+                               // TODO: use "mtime=X && UUID>Y"
+                               // filters to get all collections with
+                               // this timestamp, then use "mtime>X"
+                               // to get the next timestamp.
+                               return fmt.Errorf("BUG: Received an entire page with the same modified_at timestamp (%v), cannot make progress", filterTime)
+                       }
+                       break
+               }
+               filterTime = *last.ModifiedAt
+               params.Filters = []arvados.Filter{{
+                       Attr:     "modified_at",
+                       Operator: ">=",
+                       Operand:  filterTime,
+               }, {
+                       Attr:     "uuid",
+                       Operator: "!=",
+                       Operand:  last.UUID,
+               }}
+       }
+       progress(callCount, expectCount)
+
+       if checkCount, err := countCollections(c, arvados.ResourceListParams{Filters: []arvados.Filter{{
+               Attr:     "modified_at",
+               Operator: "<=",
+               Operand:  filterTime}}}); err != nil {
+               return err
+       } else if callCount < checkCount {
+               return fmt.Errorf("Retrieved %d collections with modtime <= T=%v, but server now reports there are %d collections with modtime <= T", callCount, filterTime, checkCount)
+       }
+
+       return nil
+}
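
A hedged usage sketch (same package; the logger is an assumption): count readable collections while reporting progress through the callback.

    func countReadableCollections(c *arvados.Client, logger *log.Logger) (int, error) {
        n := 0
        err := EachCollection(c, func(coll arvados.Collection) error {
            n++
            return nil
        }, func(done, total int) {
            logger.Printf("collections: %d of %d", done, total)
        })
        return n, err
    }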
diff --git a/services/keep-balance/integration_test.go b/services/keep-balance/integration_test.go
new file mode 100644 (file)
index 0000000..b090614
--- /dev/null
@@ -0,0 +1,92 @@
+package main
+
+import (
+       "bytes"
+       "log"
+       "net/http"
+       "os"
+       "strings"
+       "testing"
+       "time"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/arvadostest"
+       "git.curoverse.com/arvados.git/sdk/go/keepclient"
+
+       check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&integrationSuite{})
+
+type integrationSuite struct {
+       config     Config
+       keepClient *keepclient.KeepClient
+}
+
+func (s *integrationSuite) SetUpSuite(c *check.C) {
+       if testing.Short() {
+               c.Skip("-short")
+       }
+       arvadostest.ResetEnv()
+       arvadostest.StartAPI()
+       arvadostest.StartKeep(4, true)
+
+       arv, err := arvadosclient.MakeArvadosClient()
+       c.Assert(err, check.IsNil)
+       arv.ApiToken = arvadostest.DataManagerToken
+       s.keepClient = &keepclient.KeepClient{
+               Arvados: &arv,
+               Client:  &http.Client{},
+       }
+       c.Assert(s.keepClient.DiscoverKeepServers(), check.IsNil)
+       s.putReplicas(c, "foo", 4)
+       s.putReplicas(c, "bar", 1)
+}
+
+func (s *integrationSuite) putReplicas(c *check.C, data string, replicas int) {
+       s.keepClient.Want_replicas = replicas
+       _, _, err := s.keepClient.PutB([]byte(data))
+       c.Assert(err, check.IsNil)
+}
+
+func (s *integrationSuite) TearDownSuite(c *check.C) {
+       if testing.Short() {
+               c.Skip("-short")
+       }
+       arvadostest.StopKeep(4)
+       arvadostest.StopAPI()
+}
+
+func (s *integrationSuite) SetUpTest(c *check.C) {
+       s.config = Config{
+               Client: arvados.Client{
+                       APIHost:   os.Getenv("ARVADOS_API_HOST"),
+                       AuthToken: arvadostest.DataManagerToken,
+                       Insecure:  true,
+               },
+               KeepServiceTypes: []string{"disk"},
+       }
+}
+
+func (s *integrationSuite) TestBalanceAPIFixtures(c *check.C) {
+       var logBuf *bytes.Buffer
+       for iter := 0; iter < 20; iter++ {
+               logBuf = &bytes.Buffer{}
+               opts := RunOptions{
+                       CommitPulls: true,
+                       CommitTrash: true,
+                       Logger:      log.New(logBuf, "", log.LstdFlags),
+               }
+               err := (&Balancer{}).Run(s.config, opts)
+               c.Check(err, check.IsNil)
+               if iter == 0 {
+                       c.Check(logBuf.String(), check.Matches, `(?ms).*ChangeSet{Pulls:1.*`)
+                       c.Check(logBuf.String(), check.Not(check.Matches), `(?ms).*ChangeSet{.*Trashes:[^0]}*`)
+               } else if strings.Contains(logBuf.String(), "ChangeSet{Pulls:0") {
+                       break
+               }
+               time.Sleep(200 * time.Millisecond)
+       }
+       c.Check(logBuf.String(), check.Not(check.Matches), `(?ms).*0 replicas (0 blocks, 0 bytes) underreplicated.*`)
+}
diff --git a/services/keep-balance/keep_service.go b/services/keep-balance/keep_service.go
new file mode 100644 (file)
index 0000000..f65355d
--- /dev/null
@@ -0,0 +1,76 @@
+package main
+
+import (
+       "encoding/json"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "net/http"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// KeepService represents a keepstore server that is being rebalanced.
+type KeepService struct {
+       arvados.KeepService
+       *ChangeSet
+}
+
+// String implements fmt.Stringer.
+func (srv *KeepService) String() string {
+       return fmt.Sprintf("%s (%s:%d, %s)", srv.UUID, srv.ServiceHost, srv.ServicePort, srv.ServiceType)
+}
+
+var ksSchemes = map[bool]string{false: "http", true: "https"}
+
+// URLBase returns scheme://host:port for this server.
+func (srv *KeepService) URLBase() string {
+       return fmt.Sprintf("%s://%s:%d", ksSchemes[srv.ServiceSSLFlag], srv.ServiceHost, srv.ServicePort)
+}
+
+// CommitPulls sends the current list of pull requests to the storage
+// server (even if the list is empty).
+func (srv *KeepService) CommitPulls(c *arvados.Client) error {
+       return srv.put(c, "pull", srv.ChangeSet.Pulls)
+}
+
+// CommitTrash sends the current list of trash requests to the storage
+// server (even if the list is empty).
+func (srv *KeepService) CommitTrash(c *arvados.Client) error {
+       return srv.put(c, "trash", srv.ChangeSet.Trashes)
+}
+
+// Perform a PUT request at path, with data (as JSON) in the request
+// body.
+func (srv *KeepService) put(c *arvados.Client, path string, data interface{}) error {
+       // We'll start a goroutine to do the JSON encoding, so we can
+       // stream it to the http client through a Pipe, rather than
+       // keeping the entire encoded version in memory.
+       jsonR, jsonW := io.Pipe()
+
+       // errC communicates any encoding errors back to our main
+       // goroutine.
+       errC := make(chan error, 1)
+
+       go func() {
+               enc := json.NewEncoder(jsonW)
+               errC <- enc.Encode(data)
+               jsonW.Close()
+       }()
+
+       url := srv.URLBase() + "/" + path
+       req, err := http.NewRequest("PUT", url, ioutil.NopCloser(jsonR))
+       if err != nil {
+               return fmt.Errorf("building request for %s: %v", url, err)
+       }
+       err = c.DoAndDecode(nil, req)
+
+       // If there was an error encoding the request body, report
+       // that instead of the response: obviously we won't get a
+       // useful response if our request wasn't properly encoded.
+       if encErr := <-errC; encErr != nil {
+               return fmt.Errorf("encoding data for %s: %v", url, encErr)
+       }
+
+       return err
+}
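
The pipe-plus-encoder-goroutine pattern above is generic. A standalone hedged sketch of the same idea, independent of any Arvados types (client, url, and data are assumptions supplied by the caller):

    func streamJSONPut(client *http.Client, url string, data interface{}) error {
        pr, pw := io.Pipe()
        go func() {
            // CloseWithError makes any encoding failure visible to the
            // reader side (and to the HTTP transport reading the body).
            pw.CloseWithError(json.NewEncoder(pw).Encode(data))
        }()
        req, err := http.NewRequest("PUT", url, pr)
        if err != nil {
            return err
        }
        req.Header.Set("Content-Type", "application/json")
        resp, err := client.Do(req)
        if err != nil {
            return err
        }
        defer resp.Body.Close()
        if resp.StatusCode != http.StatusOK {
            return fmt.Errorf("unexpected status: %s", resp.Status)
        }
        return nil
    }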
diff --git a/services/keep-balance/main.go b/services/keep-balance/main.go
new file mode 100644 (file)
index 0000000..42a8d63
--- /dev/null
@@ -0,0 +1,156 @@
+package main
+
+import (
+       "encoding/json"
+       "flag"
+       "io/ioutil"
+       "log"
+       "os"
+       "os/signal"
+       "syscall"
+       "time"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// Config specifies site configuration, like API credentials and the
+// choice of which servers are to be balanced.
+//
+// Config is loaded from a JSON config file (see usage()).
+type Config struct {
+       // Arvados API endpoint and credentials.
+       Client arvados.Client
+
+       // List of service types (e.g., "disk") to balance.
+       KeepServiceTypes []string
+
+       KeepServiceList arvados.KeepServiceList
+
+       // How often to run a scan/balance operation.
+       RunPeriod arvados.Duration
+}
+
+// RunOptions controls runtime behavior. The flags/options that belong
+// here are the ones that are useful for interactive use. For example,
+// "CommitTrash" is a runtime option rather than a config item because
+// it invokes a troubleshooting feature rather than expressing how
+// balancing is meant to be done at a given site.
+//
+// RunOptions fields are controlled by command line flags.
+type RunOptions struct {
+       Once        bool
+       CommitPulls bool
+       CommitTrash bool
+       Logger      *log.Logger
+       Dumper      *log.Logger
+}
+
+var debugf = func(string, ...interface{}) {}
+
+func main() {
+       var config Config
+       var runOptions RunOptions
+
+       configPath := flag.String("config", "",
+               "`path` of json configuration file")
+       serviceListPath := flag.String("config.KeepServiceList", "",
+               "`path` of json file with list of keep services to balance, as given by \"arv keep_service list\" "+
+                       "(default: config[\"KeepServiceList\"], or if none given, get all available services and filter by config[\"KeepServiceTypes\"])")
+       flag.BoolVar(&runOptions.Once, "once", false,
+               "balance once and then exit")
+       flag.BoolVar(&runOptions.CommitPulls, "commit-pulls", false,
+               "send pull requests (make more replicas of blocks that are underreplicated or are not in optimal rendezvous probe order)")
+       flag.BoolVar(&runOptions.CommitTrash, "commit-trash", false,
+               "send trash requests (delete unreferenced old blocks, and excess replicas of overreplicated blocks)")
+       dumpFlag := flag.Bool("dump", false, "dump details for each block to stdout")
+       debugFlag := flag.Bool("debug", false, "enable debug messages")
+       flag.Usage = usage
+       flag.Parse()
+
+       if *configPath == "" {
+               log.Fatal("You must specify a config file (see `keep-balance -help`)")
+       }
+       mustReadJSON(&config, *configPath)
+       if *serviceListPath != "" {
+               mustReadJSON(&config.KeepServiceList, *serviceListPath)
+       }
+
+       if *debugFlag {
+               debugf = log.Printf
+               if j, err := json.Marshal(config); err != nil {
+                       log.Fatal(err)
+               } else {
+                       log.Printf("config is %s", j)
+               }
+       }
+       if *dumpFlag {
+               runOptions.Dumper = log.New(os.Stdout, "", log.LstdFlags)
+       }
+       err := CheckConfig(config, runOptions)
+       if err != nil {
+               // (don't run)
+       } else if runOptions.Once {
+               err = (&Balancer{}).Run(config, runOptions)
+       } else {
+               err = RunForever(config, runOptions, nil)
+       }
+       if err != nil {
+               log.Fatal(err)
+       }
+}
+
+func mustReadJSON(dst interface{}, path string) {
+       if buf, err := ioutil.ReadFile(path); err != nil {
+               log.Fatalf("Reading %q: %v", path, err)
+       } else if err = json.Unmarshal(buf, dst); err != nil {
+               log.Fatalf("Decoding %q: %v", path, err)
+       }
+}
+
+// RunForever runs forever, or (for testing purposes) until the given
+// stop channel is ready to receive.
+func RunForever(config Config, runOptions RunOptions, stop <-chan interface{}) error {
+       if runOptions.Logger == nil {
+               runOptions.Logger = log.New(os.Stderr, "", log.LstdFlags)
+       }
+       logger := runOptions.Logger
+
+       ticker := time.NewTicker(time.Duration(config.RunPeriod))
+
+       // The unbuffered channel here means we only hear SIGUSR1 if
+       // it arrives while we're waiting in select{}.
+       sigUSR1 := make(chan os.Signal)
+       signal.Notify(sigUSR1, syscall.SIGUSR1)
+
+       logger.Printf("starting up: will scan every %v and on SIGUSR1", config.RunPeriod)
+
+       for {
+               if !runOptions.CommitPulls && !runOptions.CommitTrash {
+                       logger.Print("WARNING: Will scan periodically, but no changes will be committed.")
+                       logger.Print("=======  Consider using -commit-pulls and -commit-trash flags.")
+               }
+
+               err := (&Balancer{}).Run(config, runOptions)
+               if err != nil {
+                       logger.Print("run failed: ", err)
+               } else {
+                       logger.Print("run succeeded")
+               }
+
+               select {
+               case <-stop:
+                       signal.Stop(sigUSR1)
+                       return nil
+               case <-ticker.C:
+                       logger.Print("timer went off")
+               case <-sigUSR1:
+                       logger.Print("received SIGUSR1, resetting timer")
+                       // Reset the timer so we don't start the N+1st
+                       // run too soon after the Nth run is triggered
+                       // by SIGUSR1.
+                       ticker.Stop()
+                       ticker = time.NewTicker(time.Duration(config.RunPeriod))
+               }
+               logger.Print("starting next run")
+       }
+}
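
The stop channel exists mainly for tests. A hedged sketch of driving RunForever from test-style code (config and runOptions assembled the same way main does above):

    func runBrieflyThenStop(config Config, runOptions RunOptions) error {
        stop := make(chan interface{})
        go func() {
            // Let a couple of run periods elapse, then ask the loop to exit.
            time.Sleep(2 * time.Duration(config.RunPeriod))
            close(stop)
        }()
        // RunForever returns nil once it receives from the closed stop channel.
        return RunForever(config, runOptions, stop)
    }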
diff --git a/services/keep-balance/main_test.go b/services/keep-balance/main_test.go
new file mode 100644 (file)
index 0000000..4a56098
--- /dev/null
@@ -0,0 +1,43 @@
+package main
+
+import (
+       "encoding/json"
+       "time"
+
+       check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&mainSuite{})
+
+type mainSuite struct{}
+
+func (s *mainSuite) TestExampleJSON(c *check.C) {
+       var config Config
+       c.Check(json.Unmarshal(exampleConfigFile, &config), check.IsNil)
+       c.Check(config.KeepServiceTypes, check.DeepEquals, []string{"disk"})
+       c.Check(config.Client.AuthToken, check.Equals, "xyzzy")
+       c.Check(time.Duration(config.RunPeriod), check.Equals, 600*time.Second)
+}
+
+func (s *mainSuite) TestConfigJSONWithKeepServiceList(c *check.C) {
+       var config Config
+       c.Check(json.Unmarshal([]byte(`
+               {
+                   "Client": {
+                       "APIHost": "zzzzz.arvadosapi.com:443",
+                       "AuthToken": "xyzzy",
+                       "Insecure": false
+                   },
+                   "KeepServiceList": {
+                       "items": [
+                           {"uuid":"zzzzz-bi64l-abcdefghijklmno", "service_type":"disk", "service_host":"a.zzzzz.arvadosapi.com", "service_port":12345},
+                           {"uuid":"zzzzz-bi64l-bcdefghijklmnop", "service_type":"blob", "service_host":"b.zzzzz.arvadosapi.com", "service_port":12345}
+                       ]
+                   },
+                   "RunPeriod": "600s"
+               }`), &config), check.IsNil)
+       c.Assert(len(config.KeepServiceList.Items), check.Equals, 2)
+       c.Check(config.KeepServiceList.Items[0].UUID, check.Equals, "zzzzz-bi64l-abcdefghijklmno")
+       c.Check(config.KeepServiceList.Items[0].ServicePort, check.Equals, 12345)
+       c.Check(config.Client.AuthToken, check.Equals, "xyzzy")
+}
diff --git a/services/keep-balance/time_me.go b/services/keep-balance/time_me.go
new file mode 100644 (file)
index 0000000..e5f16b7
--- /dev/null
@@ -0,0 +1,14 @@
+package main
+
+import (
+       "log"
+       "time"
+)
+
+func timeMe(logger *log.Logger, label string) func() {
+       t0 := time.Now()
+       logger.Printf("%s: start", label)
+       return func() {
+               logger.Printf("%s: took %v", label, time.Since(t0))
+       }
+}
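
Typical intended use (a hedged sketch; the label is illustrative): call timeMe at the top of a phase and defer the returned function, so the log records both a start line and an elapsed-time line.

    func doTimedPhase(logger *log.Logger) {
        // Logs "scan collections: start" now, and
        // "scan collections: took <elapsed>" when this function returns.
        defer timeMe(logger, "scan collections")()
        // ... the work being timed goes here ...
    }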
diff --git a/services/keep-balance/usage.go b/services/keep-balance/usage.go
new file mode 100644 (file)
index 0000000..eb9990c
--- /dev/null
@@ -0,0 +1,83 @@
+package main
+
+import (
+       "flag"
+       "fmt"
+       "os"
+)
+
+var exampleConfigFile = []byte(`
+    {
+       "Client": {
+           "APIHost": "zzzzz.arvadosapi.com:443",
+           "AuthToken": "xyzzy",
+           "Insecure": false
+       },
+       "KeepServiceTypes": [
+           "disk"
+       ],
+       "RunPeriod": "600s"
+    }`)
+
+func usage() {
+       fmt.Fprintf(os.Stderr, `
+
+keep-balance rebalances a set of keepstore servers. It creates new
+copies of underreplicated blocks, deletes excess copies of
+overreplicated and unreferenced blocks, and moves blocks to better
+positions (according to the rendezvous hash algorithm) so clients find
+them faster.
+
+Usage: keep-balance -config path/to/config.json [options]
+
+Options:
+`)
+       flag.PrintDefaults()
+       fmt.Fprintf(os.Stderr, `
+Example config file:
+%s
+
+    Client.AuthToken must be recognized by Arvados as an admin token,
+    and must be recognized by all Keep services as a "data manager
+    key".
+
+    Client.Insecure should be true if your Arvados API endpoint uses
+    an unverifiable SSL/TLS certificate.
+
+Periodic scanning:
+
+    By default, keep-balance operates periodically, i.e.: do a
+    scan/balance operation, sleep, repeat.
+
+    RunPeriod determines the interval between start times of
+    successive scan/balance operations. If a scan/balance operation
+    takes longer than RunPeriod, the next one will follow it
+    immediately.
+
+    If SIGUSR1 is received during an idle period between operations,
+    the next operation will start immediately.
+
+One-time scanning:
+
+    Use the -once flag to do a single operation and then exit. The
+    exit code will be zero if the operation was successful.
+
+Committing:
+
+    By default, keep-balance computes and reports changes but does not
+    implement them by sending pull and trash lists to the Keep
+    services.
+
+    Use the -commit-pulls and -commit-trash flags to implement the
+    computed changes.
+
+Limitations:
+
+    keep-balance does not attempt to discover whether committed pull
+    and trash requests ever get carried out -- only that they are
+    accepted by the Keep services. If some services are full, new
+    copies of underreplicated blocks might never get made, only
+    repeatedly requested.
+
+`, exampleConfigFile)
+}
index e1b23621af8f70aa214e43639140ca23ed2784c4..6f5f66ae0ef1bf57979f04189fe4d110818b1bd6 100644 (file)
@@ -320,6 +320,12 @@ func (h *handler) ServeHTTP(wOrig http.ResponseWriter, r *http.Request) {
                statusCode, statusText = http.StatusInternalServerError, err.Error()
                return
        }
+       if kc.Client != nil && kc.Client.Transport != nil {
+               // Workaround for https://dev.arvados.org/issues/9005
+               if t, ok := kc.Client.Transport.(*http.Transport); ok {
+                       defer t.CloseIdleConnections()
+               }
+       }
        rdr, err := kc.CollectionFileReader(collection, filename)
        if os.IsNotExist(err) {
                statusCode = http.StatusNotFound
index cda8b17d0547c62f3fdcc49825b2d9344d58e271..324588a29a11db72f8c30cfafff7095480db2822 100644 (file)
@@ -84,8 +84,14 @@ func (s *IntegrationSuite) Test1GBFile(c *check.C) {
        s.test100BlockFile(c, 10000000)
 }
 
-func (s *IntegrationSuite) Test300MBFile(c *check.C) {
-       s.test100BlockFile(c, 3000000)
+func (s *IntegrationSuite) Test100BlockFile(c *check.C) {
+       if testing.Short() {
+               // 3 MB
+               s.test100BlockFile(c, 30000)
+       } else {
+               // 300 MB
+               s.test100BlockFile(c, 3000000)
+       }
 }
 
 func (s *IntegrationSuite) test100BlockFile(c *check.C, blocksize int) {
index 7b5cd2befb8f69bd25fa62674d01590214aec5ad..4cd931037ef830dfd8a6b25022126c84c13d7036 100644 (file)
@@ -185,7 +185,7 @@ func CheckAuthorizationHeader(kc *keepclient.KeepClient, cache *ApiTokenCache, r
        }
 
        if cache.RecallToken(tok) {
-               // Valid in the cache, short circut
+               // Valid in the cache, short circuit
                return true, tok
        }
 
index 687c2fb36b7526bbbe498b86fd3c154d07ce9bd4..99da2a3a3de35de90be820b1a5285e5b592004d7 100644 (file)
@@ -10,6 +10,7 @@ import (
        "log"
        "os"
        "regexp"
+       "strconv"
        "strings"
        "sync"
        "time"
@@ -133,17 +134,36 @@ func (v *AzureBlobVolume) Check() error {
        return nil
 }
 
+// checkTrashed returns true if the expires_at metadata attribute is set on the block, i.e., the block has been trashed.
+func (v *AzureBlobVolume) checkTrashed(loc string) (bool, map[string]string, error) {
+       metadata, err := v.bsClient.GetBlobMetadata(v.containerName, loc)
+       if err != nil {
+               return false, metadata, v.translateError(err)
+       }
+       if metadata["expires_at"] != "" {
+               return true, metadata, nil
+       }
+       return false, metadata, nil
+}
+
 // Get reads a Keep block that has been stored as a block blob in the
 // container.
 //
 // If the block is younger than azureWriteRaceInterval and is
 // unexpectedly empty, assume a PutBlob operation is in progress, and
 // wait for it to finish writing.
-func (v *AzureBlobVolume) Get(loc string) ([]byte, error) {
+func (v *AzureBlobVolume) Get(loc string, buf []byte) (int, error) {
+       trashed, _, err := v.checkTrashed(loc)
+       if err != nil {
+               return 0, err
+       }
+       if trashed {
+               return 0, os.ErrNotExist
+       }
        var deadline time.Time
        haveDeadline := false
-       buf, err := v.get(loc)
-       for err == nil && len(buf) == 0 && loc != "d41d8cd98f00b204e9800998ecf8427e" {
+       size, err := v.get(loc, buf)
+       for err == nil && size == 0 && loc != "d41d8cd98f00b204e9800998ecf8427e" {
                // Seeing a brand new empty block probably means we're
                // in a race with CreateBlob, which under the hood
                // (apparently) does "CreateEmpty" and "CommitData"
@@ -163,34 +183,32 @@ func (v *AzureBlobVolume) Get(loc string) ([]byte, error) {
                } else if time.Now().After(deadline) {
                        break
                }
-               bufs.Put(buf)
                time.Sleep(azureWriteRacePollTime)
-               buf, err = v.get(loc)
+               size, err = v.get(loc, buf)
        }
        if haveDeadline {
-               log.Printf("Race ended with len(buf)==%d", len(buf))
+               log.Printf("Race ended with size==%d", size)
        }
-       return buf, err
+       return size, err
 }
 
-func (v *AzureBlobVolume) get(loc string) ([]byte, error) {
-       expectSize := BlockSize
+func (v *AzureBlobVolume) get(loc string, buf []byte) (int, error) {
+       expectSize := len(buf)
        if azureMaxGetBytes < BlockSize {
                // Unfortunately the handler doesn't tell us how long the blob
                // is expected to be, so we have to ask Azure.
                props, err := v.bsClient.GetBlobProperties(v.containerName, loc)
                if err != nil {
-                       return nil, v.translateError(err)
+                       return 0, v.translateError(err)
                }
                if props.ContentLength > int64(BlockSize) || props.ContentLength < 0 {
-                       return nil, fmt.Errorf("block %s invalid size %d (max %d)", loc, props.ContentLength, BlockSize)
+                       return 0, fmt.Errorf("block %s invalid size %d (max %d)", loc, props.ContentLength, BlockSize)
                }
                expectSize = int(props.ContentLength)
        }
 
-       buf := bufs.Get(expectSize)
        if expectSize == 0 {
-               return buf, nil
+               return 0, nil
        }
 
        // We'll update this actualSize if/when we get the last piece.
@@ -212,7 +230,7 @@ func (v *AzureBlobVolume) get(loc string) ([]byte, error) {
                        if startPos == 0 && endPos == expectSize {
                                rdr, err = v.bsClient.GetBlob(v.containerName, loc)
                        } else {
-                               rdr, err = v.bsClient.GetBlobRange(v.containerName, loc, fmt.Sprintf("%d-%d", startPos, endPos-1))
+                               rdr, err = v.bsClient.GetBlobRange(v.containerName, loc, fmt.Sprintf("%d-%d", startPos, endPos-1), nil)
                        }
                        if err != nil {
                                errors[p] = err
@@ -235,15 +253,21 @@ func (v *AzureBlobVolume) get(loc string) ([]byte, error) {
        wg.Wait()
        for _, err := range errors {
                if err != nil {
-                       bufs.Put(buf)
-                       return nil, v.translateError(err)
+                       return 0, v.translateError(err)
                }
        }
-       return buf[:actualSize], nil
+       return actualSize, nil
 }
 
 // Compare the given data with existing stored data.
 func (v *AzureBlobVolume) Compare(loc string, expect []byte) error {
+       trashed, _, err := v.checkTrashed(loc)
+       if err != nil {
+               return err
+       }
+       if trashed {
+               return os.ErrNotExist
+       }
        rdr, err := v.bsClient.GetBlob(v.containerName, loc)
        if err != nil {
                return v.translateError(err)
@@ -257,7 +281,7 @@ func (v *AzureBlobVolume) Put(loc string, block []byte) error {
        if v.readonly {
                return MethodDisabledError
        }
-       return v.bsClient.CreateBlockBlobFromReader(v.containerName, loc, uint64(len(block)), bytes.NewReader(block))
+       return v.bsClient.CreateBlockBlobFromReader(v.containerName, loc, uint64(len(block)), bytes.NewReader(block), nil)
 }
 
 // Touch updates the last-modified property of a block blob.
@@ -265,13 +289,28 @@ func (v *AzureBlobVolume) Touch(loc string) error {
        if v.readonly {
                return MethodDisabledError
        }
-       return v.bsClient.SetBlobMetadata(v.containerName, loc, map[string]string{
-               "touch": fmt.Sprintf("%d", time.Now()),
-       })
+       trashed, metadata, err := v.checkTrashed(loc)
+       if err != nil {
+               return err
+       }
+       if trashed {
+               return os.ErrNotExist
+       }
+
+       metadata["touch"] = fmt.Sprintf("%d", time.Now())
+       return v.bsClient.SetBlobMetadata(v.containerName, loc, metadata, nil)
 }
 
 // Mtime returns the last-modified property of a block blob.
 func (v *AzureBlobVolume) Mtime(loc string) (time.Time, error) {
+       trashed, _, err := v.checkTrashed(loc)
+       if err != nil {
+               return time.Time{}, err
+       }
+       if trashed {
+               return time.Time{}, os.ErrNotExist
+       }
+
        props, err := v.bsClient.GetBlobProperties(v.containerName, loc)
        if err != nil {
                return time.Time{}, err
@@ -283,7 +322,8 @@ func (v *AzureBlobVolume) Mtime(loc string) (time.Time, error) {
 // container.
 func (v *AzureBlobVolume) IndexTo(prefix string, writer io.Writer) error {
        params := storage.ListBlobsParameters{
-               Prefix: prefix,
+               Prefix:  prefix,
+               Include: "metadata",
        }
        for {
                resp, err := v.bsClient.ListBlobs(v.containerName, params)
@@ -306,6 +346,10 @@ func (v *AzureBlobVolume) IndexTo(prefix string, writer io.Writer) error {
                                // value.
                                continue
                        }
+                       if b.Metadata["expires_at"] != "" {
+                               // Trashed blob; exclude it from response
+                               continue
+                       }
                        fmt.Fprintf(writer, "%s+%d %d\n", b.Name, b.Properties.ContentLength, t.Unix())
                }
                if resp.NextMarker == "" {
@@ -321,10 +365,6 @@ func (v *AzureBlobVolume) Trash(loc string) error {
                return MethodDisabledError
        }
 
-       if trashLifetime != 0 {
-               return ErrNotImplemented
-       }
-
        // Ideally we would use If-Unmodified-Since, but that
        // particular condition seems to be ignored by Azure. Instead,
        // we get the Etag before checking Mtime, and use If-Match to
@@ -339,15 +379,38 @@ func (v *AzureBlobVolume) Trash(loc string) error {
        } else if time.Since(t) < blobSignatureTTL {
                return nil
        }
-       return v.bsClient.DeleteBlob(v.containerName, loc, map[string]string{
+
+       // If trashLifetime == 0, just delete it
+       if trashLifetime == 0 {
+               return v.bsClient.DeleteBlob(v.containerName, loc, map[string]string{
+                       "If-Match": props.Etag,
+               })
+       }
+
+       // Otherwise, mark as trash
+       return v.bsClient.SetBlobMetadata(v.containerName, loc, map[string]string{
+               "expires_at": fmt.Sprintf("%d", time.Now().Add(trashLifetime).Unix()),
+       }, map[string]string{
                "If-Match": props.Etag,
        })
 }
 
 // Untrash a Keep block.
-// TBD
+// Delete the expires_at metadata attribute, making the block readable again.
 func (v *AzureBlobVolume) Untrash(loc string) error {
-       return ErrNotImplemented
+       // If expires_at does not exist, return os.ErrNotExist
+       metadata, err := v.bsClient.GetBlobMetadata(v.containerName, loc)
+       if err != nil {
+               return v.translateError(err)
+       }
+       if metadata["expires_at"] == "" {
+               return os.ErrNotExist
+       }
+
+       // reset expires_at metadata attribute
+       metadata["expires_at"] = ""
+       err = v.bsClient.SetBlobMetadata(v.containerName, loc, metadata, nil)
+       return v.translateError(err)
 }
 
 // Status returns a VolumeStatus struct with placeholder data.
@@ -382,7 +445,7 @@ func (v *AzureBlobVolume) translateError(err error) error {
        switch {
        case err == nil:
                return err
-       case strings.Contains(err.Error(), "404 Not Found"):
+       case strings.Contains(err.Error(), "Not Found"):
                // "storage: service returned without a response body (404 Not Found)"
                return os.ErrNotExist
        default:
@@ -398,6 +461,51 @@ func (v *AzureBlobVolume) isKeepBlock(s string) bool {
 
 // EmptyTrash looks for trashed blocks that exceeded trashLifetime
 // and deletes them from the volume.
-// TBD
 func (v *AzureBlobVolume) EmptyTrash() {
+       var bytesDeleted, bytesInTrash int64
+       var blocksDeleted, blocksInTrash int
+       params := storage.ListBlobsParameters{Include: "metadata"}
+
+       for {
+               resp, err := v.bsClient.ListBlobs(v.containerName, params)
+               if err != nil {
+                       log.Printf("EmptyTrash: ListBlobs: %v", err)
+                       break
+               }
+               for _, b := range resp.Blobs {
+                       // Skip blobs that are not trashed (no expires_at metadata)
+                       if b.Metadata["expires_at"] == "" {
+                               continue
+                       }
+
+                       blocksInTrash++
+                       bytesInTrash += b.Properties.ContentLength
+
+                       expiresAt, err := strconv.ParseInt(b.Metadata["expires_at"], 10, 64)
+                       if err != nil {
+                               log.Printf("EmptyTrash: ParseInt(%v): %v", b.Metadata["expires_at"], err)
+                               continue
+                       }
+
+                       if expiresAt > time.Now().Unix() {
+                               continue
+                       }
+
+                       err = v.bsClient.DeleteBlob(v.containerName, b.Name, map[string]string{
+                               "If-Match": b.Properties.Etag,
+                       })
+                       if err != nil {
+                               log.Printf("EmptyTrash: DeleteBlob(%v): %v", b.Name, err)
+                               continue
+                       }
+                       blocksDeleted++
+                       bytesDeleted += b.Properties.ContentLength
+               }
+               if resp.NextMarker == "" {
+                       break
+               }
+               params.Marker = resp.NextMarker
+       }
+
+       log.Printf("EmptyTrash stats for %v: Deleted %v bytes in %v blocks. Remaining in trash: %v bytes in %v blocks.", v.String(), bytesDeleted, blocksDeleted, bytesInTrash-bytesDeleted, blocksInTrash-blocksDeleted)
 }
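
Taken together, Trash, Untrash, and EmptyTrash implement an expires_at-based trash lifecycle. A hedged sketch of the round trip (same package; assumes trashLifetime > 0 and that the block at loc is already older than blobSignatureTTL, otherwise Trash returns without marking anything):

    func trashAndRecover(v *AzureBlobVolume, loc string) error {
        if err := v.Trash(loc); err != nil {
            return err
        }
        // While the expires_at marker is set, reads report os.ErrNotExist.
        buf := make([]byte, BlockSize)
        if _, err := v.Get(loc, buf); !os.IsNotExist(err) {
            return fmt.Errorf("expected trashed block to be hidden, got %v", err)
        }
        // Untrash clears expires_at, making the block readable again
        // (provided EmptyTrash has not already deleted it).
        if err := v.Untrash(loc); err != nil {
            return err
        }
        _, err := v.Get(loc, buf)
        return err
    }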
index 439b40221465ada53c805c7b7afb47ba974652a9..5d556b3e8c40eb242addf53f4996c49eb396138f 100644 (file)
@@ -74,6 +74,7 @@ func (h *azStubHandler) PutRaw(container, hash string, data []byte) {
        h.blobs[container+"|"+hash] = &azBlob{
                Data:        data,
                Mtime:       time.Now(),
+               Metadata:    make(map[string]string),
                Uncommitted: make(map[string][]byte),
        }
 }
@@ -136,14 +137,23 @@ func (h *azStubHandler) ServeHTTP(rw http.ResponseWriter, r *http.Request) {
                        h.blobs[container+"|"+hash] = &azBlob{
                                Mtime:       time.Now(),
                                Uncommitted: make(map[string][]byte),
+                               Metadata:    make(map[string]string),
                                Etag:        makeEtag(),
                        }
                        h.unlockAndRace()
                }
+               metadata := make(map[string]string)
+               for k, v := range r.Header {
+                       if strings.HasPrefix(strings.ToLower(k), "x-ms-meta-") {
+                               name := k[len("x-ms-meta-"):]
+                               metadata[strings.ToLower(name)] = v[0]
+                       }
+               }
                h.blobs[container+"|"+hash] = &azBlob{
                        Data:        body,
                        Mtime:       time.Now(),
                        Uncommitted: make(map[string][]byte),
+                       Metadata:    metadata,
                        Etag:        makeEtag(),
                }
                rw.WriteHeader(http.StatusCreated)
@@ -196,11 +206,22 @@ func (h *azStubHandler) ServeHTTP(rw http.ResponseWriter, r *http.Request) {
                blob.Metadata = make(map[string]string)
                for k, v := range r.Header {
                        if strings.HasPrefix(strings.ToLower(k), "x-ms-meta-") {
-                               blob.Metadata[k] = v[0]
+                               name := k[len("x-ms-meta-"):]
+                               blob.Metadata[strings.ToLower(name)] = v[0]
                        }
                }
                blob.Mtime = time.Now()
                blob.Etag = makeEtag()
+       case (r.Method == "GET" || r.Method == "HEAD") && r.Form.Get("comp") == "metadata" && hash != "":
+               // "Get Blob Metadata" API
+               if !blobExists {
+                       rw.WriteHeader(http.StatusNotFound)
+                       return
+               }
+               for k, v := range blob.Metadata {
+                       rw.Header().Set(fmt.Sprintf("x-ms-meta-%s", k), v)
+               }
+               return
        case (r.Method == "GET" || r.Method == "HEAD") && hash != "":
                // "Get Blob" API
                if !blobExists {
@@ -265,14 +286,20 @@ func (h *azStubHandler) ServeHTTP(rw http.ResponseWriter, r *http.Request) {
                        }
                        if len(resp.Blobs) > 0 || marker == "" || marker == hash {
                                blob := h.blobs[container+"|"+hash]
-                               resp.Blobs = append(resp.Blobs, storage.Blob{
+                               bmeta := map[string]string(nil)
+                               if r.Form.Get("include") == "metadata" {
+                                       bmeta = blob.Metadata
+                               }
+                               b := storage.Blob{
                                        Name: hash,
                                        Properties: storage.BlobProperties{
                                                LastModified:  blob.Mtime.Format(time.RFC1123),
                                                ContentLength: int64(len(blob.Data)),
                                                Etag:          blob.Etag,
                                        },
-                               })
+                                       Metadata: bmeta,
+                               }
+                               resp.Blobs = append(resp.Blobs, b)
                        }
                }
                buf, err := xml.Marshal(resp)
@@ -425,13 +452,12 @@ func TestAzureBlobVolumeRangeFenceposts(t *testing.T) {
                if err != nil {
                        t.Error(err)
                }
-               gotData, err := v.Get(hash)
+               gotData := make([]byte, len(data))
+               gotLen, err := v.Get(hash, gotData)
                if err != nil {
                        t.Error(err)
                }
                gotHash := fmt.Sprintf("%x", md5.Sum(gotData))
-               gotLen := len(gotData)
-               bufs.Put(gotData)
                if gotLen != size {
                        t.Error("length mismatch: got %d != %d", gotLen, size)
                }
@@ -477,11 +503,10 @@ func TestAzureBlobVolumeCreateBlobRace(t *testing.T) {
        // Wait for the stub's Put to create the empty blob
        v.azHandler.race <- continuePut
        go func() {
-               buf, err := v.Get(TestHash)
+               buf := make([]byte, len(TestBlock))
+               _, err := v.Get(TestHash, buf)
                if err != nil {
                        t.Error(err)
-               } else {
-                       bufs.Put(buf)
                }
                close(allDone)
        }()
@@ -521,15 +546,15 @@ func TestAzureBlobVolumeCreateBlobRaceDeadline(t *testing.T) {
        allDone := make(chan struct{})
        go func() {
                defer close(allDone)
-               buf, err := v.Get(TestHash)
+               buf := make([]byte, BlockSize)
+               n, err := v.Get(TestHash, buf)
                if err != nil {
                        t.Error(err)
                        return
                }
-               if len(buf) != 0 {
-                       t.Errorf("Got %+q, expected empty buf", buf)
+               if n != 0 {
+                       t.Errorf("Got %+q, expected empty buf", buf[:n])
                }
-               bufs.Put(buf)
        }()
        select {
        case <-allDone:
index a7675fb1dcfbea5782a40ef6fc5b0d0c8bd93a8f..7c17424ba568227790469e4e32867f33fea8ff4e 100644 (file)
@@ -561,7 +561,8 @@ func TestDeleteHandler(t *testing.T) {
                        expectedDc, responseDc)
        }
        // Confirm the block has been deleted
-       _, err := vols[0].Get(TestHash)
+       buf := make([]byte, BlockSize)
+       _, err := vols[0].Get(TestHash, buf)
        var blockDeleted = os.IsNotExist(err)
        if !blockDeleted {
                t.Error("superuserExistingBlockReq: block not deleted")
@@ -585,7 +586,7 @@ func TestDeleteHandler(t *testing.T) {
                        expectedDc, responseDc)
        }
        // Confirm the block has NOT been deleted.
-       _, err = vols[0].Get(TestHash)
+       _, err = vols[0].Get(TestHash, buf)
        if err != nil {
                t.Errorf("testing delete on new block: %s\n", err)
        }
@@ -814,7 +815,7 @@ func IssueRequest(rt *RequestTester) *httptest.ResponseRecorder {
        if rt.apiToken != "" {
                req.Header.Set("Authorization", "OAuth2 "+rt.apiToken)
        }
-       loggingRouter := MakeLoggingRESTRouter()
+       loggingRouter := MakeRESTRouter()
        loggingRouter.ServeHTTP(response, req)
        return response
 }
@@ -913,6 +914,65 @@ func TestPutHandlerNoBufferleak(t *testing.T) {
        }
 }
 
+type notifyingResponseRecorder struct {
+       *httptest.ResponseRecorder
+       closer chan bool
+}
+
+func (r *notifyingResponseRecorder) CloseNotify() <-chan bool {
+       return r.closer
+}
+
+func TestGetHandlerClientDisconnect(t *testing.T) {
+       defer func(was bool) {
+               enforcePermissions = was
+       }(enforcePermissions)
+       enforcePermissions = false
+
+       defer func(orig *bufferPool) {
+               bufs = orig
+       }(bufs)
+       bufs = newBufferPool(1, BlockSize)
+       defer bufs.Put(bufs.Get(BlockSize))
+
+       KeepVM = MakeTestVolumeManager(2)
+       defer KeepVM.Close()
+
+       if err := KeepVM.AllWritable()[0].Put(TestHash, TestBlock); err != nil {
+               t.Error(err)
+       }
+
+       resp := &notifyingResponseRecorder{
+               ResponseRecorder: httptest.NewRecorder(),
+               closer:           make(chan bool, 1),
+       }
+       if _, ok := http.ResponseWriter(resp).(http.CloseNotifier); !ok {
+               t.Fatal("notifyingResponseRecorder is broken")
+       }
+       // If anyone asks, the client has disconnected.
+       resp.closer <- true
+
+       ok := make(chan struct{})
+       go func() {
+               req, _ := http.NewRequest("GET", fmt.Sprintf("/%s+%d", TestHash, len(TestBlock)), nil)
+               (&LoggingRESTRouter{MakeRESTRouter()}).ServeHTTP(resp, req)
+               ok <- struct{}{}
+       }()
+
+       select {
+       case <-time.After(20 * time.Second):
+               t.Fatal("request took >20s, close notifier must be broken")
+       case <-ok:
+       }
+
+       ExpectStatusCode(t, "client disconnect", http.StatusServiceUnavailable, resp.ResponseRecorder)
+       for i, v := range KeepVM.AllWritable() {
+               if calls := v.(*MockVolume).called["GET"]; calls != 0 {
+                       t.Errorf("volume %d got %d calls, expected 0", i, calls)
+               }
+       }
+}
+
 // Invoke the GetBlockHandler a bunch of times to test for bufferpool resource
 // leak.
 func TestGetHandlerNoBufferleak(t *testing.T) {
index 043ab69b17c255aa463fe8259a777cec682453f5..f698982415aae5bd7d8a341428acb2d8bdb57317 100644 (file)
@@ -79,18 +79,61 @@ func GetBlockHandler(resp http.ResponseWriter, req *http.Request) {
                }
        }
 
-       block, err := GetBlock(mux.Vars(req)["hash"])
+       // TODO: Probe volumes to check whether the block _might_
+       // exist. Some volumes/types could support a quick existence
+       // check without causing other operations to suffer. If all
+       // volumes support that, and assure us the block definitely
+       // isn't here, we can return 404 now instead of waiting for a
+       // buffer.
+
+       buf, err := getBufferForResponseWriter(resp, bufs, BlockSize)
        if err != nil {
-               // This type assertion is safe because the only errors
-               // GetBlock can return are DiskHashError or NotFoundError.
-               http.Error(resp, err.Error(), err.(*KeepError).HTTPCode)
+               http.Error(resp, err.Error(), http.StatusServiceUnavailable)
                return
        }
-       defer bufs.Put(block)
+       defer bufs.Put(buf)
 
-       resp.Header().Set("Content-Length", strconv.Itoa(len(block)))
+       size, err := GetBlock(mux.Vars(req)["hash"], buf, resp)
+       if err != nil {
+               code := http.StatusInternalServerError
+               if err, ok := err.(*KeepError); ok {
+                       code = err.HTTPCode
+               }
+               http.Error(resp, err.Error(), code)
+               return
+       }
+
+       resp.Header().Set("Content-Length", strconv.Itoa(size))
        resp.Header().Set("Content-Type", "application/octet-stream")
-       resp.Write(block)
+       resp.Write(buf[:size])
+}
+
+// Get a buffer from the pool -- but give up and return a non-nil
+// error if resp implements http.CloseNotifier and tells us that the
+// client has disconnected before we get a buffer.
+func getBufferForResponseWriter(resp http.ResponseWriter, bufs *bufferPool, bufSize int) ([]byte, error) {
+       var closeNotifier <-chan bool
+       if resp, ok := resp.(http.CloseNotifier); ok {
+               closeNotifier = resp.CloseNotify()
+       }
+       var buf []byte
+       bufReady := make(chan []byte)
+       go func() {
+               bufReady <- bufs.Get(bufSize)
+               close(bufReady)
+       }()
+       select {
+       case buf = <-bufReady:
+               return buf, nil
+       case <-closeNotifier:
+               go func() {
+                       // Even if closeNotifier happened first, we
+                       // need to keep waiting for our buf so we can
+                       // return it to the pool.
+                       bufs.Put(<-bufReady)
+               }()
+               return nil, ErrClientDisconnect
+       }
 }
 
 // PutBlockHandler is a HandleFunc to address Put block requests.
@@ -116,8 +159,13 @@ func PutBlockHandler(resp http.ResponseWriter, req *http.Request) {
                return
        }
 
-       buf := bufs.Get(int(req.ContentLength))
-       _, err := io.ReadFull(req.Body, buf)
+       buf, err := getBufferForResponseWriter(resp, bufs, int(req.ContentLength))
+       if err != nil {
+               http.Error(resp, err.Error(), http.StatusServiceUnavailable)
+               return
+       }
+
+       _, err = io.ReadFull(req.Body, buf)
        if err != nil {
                http.Error(resp, err.Error(), 500)
                bufs.Put(buf)
@@ -481,7 +529,6 @@ func UntrashHandler(resp http.ResponseWriter, req *http.Request) {
        }
 }
 
-// ==============================
 // GetBlock and PutBlock implement lower-level code for handling
 // blocks by rooting through volumes connected to the local machine.
 // Once the handler has determined that system policy permits the
@@ -492,24 +539,21 @@ func UntrashHandler(resp http.ResponseWriter, req *http.Request) {
 // should be the only part of the code that cares about which volume a
 // block is stored on, so it should be responsible for figuring out
 // which volume to check for fetching blocks, storing blocks, etc.
-// ==============================
 
-// GetBlock fetches and returns the block identified by "hash".
-//
-// On success, GetBlock returns a byte slice with the block data, and
-// a nil error.
+// GetBlock fetches the block identified by "hash" into the provided
+// buf, and returns the data size.
 //
 // If the block cannot be found on any volume, returns NotFoundError.
 //
 // If the block found does not have the correct MD5 hash, returns
 // DiskHashError.
 //
-func GetBlock(hash string) ([]byte, error) {
+func GetBlock(hash string, buf []byte, resp http.ResponseWriter) (int, error) {
        // Attempt to read the requested hash from a keep volume.
        errorToCaller := NotFoundError
 
        for _, vol := range KeepVM.AllReadable() {
-               buf, err := vol.Get(hash)
+               size, err := vol.Get(hash, buf)
                if err != nil {
                        // IsNotExist is an expected error and may be
                        // ignored. All other errors are logged. In
@@ -523,23 +567,22 @@ func GetBlock(hash string) ([]byte, error) {
                }
                // Check the file checksum.
                //
-               filehash := fmt.Sprintf("%x", md5.Sum(buf))
+               filehash := fmt.Sprintf("%x", md5.Sum(buf[:size]))
                if filehash != hash {
                        // TODO: Try harder to tell a sysadmin about
                        // this.
                        log.Printf("%s: checksum mismatch for request %s (actual %s)",
                                vol, hash, filehash)
                        errorToCaller = DiskHashError
-                       bufs.Put(buf)
                        continue
                }
                if errorToCaller == DiskHashError {
                        log.Printf("%s: checksum mismatch for request %s but a good copy was found on another volume and returned",
                                vol, hash)
                }
-               return buf, nil
+               return size, nil
        }
-       return nil, errorToCaller
+       return 0, errorToCaller
 }
 
 // PutBlock Stores the BLOCK (identified by the content id HASH) in Keep.
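
The signature change above means callers of GetBlock now own the buffer: allocate (or borrow from the buffer pool) a slice of up to BlockSize bytes, pass it in, and use only the first size bytes on return. A minimal sketch of the calling pattern, assuming hash holds a hex locator (the process helper is illustrative only):

    buf := make([]byte, BlockSize)
    size, err := GetBlock(hash, buf, nil) // resp may be nil outside an HTTP handler
    if err != nil {
            // err is NotFoundError, DiskHashError, or a lower-level volume error
            return err
    }
    process(buf[:size]) // only the first size bytes contain block data
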
index c5349d399c32ebc5692d0a39d4cc8c9c3ad83e1a..dda7edcec3509e683465a93d5eb775bf18f16d19 100644 (file)
@@ -45,12 +45,13 @@ func testGetBlock(t TB, factory TestableVolumeManagerFactory, testHash string, t
        testableVolumes[1].PutRaw(testHash, testBlock)
 
        // Get should pass
-       buf, err := GetBlock(testHash)
+       buf := make([]byte, len(testBlock))
+       n, err := GetBlock(testHash, buf, nil)
        if err != nil {
                t.Fatalf("Error while getting block %s", err)
        }
-       if bytes.Compare(buf, testBlock) != 0 {
-               t.Errorf("Put succeeded but Get returned %+v, expected %+v", buf, testBlock)
+       if bytes.Compare(buf[:n], testBlock) != 0 {
+               t.Errorf("Put succeeded but Get returned %+v, expected %+v", buf[:n], testBlock)
        }
 }
 
@@ -64,9 +65,10 @@ func testPutRawBadDataGetBlock(t TB, factory TestableVolumeManagerFactory,
        testableVolumes[1].PutRaw(testHash, badData)
 
        // Get should fail
-       _, err := GetBlock(testHash)
+       buf := make([]byte, BlockSize)
+       size, err := GetBlock(testHash, buf, nil)
        if err == nil {
-               t.Fatalf("Expected error while getting corrupt block %v", testHash)
+               t.Fatalf("Got %+q, expected error while getting corrupt block %v", buf[:size], testHash)
        }
 }
 
@@ -85,11 +87,12 @@ func testPutBlock(t TB, factory TestableVolumeManagerFactory, testHash string, t
        }
 
        // Check that PutBlock stored the data as expected
-       buf, err := GetBlock(testHash)
+       buf := make([]byte, BlockSize)
+       size, err := GetBlock(testHash, buf, nil)
        if err != nil {
                t.Fatalf("Error during GetBlock for %q: %s", testHash, err)
-       } else if bytes.Compare(buf, testBlock) != 0 {
-               t.Errorf("Get response incorrect. Expected %q; found %q", testBlock, buf)
+       } else if bytes.Compare(buf[:size], testBlock) != 0 {
+               t.Errorf("Get response incorrect. Expected %q; found %q", testBlock, buf[:size])
        }
 }
 
@@ -109,10 +112,11 @@ func testPutBlockCorrupt(t TB, factory TestableVolumeManagerFactory,
 
        // Put succeeded and overwrote the badData in one volume,
        // and Get should return the testBlock now, ignoring the bad data.
-       buf, err := GetBlock(testHash)
+       buf := make([]byte, BlockSize)
+       size, err := GetBlock(testHash, buf, nil)
        if err != nil {
                t.Fatalf("Error during GetBlock for %q: %s", testHash, err)
-       } else if bytes.Compare(buf, testBlock) != 0 {
-               t.Errorf("Get response incorrect. Expected %q; found %q", testBlock, buf)
+       } else if bytes.Compare(buf[:size], testBlock) != 0 {
+               t.Errorf("Get response incorrect. Expected %q; found %q", testBlock, buf[:size])
        }
 }
index 40e62c5c50146aa6a89f0bcf6566eb194b943b73..819d52fe0adecd71670ab89d57f1967b64368b4a 100644 (file)
@@ -4,6 +4,8 @@ import (
        "bytes"
        "flag"
        "fmt"
+       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/httpserver"
        "git.curoverse.com/arvados.git/sdk/go/keepclient"
        "io/ioutil"
        "log"
@@ -90,6 +92,7 @@ var (
        TooLongError        = &KeepError{413, "Block is too large"}
        MethodDisabledError = &KeepError{405, "Method disabled"}
        ErrNotImplemented   = &KeepError{500, "Unsupported configuration"}
+       ErrClientDisconnect = &KeepError{503, "Client disconnected"}
 )
 
 func (e *KeepError) Error() string {
@@ -145,6 +148,7 @@ func main() {
                blobSigningKeyFile   string
                permissionTTLSec     int
                pidfile              string
+               maxRequests          int
        )
        flag.StringVar(
                &dataManagerTokenFile,
@@ -162,6 +166,11 @@ func main() {
                "listen",
                DefaultAddr,
                "Listening address, in the form \"host:port\". e.g., 10.0.1.24:8000. Omit the host part to listen on all interfaces.")
+       flag.IntVar(
+               &maxRequests,
+               "max-requests",
+               0,
+               "Maximum concurrent requests. When this limit is reached, new requests will receive 503 responses. Note: this limit does not include idle connections from clients using HTTP keepalive, so it does not strictly limit the number of concurrent connections. (default 2 * max-buffers)")
        flag.BoolVar(
                &neverDelete,
                "never-delete",
@@ -189,7 +198,7 @@ func main() {
                &permissionTTLSec,
                "blob-signature-ttl",
                int(time.Duration(2*7*24*time.Hour).Seconds()),
-               "Lifetime of blob permission signatures. "+
+               "Lifetime of blob permission signatures. Modifying the ttl will invalidate all existing signatures. "+
                        "See services/api/config/application.default.yml.")
        flag.BoolVar(
                &flagSerializeIO,
@@ -302,13 +311,18 @@ func main() {
                }
        }
 
+       if maxRequests <= 0 {
+               maxRequests = maxBuffers * 2
+               log.Printf("-max-requests <1 or not specified; defaulting to maxBuffers * 2 == %d", maxRequests)
+       }
+
        // Start a round-robin VolumeManager with the volumes we have found.
        KeepVM = MakeRRVolumeManager(volumes)
 
-       // Tell the built-in HTTP server to direct all requests to the REST router.
-       loggingRouter := MakeLoggingRESTRouter()
-       http.HandleFunc("/", func(resp http.ResponseWriter, req *http.Request) {
-               loggingRouter.ServeHTTP(resp, req)
+       // Middleware stack: logger, maxRequests limiter, method handlers
+       http.Handle("/", &LoggingRESTRouter{
+               httpserver.NewRequestLimiter(maxRequests,
+                       MakeRESTRouter()),
        })
 
        // Set up a TCP listener.
@@ -319,7 +333,7 @@ func main() {
 
        // Initialize Pull queue and worker
        keepClient := &keepclient.KeepClient{
-               Arvados:       nil,
+               Arvados:       &arvadosclient.ArvadosClient{},
                Want_replicas: 1,
                Client:        &http.Client{},
        }
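
With the new -max-requests flag, keepstore caps concurrent requests at maxRequests (defaulting to 2 * max-buffers) by wrapping the router in httpserver.NewRequestLimiter underneath the logging layer. The general shape of such a limiter is a buffered channel used as a semaphore; the sketch below is illustrative only and is not the actual sdk/go/httpserver implementation:

    type limiter struct {
            sem     chan struct{}
            handler http.Handler
    }

    func newLimiter(n int, h http.Handler) http.Handler {
            return &limiter{sem: make(chan struct{}, n), handler: h}
    }

    func (l *limiter) ServeHTTP(w http.ResponseWriter, r *http.Request) {
            select {
            case l.sem <- struct{}{}:
                    // A slot is available; release it when the request finishes.
                    defer func() { <-l.sem }()
                    l.handler.ServeHTTP(w, r)
            default:
                    // Over the limit: reject without tying up a buffer.
                    http.Error(w, "server busy", http.StatusServiceUnavailable)
            }
    }
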
index 2a1c3d243ab922855b2bf6344f69631a78272662..c0adbc0bd74dad7d115dfe70a3374b2815704c56 100644 (file)
@@ -66,12 +66,13 @@ func TestGetBlock(t *testing.T) {
        }
 
        // Check that GetBlock returns success.
-       result, err := GetBlock(TestHash)
+       buf := make([]byte, BlockSize)
+       size, err := GetBlock(TestHash, buf, nil)
        if err != nil {
                t.Errorf("GetBlock error: %s", err)
        }
-       if fmt.Sprint(result) != fmt.Sprint(TestBlock) {
-               t.Errorf("expected %s, got %s", TestBlock, result)
+       if bytes.Compare(buf[:size], TestBlock) != 0 {
+               t.Errorf("got %v, expected %v", buf[:size], TestBlock)
        }
 }
 
@@ -86,9 +87,10 @@ func TestGetBlockMissing(t *testing.T) {
        defer KeepVM.Close()
 
        // Check that GetBlock returns failure.
-       result, err := GetBlock(TestHash)
+       buf := make([]byte, BlockSize)
+       size, err := GetBlock(TestHash, buf, nil)
        if err != NotFoundError {
-               t.Errorf("Expected NotFoundError, got %v", result)
+               t.Errorf("Expected NotFoundError, got %v, err %v", buf[:size], err)
        }
 }
 
@@ -107,9 +109,10 @@ func TestGetBlockCorrupt(t *testing.T) {
        vols[0].Put(TestHash, BadBlock)
 
        // Check that GetBlock returns failure.
-       result, err := GetBlock(TestHash)
+       buf := make([]byte, BlockSize)
+       size, err := GetBlock(TestHash, buf, nil)
        if err != DiskHashError {
-               t.Errorf("Expected DiskHashError, got %v (buf: %v)", err, result)
+               t.Errorf("Expected DiskHashError, got %v (buf: %v)", err, buf[:size])
        }
 }
 
@@ -133,13 +136,14 @@ func TestPutBlockOK(t *testing.T) {
        }
 
        vols := KeepVM.AllReadable()
-       result, err := vols[1].Get(TestHash)
+       buf := make([]byte, BlockSize)
+       n, err := vols[1].Get(TestHash, buf)
        if err != nil {
                t.Fatalf("Volume #0 Get returned error: %v", err)
        }
-       if string(result) != string(TestBlock) {
+       if string(buf[:n]) != string(TestBlock) {
                t.Fatalf("PutBlock stored '%s', Get retrieved '%s'",
-                       string(TestBlock), string(result))
+                       string(TestBlock), string(buf[:n]))
        }
 }
 
@@ -162,14 +166,14 @@ func TestPutBlockOneVol(t *testing.T) {
                t.Fatalf("PutBlock: n %d err %v", n, err)
        }
 
-       result, err := GetBlock(TestHash)
+       buf := make([]byte, BlockSize)
+       size, err := GetBlock(TestHash, buf, nil)
        if err != nil {
                t.Fatalf("GetBlock: %v", err)
        }
-       if string(result) != string(TestBlock) {
-               t.Error("PutBlock/GetBlock mismatch")
-               t.Fatalf("PutBlock stored '%s', GetBlock retrieved '%s'",
-                       string(TestBlock), string(result))
+       if bytes.Compare(buf[:size], TestBlock) != 0 {
+               t.Fatalf("PutBlock stored %+q, GetBlock retrieved %+q",
+                       TestBlock, buf[:size])
        }
 }
 
@@ -191,7 +195,7 @@ func TestPutBlockMD5Fail(t *testing.T) {
        }
 
        // Confirm that GetBlock fails to return anything.
-       if result, err := GetBlock(TestHash); err != NotFoundError {
+       if result, err := GetBlock(TestHash, make([]byte, BlockSize), nil); err != NotFoundError {
                t.Errorf("GetBlock succeeded after a corrupt block store (result = %s, err = %v)",
                        string(result), err)
        }
@@ -216,10 +220,11 @@ func TestPutBlockCorrupt(t *testing.T) {
        }
 
        // The block on disk should now match TestBlock.
-       if block, err := GetBlock(TestHash); err != nil {
+       buf := make([]byte, BlockSize)
+       if size, err := GetBlock(TestHash, buf, nil); err != nil {
                t.Errorf("GetBlock: %v", err)
-       } else if bytes.Compare(block, TestBlock) != 0 {
-               t.Errorf("GetBlock returned: '%s'", string(block))
+       } else if bytes.Compare(buf[:size], TestBlock) != 0 {
+               t.Errorf("Got %+q, expected %+q", buf[:size], TestBlock)
        }
 }
 
@@ -290,12 +295,13 @@ func TestPutBlockTouchFails(t *testing.T) {
                t.Errorf("mtime was changed on vols[0]:\noldMtime = %v\nnewMtime = %v\n",
                        oldMtime, newMtime)
        }
-       result, err := vols[1].Get(TestHash)
+       buf := make([]byte, BlockSize)
+       n, err := vols[1].Get(TestHash, buf)
        if err != nil {
                t.Fatalf("vols[1]: %v", err)
        }
-       if bytes.Compare(result, TestBlock) != 0 {
-               t.Errorf("new block does not match test block\nnew block = %v\n", result)
+       if bytes.Compare(buf[:n], TestBlock) != 0 {
+               t.Errorf("new block does not match test block\nnew block = %v\n", buf[:n])
        }
 }
 
index 9edfb6e69d15356bd7bee516e8993e0d98953bc5..0f556b538ac7ae15b1939f61bad13be3ed0404e5 100644 (file)
@@ -4,7 +4,6 @@ package main
 // LoggingResponseWriter
 
 import (
-       "github.com/gorilla/mux"
        "log"
        "net/http"
        "strings"
@@ -20,51 +19,59 @@ type LoggingResponseWriter struct {
        sentHdr      time.Time
 }
 
+// CloseNotify implements http.CloseNotifier.
+func (resp *LoggingResponseWriter) CloseNotify() <-chan bool {
+       wrapped, ok := resp.ResponseWriter.(http.CloseNotifier)
+       if !ok {
+               // If upstream doesn't implement CloseNotifier, we can
+               // satisfy the interface by returning a channel that
+               // never sends anything (the interface doesn't
+               // guarantee that anything will ever be sent on the
+               // channel even if the client disconnects).
+               return nil
+       }
+       return wrapped.CloseNotify()
+}
+
 // WriteHeader writes header to ResponseWriter
-func (loggingWriter *LoggingResponseWriter) WriteHeader(code int) {
-       if loggingWriter.sentHdr == zeroTime {
-               loggingWriter.sentHdr = time.Now()
+func (resp *LoggingResponseWriter) WriteHeader(code int) {
+       if resp.sentHdr == zeroTime {
+               resp.sentHdr = time.Now()
        }
-       loggingWriter.Status = code
-       loggingWriter.ResponseWriter.WriteHeader(code)
+       resp.Status = code
+       resp.ResponseWriter.WriteHeader(code)
 }
 
 var zeroTime time.Time
 
-func (loggingWriter *LoggingResponseWriter) Write(data []byte) (int, error) {
-       if loggingWriter.Length == 0 && len(data) > 0 && loggingWriter.sentHdr == zeroTime {
-               loggingWriter.sentHdr = time.Now()
+func (resp *LoggingResponseWriter) Write(data []byte) (int, error) {
+       if resp.Length == 0 && len(data) > 0 && resp.sentHdr == zeroTime {
+               resp.sentHdr = time.Now()
        }
-       loggingWriter.Length += len(data)
-       if loggingWriter.Status >= 400 {
-               loggingWriter.ResponseBody += string(data)
+       resp.Length += len(data)
+       if resp.Status >= 400 {
+               resp.ResponseBody += string(data)
        }
-       return loggingWriter.ResponseWriter.Write(data)
+       return resp.ResponseWriter.Write(data)
 }
 
 // LoggingRESTRouter is used to add logging capabilities to mux.Router
 type LoggingRESTRouter struct {
-       router *mux.Router
-}
-
-// MakeLoggingRESTRouter initializes LoggingRESTRouter
-func MakeLoggingRESTRouter() *LoggingRESTRouter {
-       router := MakeRESTRouter()
-       return (&LoggingRESTRouter{router})
+       router http.Handler
 }
 
-func (loggingRouter *LoggingRESTRouter) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+func (loggingRouter *LoggingRESTRouter) ServeHTTP(wrappedResp http.ResponseWriter, req *http.Request) {
        t0 := time.Now()
-       loggingWriter := LoggingResponseWriter{http.StatusOK, 0, resp, "", zeroTime}
-       loggingRouter.router.ServeHTTP(&loggingWriter, req)
-       statusText := http.StatusText(loggingWriter.Status)
-       if loggingWriter.Status >= 400 {
-               statusText = strings.Replace(loggingWriter.ResponseBody, "\n", "", -1)
+       resp := LoggingResponseWriter{http.StatusOK, 0, wrappedResp, "", zeroTime}
+       loggingRouter.router.ServeHTTP(&resp, req)
+       statusText := http.StatusText(resp.Status)
+       if resp.Status >= 400 {
+               statusText = strings.Replace(resp.ResponseBody, "\n", "", -1)
        }
        now := time.Now()
        tTotal := now.Sub(t0)
-       tLatency := loggingWriter.sentHdr.Sub(t0)
-       tResponse := now.Sub(loggingWriter.sentHdr)
-       log.Printf("[%s] %s %s %d %.6fs %.6fs %.6fs %d %d \"%s\"", req.RemoteAddr, req.Method, req.URL.Path[1:], req.ContentLength, tTotal.Seconds(), tLatency.Seconds(), tResponse.Seconds(), loggingWriter.Status, loggingWriter.Length, statusText)
+       tLatency := resp.sentHdr.Sub(t0)
+       tResponse := now.Sub(resp.sentHdr)
+       log.Printf("[%s] %s %s %d %.6fs %.6fs %.6fs %d %d \"%s\"", req.RemoteAddr, req.Method, req.URL.Path[1:], req.ContentLength, tTotal.Seconds(), tLatency.Seconds(), tResponse.Seconds(), resp.Status, resp.Length, statusText)
 
 }
diff --git a/services/keepstore/logging_router_test.go b/services/keepstore/logging_router_test.go
new file mode 100644 (file)
index 0000000..aa88556
--- /dev/null
@@ -0,0 +1,10 @@
+package main
+
+import (
+       "net/http"
+       "testing"
+)
+
+func TestLoggingResponseWriterImplementsCloseNotifier(t *testing.T) {
+       http.ResponseWriter(&LoggingResponseWriter{}).(http.CloseNotifier).CloseNotify()
+}
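
The test above verifies at run time that LoggingResponseWriter satisfies http.CloseNotifier. A common alternative is a compile-time interface assertion; a sketch of that idiom:

    var _ http.CloseNotifier = (*LoggingResponseWriter)(nil)
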
index 6168a321c27e464fff5d0555ed363b2636331c76..9cd97bd3b746b1d66c0eba3b002fe5c9b8d70083 100644 (file)
@@ -13,7 +13,7 @@ var PermissionSecret []byte
 // SignLocator takes a blobLocator, an apiToken and an expiry time, and
 // returns a signed locator string.
 func SignLocator(blobLocator, apiToken string, expiry time.Time) string {
-       return keepclient.SignLocator(blobLocator, apiToken, expiry, PermissionSecret)
+       return keepclient.SignLocator(blobLocator, apiToken, expiry, blobSignatureTTL, PermissionSecret)
 }
 
 // VerifySignature returns nil if the signature on the signedLocator
@@ -22,7 +22,7 @@ func SignLocator(blobLocator, apiToken string, expiry time.Time) string {
 // something the client could have figured out independently) or
 // PermissionError.
 func VerifySignature(signedLocator, apiToken string) error {
-       err := keepclient.VerifySignature(signedLocator, apiToken, PermissionSecret)
+       err := keepclient.VerifySignature(signedLocator, apiToken, blobSignatureTTL, PermissionSecret)
        if err == keepclient.ErrSignatureExpired {
                return ExpiredError
        } else if err != nil {
index f4443fc7be1b423c4f535cccae66f0de32e71648..43717b23720d8c71b32c126810f8e39dd41a0429 100644 (file)
@@ -17,7 +17,8 @@ const (
                "gokee3eamvjy8qq1fvy238838enjmy5wzy2md7yvsitp5vztft6j4q866efym7e6" +
                "vu5wm9fpnwjyxfldw3vbo01mgjs75rgo7qioh8z8ij7jpyp8508okhgbbex3ceei" +
                "786u5rw2a9gx743dj3fgq2irk"
-       knownSignature     = "257f3f5f5f0a4e4626a18fc74bd42ec34dcb228a"
+       knownSignatureTTL  = 1209600 * time.Second
+       knownSignature     = "89118b78732c33104a4d6231e8b5a5fa1e4301e3"
        knownTimestamp     = "7fffffff"
        knownSigHint       = "+A" + knownSignature + "@" + knownTimestamp
        knownSignedLocator = knownLocator + knownSigHint
@@ -34,6 +35,8 @@ func TestSignLocator(t *testing.T) {
        }
        t0 := time.Unix(tsInt, 0)
 
+       blobSignatureTTL = knownSignatureTTL
+
        PermissionSecret = []byte(knownKey)
        if x := SignLocator(knownLocator, knownToken, t0); x != knownSignedLocator {
                t.Fatalf("Got %+q, expected %+q", x, knownSignedLocator)
@@ -50,6 +53,8 @@ func TestVerifyLocator(t *testing.T) {
                PermissionSecret = b
        }(PermissionSecret)
 
+       blobSignatureTTL = knownSignatureTTL
+
        PermissionSecret = []byte(knownKey)
        if err := VerifySignature(knownSignedLocator, knownToken); err != nil {
                t.Fatal(err)
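
Because the signature now incorporates blobSignatureTTL, the signing and verifying sides must agree on the TTL as well as on PermissionSecret. A rough usage sketch using the package's own wrappers (blobLocator and apiToken are placeholders):

    PermissionSecret = []byte(knownKey)
    blobSignatureTTL = knownSignatureTTL
    expiry := time.Now().Add(blobSignatureTTL)
    signed := SignLocator(blobLocator, apiToken, expiry)
    if err := VerifySignature(signed, apiToken); err != nil {
            // ExpiredError or PermissionError
    }

As the updated -blob-signature-ttl help text notes, changing the TTL invalidates signatures issued under the old value.
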
index 79a680d58a3efebab11467ca2f3a474d2e0d0feb..80a7c89f2ed4f6669566711c40c4d0a59940e439 100644 (file)
@@ -10,6 +10,7 @@ import (
        "net/http"
        "os"
        "regexp"
+       "strings"
        "time"
 
        "github.com/AdRoll/goamz/aws"
@@ -153,20 +154,18 @@ func (v *S3Volume) Check() error {
        return nil
 }
 
-func (v *S3Volume) Get(loc string) ([]byte, error) {
+func (v *S3Volume) Get(loc string, buf []byte) (int, error) {
        rdr, err := v.Bucket.GetReader(loc)
        if err != nil {
-               return nil, v.translateError(err)
+               return 0, v.translateError(err)
        }
        defer rdr.Close()
-       buf := bufs.Get(BlockSize)
        n, err := io.ReadFull(rdr, buf)
        switch err {
        case nil, io.EOF, io.ErrUnexpectedEOF:
-               return buf[:n], nil
+               return n, nil
        default:
-               bufs.Put(buf)
-               return nil, v.translateError(err)
+               return 0, v.translateError(err)
        }
 }
 
@@ -312,7 +311,8 @@ func (v *S3Volume) isKeepBlock(s string) bool {
 func (v *S3Volume) translateError(err error) error {
        switch err := err.(type) {
        case *s3.Error:
-               if err.StatusCode == http.StatusNotFound && err.Code == "NoSuchKey" {
+               if (err.StatusCode == http.StatusNotFound && err.Code == "NoSuchKey") ||
+                       strings.Contains(err.Error(), "Not Found") {
                        return os.ErrNotExist
                }
                // Other 404 errors like NoSuchVersion and
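
S3Volume.Get now reads directly into the caller's buffer with io.ReadFull, which is why io.EOF and io.ErrUnexpectedEOF count as success: an object shorter than buf is normal, and n still reports how many bytes were read. Roughly:

    n, err := io.ReadFull(rdr, buf)
    switch err {
    case nil, io.EOF, io.ErrUnexpectedEOF:
            // buf[:n] holds the whole object (io.EOF only occurs for an empty object)
    default:
            // a genuine read error
    }
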
index 1d3063a9de10651cf675062a439403372d39f49f..d111caeac8e5b571202502e0aea63f07816365ba 100644 (file)
@@ -163,7 +163,7 @@ func TestTrashWorkerIntegration_TwoDifferentLocatorsInVolume1(t *testing.T) {
 }
 
 /* Allow default Trash Life time to be used. Thus, the newly created block
-   will not be deleted becuase its Mtime is within the trash life time.
+   will not be deleted because its Mtime is within the trash life time.
 */
 func TestTrashWorkerIntegration_SameLocatorInTwoVolumesWithDefaultTrashLifeTime(t *testing.T) {
        neverDelete = false
@@ -290,26 +290,27 @@ func performTrashWorkerTest(testData TrashWorkerTestData, t *testing.T) {
        expectEqualWithin(t, time.Second, 0, func() interface{} { return trashq.Status().InProgress })
 
        // Verify Locator1 to be un/deleted as expected
-       data, _ := GetBlock(testData.Locator1)
+       buf := make([]byte, BlockSize)
+       size, err := GetBlock(testData.Locator1, buf, nil)
        if testData.ExpectLocator1 {
-               if len(data) == 0 {
+               if size == 0 || err != nil {
                        t.Errorf("Expected Locator1 to be still present: %s", testData.Locator1)
                }
        } else {
-               if len(data) > 0 {
+               if size > 0 || err == nil {
                        t.Errorf("Expected Locator1 to be deleted: %s", testData.Locator1)
                }
        }
 
        // Verify Locator2 to be un/deleted as expected
        if testData.Locator1 != testData.Locator2 {
-               data, _ = GetBlock(testData.Locator2)
+               size, err = GetBlock(testData.Locator2, buf, nil)
                if testData.ExpectLocator2 {
-                       if len(data) == 0 {
+                       if size == 0 || err != nil {
                                t.Errorf("Expected Locator2 to be still present: %s", testData.Locator2)
                        }
                } else {
-                       if len(data) > 0 {
+                       if size > 0 || err == nil {
                                t.Errorf("Expected Locator2 to be deleted: %s", testData.Locator2)
                        }
                }
@@ -321,7 +322,8 @@ func performTrashWorkerTest(testData TrashWorkerTestData, t *testing.T) {
        if testData.DifferentMtimes {
                locatorFoundIn := 0
                for _, volume := range KeepVM.AllReadable() {
-                       if _, err := volume.Get(testData.Locator1); err == nil {
+                       buf := make([]byte, BlockSize)
+                       if _, err := volume.Get(testData.Locator1, buf); err == nil {
                                locatorFoundIn = locatorFoundIn + 1
                        }
                }
index 17da54fdadbca571cae93fde18342d2776b4e3a7..8ae6660fd477fa90365a019c837a121a08cc9595 100644 (file)
@@ -10,17 +10,14 @@ import (
 // for example, a single mounted disk, a RAID array, an Amazon S3 volume,
 // etc.
 type Volume interface {
-       // Get a block. IFF the returned error is nil, the caller must
-       // put the returned slice back into the buffer pool when it's
-       // finished with it. (Otherwise, the buffer pool will be
-       // depleted and eventually -- when all available buffers are
-       // used and not returned -- operations will reach deadlock.)
+       // Get a block: copy the block data into buf, and return the
+       // number of bytes copied.
        //
        // loc is guaranteed to consist of 32 or more lowercase hex
        // digits.
        //
-       // Get should not verify the integrity of the returned data:
-       // it should just return whatever was found in its backing
+       // Get should not verify the integrity of the data: it should
+       // just return whatever was found in its backing
        // store. (Integrity checking is the caller's responsibility.)
        //
        // If an error is encountered that prevents it from
@@ -36,10 +33,12 @@ type Volume interface {
        // access log if the block is not found on any other volumes
        // either).
        //
-       // If the data in the backing store is bigger than BlockSize,
-       // Get is permitted to return an error without reading any of
-       // the data.
-       Get(loc string) ([]byte, error)
+       // If the data in the backing store is bigger than len(buf),
+       // then Get is permitted to return an error without reading
+       // any of the data.
+       //
+       // len(buf) will not exceed BlockSize.
+       Get(loc string, buf []byte) (int, error)
 
        // Compare the given data with the stored data (i.e., what Get
        // would return). If equal, return nil. If not, return
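
To make the new Get contract concrete, a minimal in-memory implementation might look like the following. This is a sketch only; memVolume is not a real keepstore type, and it simply reuses the os.ErrNotExist and TooLongError conventions seen elsewhere in this change:

    type memVolume struct {
            store map[string][]byte
    }

    func (v *memVolume) Get(loc string, buf []byte) (int, error) {
            block, ok := v.store[loc]
            if !ok {
                    return 0, os.ErrNotExist
            }
            if len(block) > len(buf) {
                    return 0, TooLongError
            }
            // copy returns the number of bytes copied, i.e. len(block) here.
            return copy(buf, block), nil
    }
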
index 95166c252f004bef5a1ba1f583f4e13f54117f47..f8fe0d0ebce719c6c823fe9caa9fcce12324eb49 100644 (file)
@@ -89,14 +89,13 @@ func testGet(t TB, factory TestableVolumeFactory) {
 
        v.PutRaw(TestHash, TestBlock)
 
-       buf, err := v.Get(TestHash)
+       buf := make([]byte, BlockSize)
+       n, err := v.Get(TestHash, buf)
        if err != nil {
                t.Fatal(err)
        }
 
-       bufs.Put(buf)
-
-       if bytes.Compare(buf, TestBlock) != 0 {
+       if bytes.Compare(buf[:n], TestBlock) != 0 {
                t.Errorf("expected %s, got %s", string(TestBlock), string(buf))
        }
 }
@@ -107,7 +106,8 @@ func testGetNoSuchBlock(t TB, factory TestableVolumeFactory) {
        v := factory(t)
        defer v.Teardown()
 
-       if _, err := v.Get(TestHash2); err == nil {
+       buf := make([]byte, BlockSize)
+       if _, err := v.Get(TestHash2, buf); err == nil {
                t.Errorf("Expected error while getting non-existing block %v", TestHash2)
        }
 }
@@ -208,24 +208,22 @@ func testPutBlockWithDifferentContent(t TB, factory TestableVolumeFactory, testH
        v.PutRaw(testHash, testDataA)
 
        putErr := v.Put(testHash, testDataB)
-       buf, getErr := v.Get(testHash)
+       buf := make([]byte, BlockSize)
+       n, getErr := v.Get(testHash, buf)
        if putErr == nil {
                // Put must not return a nil error unless it has
                // overwritten the existing data.
-               if bytes.Compare(buf, testDataB) != 0 {
-                       t.Errorf("Put succeeded but Get returned %+q, expected %+q", buf, testDataB)
+               if bytes.Compare(buf[:n], testDataB) != 0 {
+                       t.Errorf("Put succeeded but Get returned %+q, expected %+q", buf[:n], testDataB)
                }
        } else {
                // It is permissible for Put to fail, but it must
                // leave us with either the original data, the new
                // data, or nothing at all.
-               if getErr == nil && bytes.Compare(buf, testDataA) != 0 && bytes.Compare(buf, testDataB) != 0 {
-                       t.Errorf("Put failed but Get returned %+q, which is neither %+q nor %+q", buf, testDataA, testDataB)
+               if getErr == nil && bytes.Compare(buf[:n], testDataA) != 0 && bytes.Compare(buf[:n], testDataB) != 0 {
+                       t.Errorf("Put failed but Get returned %+q, which is neither %+q nor %+q", buf[:n], testDataA, testDataB)
                }
        }
-       if getErr == nil {
-               bufs.Put(buf)
-       }
 }
 
 // Put and get multiple blocks
@@ -253,34 +251,32 @@ func testPutMultipleBlocks(t TB, factory TestableVolumeFactory) {
                t.Errorf("Got err putting block %q: %q, expected nil", TestBlock3, err)
        }
 
-       data, err := v.Get(TestHash)
+       data := make([]byte, BlockSize)
+       n, err := v.Get(TestHash, data)
        if err != nil {
                t.Error(err)
        } else {
-               if bytes.Compare(data, TestBlock) != 0 {
-                       t.Errorf("Block present, but got %+q, expected %+q", data, TestBlock)
+               if bytes.Compare(data[:n], TestBlock) != 0 {
+                       t.Errorf("Block present, but got %+q, expected %+q", data[:n], TestBlock)
                }
-               bufs.Put(data)
        }
 
-       data, err = v.Get(TestHash2)
+       n, err = v.Get(TestHash2, data)
        if err != nil {
                t.Error(err)
        } else {
-               if bytes.Compare(data, TestBlock2) != 0 {
-                       t.Errorf("Block present, but got %+q, expected %+q", data, TestBlock2)
+               if bytes.Compare(data[:n], TestBlock2) != 0 {
+                       t.Errorf("Block present, but got %+q, expected %+q", data[:n], TestBlock2)
                }
-               bufs.Put(data)
        }
 
-       data, err = v.Get(TestHash3)
+       n, err = v.Get(TestHash3, data)
        if err != nil {
                t.Error(err)
        } else {
-               if bytes.Compare(data, TestBlock3) != 0 {
-                       t.Errorf("Block present, but to %+q, expected %+q", data, TestBlock3)
+               if bytes.Compare(data[:n], TestBlock3) != 0 {
+                       t.Errorf("Block present, but to %+q, expected %+q", data[:n], TestBlock3)
                }
-               bufs.Put(data)
        }
 }
 
@@ -426,14 +422,12 @@ func testDeleteNewBlock(t TB, factory TestableVolumeFactory) {
        if err := v.Trash(TestHash); err != nil {
                t.Error(err)
        }
-       data, err := v.Get(TestHash)
+       data := make([]byte, BlockSize)
+       n, err := v.Get(TestHash, data)
        if err != nil {
                t.Error(err)
-       } else {
-               if bytes.Compare(data, TestBlock) != 0 {
-                       t.Errorf("Got data %+q, expected %+q", data, TestBlock)
-               }
-               bufs.Put(data)
+       } else if bytes.Compare(data[:n], TestBlock) != 0 {
+               t.Errorf("Got data %+q, expected %+q", data[:n], TestBlock)
        }
 }
 
@@ -455,9 +449,31 @@ func testDeleteOldBlock(t TB, factory TestableVolumeFactory) {
        if err := v.Trash(TestHash); err != nil {
                t.Error(err)
        }
-       if _, err := v.Get(TestHash); err == nil || !os.IsNotExist(err) {
+       data := make([]byte, BlockSize)
+       if _, err := v.Get(TestHash, data); err == nil || !os.IsNotExist(err) {
                t.Errorf("os.IsNotExist(%v) should have been true", err)
        }
+
+       _, err := v.Mtime(TestHash)
+       if err == nil || !os.IsNotExist(err) {
+               t.Fatalf("os.IsNotExist(%v) should have been true", err)
+       }
+
+       err = v.Compare(TestHash, TestBlock)
+       if err == nil || !os.IsNotExist(err) {
+               t.Fatalf("os.IsNotExist(%v) should have been true", err)
+       }
+
+       indexBuf := new(bytes.Buffer)
+       v.IndexTo("", indexBuf)
+       if strings.Contains(string(indexBuf.Bytes()), TestHash) {
+               t.Fatalf("Found trashed block in IndexTo")
+       }
+
+       err = v.Touch(TestHash)
+       if err == nil || !os.IsNotExist(err) {
+               t.Fatalf("os.IsNotExist(%v) should have been true", err)
+       }
 }
 
 // Calling Delete() for a block that does not exist should result in error.
@@ -514,9 +530,10 @@ func testUpdateReadOnly(t TB, factory TestableVolumeFactory) {
        }
 
        v.PutRaw(TestHash, TestBlock)
+       buf := make([]byte, BlockSize)
 
        // Get from read-only volume should succeed
-       _, err := v.Get(TestHash)
+       _, err := v.Get(TestHash, buf)
        if err != nil {
                t.Errorf("got err %v, expected nil", err)
        }
@@ -526,7 +543,7 @@ func testUpdateReadOnly(t TB, factory TestableVolumeFactory) {
        if err == nil {
                t.Errorf("Expected error when putting block in a read-only volume")
        }
-       _, err = v.Get(TestHash2)
+       _, err = v.Get(TestHash2, buf)
        if err == nil {
                t.Errorf("Expected error when getting block whose put in read-only volume failed")
        }
@@ -561,45 +578,45 @@ func testGetConcurrent(t TB, factory TestableVolumeFactory) {
        v.PutRaw(TestHash3, TestBlock3)
 
        sem := make(chan int)
-       go func(sem chan int) {
-               buf, err := v.Get(TestHash)
+       go func() {
+               buf := make([]byte, BlockSize)
+               n, err := v.Get(TestHash, buf)
                if err != nil {
                        t.Errorf("err1: %v", err)
                }
-               bufs.Put(buf)
-               if bytes.Compare(buf, TestBlock) != 0 {
-                       t.Errorf("buf should be %s, is %s", string(TestBlock), string(buf))
+               if bytes.Compare(buf[:n], TestBlock) != 0 {
+                       t.Errorf("buf should be %s, is %s", string(TestBlock), string(buf[:n]))
                }
                sem <- 1
-       }(sem)
+       }()
 
-       go func(sem chan int) {
-               buf, err := v.Get(TestHash2)
+       go func() {
+               buf := make([]byte, BlockSize)
+               n, err := v.Get(TestHash2, buf)
                if err != nil {
                        t.Errorf("err2: %v", err)
                }
-               bufs.Put(buf)
-               if bytes.Compare(buf, TestBlock2) != 0 {
-                       t.Errorf("buf should be %s, is %s", string(TestBlock2), string(buf))
+               if bytes.Compare(buf[:n], TestBlock2) != 0 {
+                       t.Errorf("buf should be %s, is %s", string(TestBlock2), string(buf[:n]))
                }
                sem <- 1
-       }(sem)
+       }()
 
-       go func(sem chan int) {
-               buf, err := v.Get(TestHash3)
+       go func() {
+               buf := make([]byte, BlockSize)
+               n, err := v.Get(TestHash3, buf)
                if err != nil {
                        t.Errorf("err3: %v", err)
                }
-               bufs.Put(buf)
-               if bytes.Compare(buf, TestBlock3) != 0 {
-                       t.Errorf("buf should be %s, is %s", string(TestBlock3), string(buf))
+               if bytes.Compare(buf[:n], TestBlock3) != 0 {
+                       t.Errorf("buf should be %s, is %s", string(TestBlock3), string(buf[:n]))
                }
                sem <- 1
-       }(sem)
+       }()
 
        // Wait for all goroutines to finish
-       for done := 0; done < 3; {
-               done += <-sem
+       for done := 0; done < 3; done++ {
+               <-sem
        }
 }
 
@@ -639,36 +656,34 @@ func testPutConcurrent(t TB, factory TestableVolumeFactory) {
        }(sem)
 
        // Wait for all goroutines to finish
-       for done := 0; done < 3; {
-               done += <-sem
+       for done := 0; done < 3; done++ {
+               <-sem
        }
 
        // Double check that we actually wrote the blocks we expected to write.
-       buf, err := v.Get(TestHash)
+       buf := make([]byte, BlockSize)
+       n, err := v.Get(TestHash, buf)
        if err != nil {
                t.Errorf("Get #1: %v", err)
        }
-       bufs.Put(buf)
-       if bytes.Compare(buf, TestBlock) != 0 {
-               t.Errorf("Get #1: expected %s, got %s", string(TestBlock), string(buf))
+       if bytes.Compare(buf[:n], TestBlock) != 0 {
+               t.Errorf("Get #1: expected %s, got %s", string(TestBlock), string(buf[:n]))
        }
 
-       buf, err = v.Get(TestHash2)
+       n, err = v.Get(TestHash2, buf)
        if err != nil {
                t.Errorf("Get #2: %v", err)
        }
-       bufs.Put(buf)
-       if bytes.Compare(buf, TestBlock2) != 0 {
-               t.Errorf("Get #2: expected %s, got %s", string(TestBlock2), string(buf))
+       if bytes.Compare(buf[:n], TestBlock2) != 0 {
+               t.Errorf("Get #2: expected %s, got %s", string(TestBlock2), string(buf[:n]))
        }
 
-       buf, err = v.Get(TestHash3)
+       n, err = v.Get(TestHash3, buf)
        if err != nil {
                t.Errorf("Get #3: %v", err)
        }
-       bufs.Put(buf)
-       if bytes.Compare(buf, TestBlock3) != 0 {
-               t.Errorf("Get #3: expected %s, got %s", string(TestBlock3), string(buf))
+       if bytes.Compare(buf[:n], TestBlock3) != 0 {
+               t.Errorf("Get #3: expected %s, got %s", string(TestBlock3), string(buf[:n]))
        }
 }
 
@@ -689,14 +704,13 @@ func testPutFullBlock(t TB, factory TestableVolumeFactory) {
        if err != nil {
                t.Fatal(err)
        }
-       rdata, err := v.Get(hash)
+       buf := make([]byte, BlockSize)
+       n, err := v.Get(hash, buf)
        if err != nil {
                t.Error(err)
-       } else {
-               defer bufs.Put(rdata)
        }
-       if bytes.Compare(rdata, wdata) != 0 {
-               t.Error("rdata != wdata")
+       if bytes.Compare(buf[:n], wdata) != 0 {
+               t.Errorf("buf %+q != wdata %+q", buf[:n], wdata)
        }
 }
 
@@ -717,27 +731,27 @@ func testTrashUntrash(t TB, factory TestableVolumeFactory) {
        v.PutRaw(TestHash, TestBlock)
        v.TouchWithDate(TestHash, time.Now().Add(-2*blobSignatureTTL))
 
-       buf, err := v.Get(TestHash)
+       buf := make([]byte, BlockSize)
+       n, err := v.Get(TestHash, buf)
        if err != nil {
                t.Fatal(err)
        }
-       if bytes.Compare(buf, TestBlock) != 0 {
-               t.Errorf("Got data %+q, expected %+q", buf, TestBlock)
+       if bytes.Compare(buf[:n], TestBlock) != 0 {
+               t.Errorf("Got data %+q, expected %+q", buf[:n], TestBlock)
        }
-       bufs.Put(buf)
 
        // Trash
        err = v.Trash(TestHash)
        if v.Writable() == false {
                if err != MethodDisabledError {
-                       t.Error(err)
+                       t.Fatal(err)
                }
        } else if err != nil {
                if err != ErrNotImplemented {
-                       t.Error(err)
+                       t.Fatal(err)
                }
        } else {
-               _, err = v.Get(TestHash)
+               _, err = v.Get(TestHash, buf)
                if err == nil || !os.IsNotExist(err) {
                        t.Errorf("os.IsNotExist(%v) should have been true", err)
                }
@@ -750,14 +764,13 @@ func testTrashUntrash(t TB, factory TestableVolumeFactory) {
        }
 
        // Get the block - after trash and untrash sequence
-       buf, err = v.Get(TestHash)
+       n, err = v.Get(TestHash, buf)
        if err != nil {
                t.Fatal(err)
        }
-       if bytes.Compare(buf, TestBlock) != 0 {
-               t.Errorf("Got data %+q, expected %+q", buf, TestBlock)
+       if bytes.Compare(buf[:n], TestBlock) != 0 {
+               t.Errorf("Got data %+q, expected %+q", buf[:n], TestBlock)
        }
-       bufs.Put(buf)
 }
 
 func testTrashEmptyTrashUntrash(t TB, factory TestableVolumeFactory) {
@@ -768,14 +781,31 @@ func testTrashEmptyTrashUntrash(t TB, factory TestableVolumeFactory) {
        }(trashLifetime)
 
        checkGet := func() error {
-               buf, err := v.Get(TestHash)
+               buf := make([]byte, BlockSize)
+               n, err := v.Get(TestHash, buf)
                if err != nil {
                        return err
                }
-               if bytes.Compare(buf, TestBlock) != 0 {
-                       t.Fatalf("Got data %+q, expected %+q", buf, TestBlock)
+               if bytes.Compare(buf[:n], TestBlock) != 0 {
+                       t.Fatalf("Got data %+q, expected %+q", buf[:n], TestBlock)
+               }
+
+               _, err = v.Mtime(TestHash)
+               if err != nil {
+                       return err
                }
-               bufs.Put(buf)
+
+               err = v.Compare(TestHash, TestBlock)
+               if err != nil {
+                       return err
+               }
+
+               indexBuf := new(bytes.Buffer)
+               v.IndexTo("", indexBuf)
+               if !strings.Contains(string(indexBuf.Bytes()), TestHash) {
+                       return os.ErrNotExist
+               }
+
                return nil
        }
 
@@ -791,6 +821,7 @@ func testTrashEmptyTrashUntrash(t TB, factory TestableVolumeFactory) {
                t.Fatal(err)
        }
 
+       // Trash the block
        err = v.Trash(TestHash)
        if err == MethodDisabledError || err == ErrNotImplemented {
                // Skip the trash tests for read-only volumes, and
@@ -803,6 +834,11 @@ func testTrashEmptyTrashUntrash(t TB, factory TestableVolumeFactory) {
                t.Fatalf("os.IsNotExist(%v) should have been true", err)
        }
 
+       err = v.Touch(TestHash)
+       if err == nil || !os.IsNotExist(err) {
+               t.Fatalf("os.IsNotExist(%v) should have been true", err)
+       }
+
        v.EmptyTrash()
 
        // Even after emptying the trash, we can untrash our block
@@ -811,11 +847,20 @@ func testTrashEmptyTrashUntrash(t TB, factory TestableVolumeFactory) {
        if err != nil {
                t.Fatal(err)
        }
+
        err = checkGet()
        if err != nil {
                t.Fatal(err)
        }
 
+       err = v.Touch(TestHash)
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       // Because we Touch'ed, need to backdate again for next set of tests
+       v.TouchWithDate(TestHash, time.Now().Add(-2*blobSignatureTTL))
+
        // Untrash should fail if the only block in the trash has
        // already been untrashed.
        err = v.Untrash(TestHash)
@@ -856,11 +901,14 @@ func testTrashEmptyTrashUntrash(t TB, factory TestableVolumeFactory) {
 
        // Trash it again, and this time call EmptyTrash so it really
        // goes away.
+       // (In Azure volumes, un/trash changes Mtime, so first backdate again)
+       v.TouchWithDate(TestHash, time.Now().Add(-2*blobSignatureTTL))
        err = v.Trash(TestHash)
        err = checkGet()
        if err == nil || !os.IsNotExist(err) {
-               t.Errorf("os.IsNotExist(%v) should have been true", err)
+               t.Fatalf("os.IsNotExist(%v) should have been true", err)
        }
+       // EmptyTrash
        v.EmptyTrash()
 
        // Untrash won't find it
index e8a5a338f51cb25d47f419d02f6ca76a211d535c..5671b8d4a9fd7405f8ca7fd35a18fdf10289a059 100644 (file)
@@ -113,17 +113,16 @@ func (v *MockVolume) Compare(loc string, buf []byte) error {
        }
 }
 
-func (v *MockVolume) Get(loc string) ([]byte, error) {
+func (v *MockVolume) Get(loc string, buf []byte) (int, error) {
        v.gotCall("Get")
        <-v.Gate
        if v.Bad {
-               return nil, errors.New("Bad volume")
+               return 0, errors.New("Bad volume")
        } else if block, ok := v.Store[loc]; ok {
-               buf := bufs.Get(len(block))
-               copy(buf, block)
-               return buf, nil
+               copy(buf[:len(block)], block)
+               return len(block), nil
        }
-       return nil, os.ErrNotExist
+       return 0, os.ErrNotExist
 }
 
 func (v *MockVolume) Put(loc string, block []byte) error {
index 996068cf3d2438f71364b0b2c9ddafcdbd712c54..7aff85e59a4357acb1e27ce5386756feb96fa0e1 100644 (file)
@@ -181,26 +181,24 @@ func (v *UnixVolume) stat(path string) (os.FileInfo, error) {
        return stat, err
 }
 
-// Get retrieves a block identified by the locator string "loc", and
-// returns its contents as a byte slice.
-//
-// Get returns a nil buffer IFF it returns a non-nil error.
-func (v *UnixVolume) Get(loc string) ([]byte, error) {
+// Get retrieves a block, copies it to the given slice, and returns
+// the number of bytes copied.
+func (v *UnixVolume) Get(loc string, buf []byte) (int, error) {
        path := v.blockPath(loc)
        stat, err := v.stat(path)
        if err != nil {
-               return nil, v.translateError(err)
+               return 0, v.translateError(err)
+       }
+       if stat.Size() > int64(len(buf)) {
+               return 0, TooLongError
        }
-       buf := bufs.Get(int(stat.Size()))
+       var read int
+       size := int(stat.Size())
        err = v.getFunc(path, func(rdr io.Reader) error {
-               _, err = io.ReadFull(rdr, buf)
+               read, err = io.ReadFull(rdr, buf[:size])
                return err
        })
-       if err != nil {
-               bufs.Put(buf)
-               return nil, err
-       }
-       return buf, nil
+       return read, err
 }
 
 // Compare returns nil if Get(loc) would return the same content as
@@ -540,7 +538,7 @@ func (v *UnixVolume) translateError(err error) error {
        }
 }
 
-var trashLocRegexp = regexp.MustCompile(`/([0-9a-f]{32})\.trash\.(\d+)$`)
+var unixTrashLocRegexp = regexp.MustCompile(`/([0-9a-f]{32})\.trash\.(\d+)$`)
 
 // EmptyTrash walks hierarchy looking for {hash}.trash.*
 // and deletes those with deadline < now.
@@ -556,7 +554,7 @@ func (v *UnixVolume) EmptyTrash() {
                if info.Mode().IsDir() {
                        return nil
                }
-               matches := trashLocRegexp.FindStringSubmatch(path)
+               matches := unixTrashLocRegexp.FindStringSubmatch(path)
                if len(matches) != 3 {
                        return nil
                }
index 0775e89ed275d14f7e2be510084a52e39af84472..c95538bc4da380f7af5561984d7a069324cea970 100644 (file)
@@ -106,12 +106,13 @@ func TestGetNotFound(t *testing.T) {
        defer v.Teardown()
        v.Put(TestHash, TestBlock)
 
-       buf, err := v.Get(TestHash2)
+       buf := make([]byte, BlockSize)
+       n, err := v.Get(TestHash2, buf)
        switch {
        case os.IsNotExist(err):
                break
        case err == nil:
-               t.Errorf("Read should have failed, returned %s", string(buf))
+               t.Errorf("Read should have failed, returned %+q", buf[:n])
        default:
                t.Errorf("Read expected ErrNotExist, got: %s", err)
        }
@@ -151,7 +152,8 @@ func TestUnixVolumeReadonly(t *testing.T) {
 
        v.PutRaw(TestHash, TestBlock)
 
-       _, err := v.Get(TestHash)
+       buf := make([]byte, BlockSize)
+       _, err := v.Get(TestHash, buf)
        if err != nil {
                t.Errorf("got err %v, expected nil", err)
        }
index 9591b42ca3e3a0f38060222124fabccecc7fe4a9..68ea97ab75aac1b71aca36427165ed54cef97cc2 100644 (file)
@@ -3,6 +3,8 @@ from __future__ import absolute_import, print_function
 import errno
 import logging
 import os
+import signal
+import time
 import threading
 import traceback
 
@@ -82,4 +84,35 @@ class BaseNodeManagerActor(pykka.ThreadingActor):
         if (exception_type in (threading.ThreadError, MemoryError) or
             exception_type is OSError and exception_value.errno == errno.ENOMEM):
             lg.critical("Unhandled exception is a fatal error, killing Node Manager")
-            os.killpg(os.getpgid(0), 9)
+            os.kill(os.getpid(), signal.SIGKILL)
+
+    def ping(self):
+        return True
+
+
+class WatchdogActor(pykka.ThreadingActor):
+    def __init__(self, timeout, *args, **kwargs):
+         super(pykka.ThreadingActor, self).__init__(*args, **kwargs)
+         self.timeout = timeout
+         self.actors = [a.proxy() for a in args]
+         self.actor_ref = TellableActorRef(self)
+         self._later = self.actor_ref.tell_proxy()
+
+    def kill_self(self, e, act):
+        lg = getattr(self, "_logger", logging)
+        lg.critical("Watchdog exception", exc_info=e)
+        lg.critical("Actor %s watchdog ping timed out, killing Node Manager", act)
+        os.kill(os.getpid(), signal.SIGKILL)
+
+    def on_start(self):
+        self._later.run()
+
+    def run(self):
+        a = None
+        try:
+            for a in self.actors:
+                a.ping().get(self.timeout)
+            time.sleep(20)
+            self._later.run()
+        except Exception as e:
+            self.kill_self(e, a)
index 4848289e8bfed1fbf253f7c8589a29e5c548051b..a950210aa89c0bbf18e9df64815393bfae71c65a 100644 (file)
@@ -14,6 +14,7 @@ from .. import \
     arvados_node_missing, RetryMixin
 from ...clientactor import _notify_subscribers
 from ... import config
+from .transitions import transitions
 
 class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
     """Base class for actors that change a compute node's state.
@@ -208,22 +209,21 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         self._logger.info("Shutdown cancelled: %s.", reason)
         self._finished(success_flag=False)
 
-    def _stop_if_window_closed(orig_func):
+    def _cancel_on_exception(orig_func):
         @functools.wraps(orig_func)
-        def stop_wrapper(self, *args, **kwargs):
-            if (self.cancellable and
-                  (self._monitor.shutdown_eligible().get() is not True)):
-                self._later.cancel_shutdown(self.WINDOW_CLOSED)
-                return None
-            else:
+        def finish_wrapper(self, *args, **kwargs):
+            try:
                 return orig_func(self, *args, **kwargs)
-        return stop_wrapper
+            except Exception as error:
+                self._logger.error("Actor error %s", error)
+                self._later.cancel_shutdown("Unhandled exception %s" % error)
+        return finish_wrapper
 
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @_stop_if_window_closed
+    @_cancel_on_exception
     @RetryMixin._retry()
     def shutdown_node(self):
         self._logger.info("Starting shutdown")
+        arv_node = self._arvados_node()
         if not self._cloud.destroy_node(self.cloud_node):
             if self._cloud.broken(self.cloud_node):
                 self._later.cancel_shutdown(self.NODE_BROKEN)
@@ -232,7 +232,6 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
                 # Force a retry.
                 raise cloud_types.LibcloudError("destroy_node failed")
         self._logger.info("Shutdown success")
-        arv_node = self._arvados_node()
         if arv_node is None:
             self._finished(success_flag=True)
         else:
@@ -244,9 +243,6 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
         self._finished(success_flag=True)
 
-    # Make the decorator available to subclasses.
-    _stop_if_window_closed = staticmethod(_stop_if_window_closed)
-
 
 class ComputeNodeUpdateActor(config.actor_class):
     """Actor to dispatch one-off cloud management requests.
@@ -256,12 +252,6 @@ class ComputeNodeUpdateActor(config.actor_class):
     this to perform maintenance tasks on themselves.  Having a
     dedicated actor for this gives us the opportunity to control the
     flow of requests; e.g., by backing off when errors occur.
-
-    This actor is most like a "traditional" Pykka actor: there's no
-    subscribing, but instead methods return real driver results.  If
-    you're interested in those results, you should get them from the
-    Future that the proxy method returns.  Be prepared to handle exceptions
-    from the cloud driver when you do.
     """
     def __init__(self, cloud_factory, max_retry_wait=180):
         super(ComputeNodeUpdateActor, self).__init__()
@@ -270,6 +260,12 @@ class ComputeNodeUpdateActor(config.actor_class):
         self.error_streak = 0
         self.next_request_time = time.time()
 
+    def _set_logger(self):
+        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
+
+    def on_start(self):
+        self._set_logger()
+
     def _throttle_errors(orig_func):
         @functools.wraps(orig_func)
         def throttle_wrapper(self, *args, **kwargs):
@@ -280,10 +276,12 @@ class ComputeNodeUpdateActor(config.actor_class):
             try:
                 result = orig_func(self, *args, **kwargs)
             except Exception as error:
-                self.error_streak += 1
-                self.next_request_time += min(2 ** self.error_streak,
-                                              self.max_retry_wait)
-                raise
+                if self._cloud.is_cloud_exception(error):
+                    self.error_streak += 1
+                    self.next_request_time += min(2 ** self.error_streak,
+                                                  self.max_retry_wait)
+                self._logger.warn(
+                    "Unhandled exception: %s", error, exc_info=error)
             else:
                 self.error_streak = 0
                 return result
@@ -341,64 +339,105 @@ class ComputeNodeMonitorActor(config.actor_class):
         self._last_log = msg
         self._logger.debug(msg, *args)
 
-    def in_state(self, *states):
-        # Return a boolean to say whether or not our Arvados node record is in
-        # one of the given states.  If state information is not
-        # available--because this node has no Arvados record, the record is
-        # stale, or the record has no state information--return None.
-        if (self.arvados_node is None) or not timestamp_fresh(
-              arvados_node_mtime(self.arvados_node), self.node_stale_after):
-            return None
+    def get_state(self):
+        """Get node state, one of ['unpaired', 'busy', 'idle', 'down']."""
+
+        # If this node is not associated with an Arvados node, return 'unpaired'.
+        if self.arvados_node is None:
+            return 'unpaired'
+
         state = self.arvados_node['crunch_worker_state']
-        if not state:
-            return None
-        result = state in states
-        if state == 'idle':
-            result = result and not self.arvados_node['job_uuid']
-        return result
+
+        # If state information is not available because it is missing or the
+        # record is stale, return 'down'.
+        if not state or not timestamp_fresh(arvados_node_mtime(self.arvados_node),
+                                            self.node_stale_after):
+            state = 'down'
+
+        # There's a window between when a node first pings Arvados and when
+        # crunch-dispatch synchronizes its 'slurm_state'; during that window
+        # the node still reports as 'down'.  If the node has pinged
+        # (first_ping_at is truthy), is still within the boot grace period,
+        # and the cloud does not report it broken, consider it 'idle'.
+        if (state == 'down' and
+            self.arvados_node['first_ping_at'] and
+            timestamp_fresh(self.cloud_node_start_time,
+                            self.boot_fail_after) and
+            not self._cloud.broken(self.cloud_node)):
+            state = 'idle'
+
+        # "missing" means last_ping_at is stale, this should be
+        # considered "down"
+        if arvados_node_missing(self.arvados_node, self.node_stale_after):
+            state = 'down'
+
+        # Turns out using 'job_uuid' this way is a bad idea.  The node record
+        # is assigned the job_uuid before the job is locked (which removes it
+        # from the queue), which means the job is double-counted: it is both in
+        # the wishlist and keeping a node busy.  The end result is excess nodes
+        # being booted.
+        #if state == 'idle' and self.arvados_node['job_uuid']:
+        #    state = 'busy'
+
+        return state
+
+    def in_state(self, *states):
+        return self.get_state() in states
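
The precedence of the checks above (unpaired first, then stale or missing records, then the boot grace period) can be summarized as a small pure function. This is only an illustrative sketch with hypothetical boolean parameters, not a replacement for get_state():

    def sketch_get_state(paired, worker_state, record_fresh, has_first_ping,
                         in_boot_grace, cloud_broken, last_ping_fresh):
        # Hypothetical booleans standing in for the checks made above.
        if not paired:
            return 'unpaired'
        state = worker_state if (worker_state and record_fresh) else 'down'
        if state == 'down' and has_first_ping and in_boot_grace and not cloud_broken:
            state = 'idle'      # within the boot grace period
        if not last_ping_fresh:
            state = 'down'      # a "missing" node always counts as down
        return state
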
 
     def shutdown_eligible(self):
-        """Return True if eligible for shutdown, or a string explaining why the node
-        is not eligible for shutdown."""
+        """Determine if node is candidate for shut down.
 
-        if not self._shutdowns.window_open():
-            return "shutdown window is not open."
-        if self.arvados_node is None:
-            # Node is unpaired.
-            # If it hasn't pinged Arvados after boot_fail seconds, shut it down
-            if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
-                return "node is still booting, will be considered a failed boot at %s" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.cloud_node_start_time + self.boot_fail_after))
-            else:
-                return True
-        missing = arvados_node_missing(self.arvados_node, self.node_stale_after)
-        if missing and self._cloud.broken(self.cloud_node):
-            # Node is paired, but Arvados says it is missing and the cloud says the node
-            # is in an error state, so shut it down.
-            return True
-        if missing is None and self._cloud.broken(self.cloud_node):
-            self._logger.info(
-                "Cloud node considered 'broken' but paired node %s last_ping_at is None, " +
-                "cannot check node_stale_after (node may be shut down and we just haven't gotten the message yet).",
-                self.arvados_node['uuid'])
-        if self.in_state('idle'):
-            return True
+        Returns a tuple of (boolean, string), where the first value is whether
+        the node is a candidate for shutdown, and the second value is the
+        reason for the decision.
+        """
+
+        # Collect the observed states, then consult the state transition table
+        # to decide whether we should shut down.  Possible states are:
+        # crunch_worker_state = ['unpaired', 'busy', 'idle', 'down']
+        # window = ["open", "closed"]
+        # boot_grace = ["boot wait", "boot exceeded"]
+        # idle_grace = ["not idle", "idle wait", "idle exceeded"]
+
+        if self.arvados_node and not timestamp_fresh(arvados_node_mtime(self.arvados_node), self.node_stale_after):
+            return (False, "node state is stale")
+
+        crunch_worker_state = self.get_state()
+
+        window = "open" if self._shutdowns.window_open() else "closed"
+
+        if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
+            boot_grace = "boot wait"
         else:
-            return "node is not idle."
+            boot_grace = "boot exceeded"
+
+        # Idle-time tracking is not implemented on the API server side yet,
+        # so always treat the idle grace period as exceeded.
+        idle_grace = 'idle exceeded'
+
+        node_state = (crunch_worker_state, window, boot_grace, idle_grace)
+        t = transitions[node_state]
+        if t is not None:
+            # yes, shutdown eligible
+            return (True, "node state is %s" % (node_state,))
+        else:
+            # no, return a reason
+            return (False, "node state is %s" % (node_state,))
 
     def consider_shutdown(self):
         try:
+            eligible, reason = self.shutdown_eligible()
             next_opening = self._shutdowns.next_opening()
-            eligible = self.shutdown_eligible()
-            if eligible is True:
-                self._debug("Suggesting shutdown.")
+            if eligible:
+                self._debug("Suggesting shutdown because %s", reason)
                 _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
-            elif self._shutdowns.window_open():
-                self._debug("Cannot shut down because %s", eligible)
-            elif self.last_shutdown_opening != next_opening:
-                self._debug("Shutdown window closed.  Next at %s.",
-                            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
-                self._timer.schedule(next_opening, self._later.consider_shutdown)
-                self.last_shutdown_opening = next_opening
+            else:
+                self._debug("Not eligible for shut down because %s", reason)
+
+                if self.last_shutdown_opening != next_opening:
+                    self._debug("Shutdown window closed.  Next at %s.",
+                                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
+                    self._timer.schedule(next_opening, self._later.consider_shutdown)
+                    self.last_shutdown_opening = next_opening
         except Exception:
             self._logger.exception("Unexpected exception")
 
index 4d70436801564e9a35675e95c18f33fddc125806..cae87197f72f0241ef8763c637e6eecb885acc94 100644 (file)
@@ -10,12 +10,23 @@ from . import \
 from . import ComputeNodeShutdownActor as ShutdownActorBase
 from .. import RetryMixin
 
-class ComputeNodeShutdownActor(ShutdownActorBase):
+class SlurmMixin(object):
     SLURM_END_STATES = frozenset(['down\n', 'down*\n',
                                   'drain\n', 'drain*\n',
                                   'fail\n', 'fail*\n'])
     SLURM_DRAIN_STATES = frozenset(['drain\n', 'drng\n'])
 
+    def _set_node_state(self, nodename, state, *args):
+        cmd = ['scontrol', 'update', 'NodeName=' + nodename,
+               'State=' + state]
+        cmd.extend(args)
+        subprocess.check_output(cmd)
+
+    def _get_slurm_state(self, nodename):
+        return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', nodename])
+
+
+class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
     def on_start(self):
         arv_node = self._arvados_node()
         if arv_node is None:
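
For reference, a rough sketch of the shell commands the SlurmMixin above ends up running (the node name is hypothetical):

    import subprocess

    nodename = 'compute99'   # hypothetical SLURM node name
    # Drain the node: scontrol update NodeName=compute99 State=DRAIN Reason=...
    subprocess.check_output(['scontrol', 'update', 'NodeName=' + nodename,
                             'State=DRAIN', 'Reason=Node Manager shutdown'])
    # Query its current state: sinfo --noheader -o %t -n compute99
    state = subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', nodename])
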
@@ -27,48 +38,40 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
             self._logger.info("Draining SLURM node %s", self._nodename)
             self._later.issue_slurm_drain()
 
-    def _set_node_state(self, state, *args):
-        cmd = ['scontrol', 'update', 'NodeName=' + self._nodename,
-               'State=' + state]
-        cmd.extend(args)
-        subprocess.check_output(cmd)
-
-    def _get_slurm_state(self):
-        return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
-
-    # The following methods retry on OSError.  This is intended to mitigate bug
-    # #6321 where fork() of node manager raises "OSError: [Errno 12] Cannot
-    # allocate memory" resulting in the untimely death of the shutdown actor
-    # and tends to result in node manager getting into a wedged state where it
-    # won't allocate new nodes or shut down gracefully.  The underlying causes
-    # of the excessive memory usage that result in the "Cannot allocate memory"
-    # error are still being investigated.
-
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+    @RetryMixin._retry((subprocess.CalledProcessError,))
     def cancel_shutdown(self, reason):
         if self._nodename:
-            if self._get_slurm_state() in self.SLURM_DRAIN_STATES:
+            if self._get_slurm_state(self._nodename) in self.SLURM_DRAIN_STATES:
                 # Resume from "drng" or "drain"
-                self._set_node_state('RESUME')
+                self._set_node_state(self._nodename, 'RESUME')
             else:
                 # Node is in a state such as 'idle' or 'alloc' so don't
                 # try to resume it because that will just raise an error.
                 pass
         return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
 
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
-    @ShutdownActorBase._stop_if_window_closed
+    @RetryMixin._retry((subprocess.CalledProcessError,))
     def issue_slurm_drain(self):
-        self._set_node_state('DRAIN', 'Reason=Node Manager shutdown')
-        self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
-        self._later.await_slurm_drain()
+        if self.cancel_reason is not None:
+            return
+        if self._nodename:
+            self._set_node_state(self._nodename, 'DRAIN', 'Reason=Node Manager shutdown')
+            self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
+            self._later.await_slurm_drain()
+        else:
+            self._later.shutdown_node()
 
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
-    @ShutdownActorBase._stop_if_window_closed
+    @RetryMixin._retry((subprocess.CalledProcessError,))
     def await_slurm_drain(self):
-        output = self._get_slurm_state()
-        if output in self.SLURM_END_STATES:
-            self._later.shutdown_node()
-        else:
+        if self.cancel_reason is not None:
+            return
+        output = self._get_slurm_state(self._nodename)
+        if output in ("drng\n", "alloc\n", "drng*\n", "alloc*\n"):
             self._timer.schedule(time.time() + 10,
                                  self._later.await_slurm_drain)
+        elif output in ("idle\n"):
+            # Not in "drng" so cancel self.
+            self.cancel_shutdown("slurm state is %s" % output.strip())
+        else:
+            # Any other end state (down, drain, fail, ...): proceed with shutdown.
+            self._later.shutdown_node()
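
The polling loop above reduces to three outcomes; a sketch using the raw sinfo output strings seen in the code:

    def next_drain_action(slurm_state):
        # slurm_state is the raw sinfo output, trailing newline included.
        if slurm_state in ('drng\n', 'alloc\n', 'drng*\n', 'alloc*\n'):
            return 'wait'       # still draining or allocated: re-check in 10 seconds
        elif slurm_state == 'idle\n':
            return 'cancel'     # drain was undone externally: cancel the shutdown
        else:
            return 'shutdown'   # down/drain/fail etc.: go ahead and destroy the node
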
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py b/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py
new file mode 100644 (file)
index 0000000..2ff3c94
--- /dev/null
@@ -0,0 +1,52 @@
+transitions = {
+ ('busy', 'closed', 'boot exceeded', 'idle exceeded'): None,
+ ('busy', 'closed', 'boot exceeded', 'idle wait'): None,
+ ('busy', 'closed', 'boot exceeded', 'not idle'): None,
+ ('busy', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('busy', 'closed', 'boot wait', 'idle wait'): None,
+ ('busy', 'closed', 'boot wait', 'not idle'): None,
+ ('busy', 'open', 'boot exceeded', 'idle exceeded'): None,
+ ('busy', 'open', 'boot exceeded', 'idle wait'): None,
+ ('busy', 'open', 'boot exceeded', 'not idle'): None,
+ ('busy', 'open', 'boot wait', 'idle exceeded'): None,
+ ('busy', 'open', 'boot wait', 'idle wait'): None,
+ ('busy', 'open', 'boot wait', 'not idle'): None,
+
+ ('down', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('down', 'closed', 'boot wait', 'idle wait'): None,
+ ('down', 'closed', 'boot wait', 'not idle'): None,
+ ('down', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'not idle'): "START_SHUTDOWN",
+
+ ('idle', 'closed', 'boot exceeded', 'idle exceeded'): None,
+ ('idle', 'closed', 'boot exceeded', 'idle wait'): None,
+ ('idle', 'closed', 'boot exceeded', 'not idle'): None,
+ ('idle', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('idle', 'closed', 'boot wait', 'idle wait'): None,
+ ('idle', 'closed', 'boot wait', 'not idle'): None,
+ ('idle', 'open', 'boot exceeded', 'idle exceeded'): "START_DRAIN",
+ ('idle', 'open', 'boot exceeded', 'idle wait'): None,
+ ('idle', 'open', 'boot exceeded', 'not idle'): None,
+ ('idle', 'open', 'boot wait', 'idle exceeded'): "START_DRAIN",
+ ('idle', 'open', 'boot wait', 'idle wait'): None,
+ ('idle', 'open', 'boot wait', 'not idle'): None,
+
+ ('unpaired', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('unpaired', 'closed', 'boot wait', 'idle wait'): None,
+ ('unpaired', 'closed', 'boot wait', 'not idle'): None,
+ ('unpaired', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot wait', 'idle exceeded'): None,
+ ('unpaired', 'open', 'boot wait', 'idle wait'): None,
+ ('unpaired', 'open', 'boot wait', 'not idle'): None}
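
shutdown_eligible() above keys into this table with a 4-tuple of observed states; a value of None means "do not shut down". For example:

    from arvnodeman.computenode.dispatch.transitions import transitions

    node_state = ('unpaired', 'open', 'boot exceeded', 'idle exceeded')
    action = transitions[node_state]     # "START_SHUTDOWN"
    eligible = action is not None        # True: shutdown_eligible() returns (True, reason)
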
index d72c86dc0afacc05c271c4beaa6b32a46c000c70..fa9cfff670c996b87b5d5044c7b48d2ec9d436eb 100644 (file)
@@ -79,33 +79,60 @@ class BaseComputeNodeDriver(RetryMixin):
             key = NodeAuthSSHKey(ssh_file.read())
         return 'auth', key
 
-    def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
+    def search_for_now(self, term, list_method, key=attrgetter('id'), **kwargs):
         """Return one matching item from a list of cloud objects.
 
         Raises ValueError if the number of matching objects is not exactly 1.
 
         Arguments:
         * term: The value that identifies a matching item.
-        * list_method: A string that names the method to call on this
-          instance's libcloud driver for a list of objects.
+        * list_method: A string that names the method to call for a
+          list of objects.
         * key: A function that accepts a cloud object and returns a
           value to search for a `term` match on each item.  Returns the
           object's 'id' attribute by default.
         """
+        try:
+            list_func = getattr(self, list_method)
+        except AttributeError:
+            list_func = getattr(self.real, list_method)
+        items = list_func(**kwargs)
+        results = [item for item in items if key(item) == term]
+        count = len(results)
+        if count != 1:
+            raise ValueError("{} returned {} results for {!r}".format(
+                    list_method, count, term))
+        return results[0]
+
+    def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
+        """Return one cached matching item from a list of cloud objects.
+
+        See search_for_now() for details of arguments and exceptions.
+        This method caches results, so it is suited to looking up static cloud
+        objects like node sizes, regions, etc.
+        """
         cache_key = (list_method, term)
         if cache_key not in self.SEARCH_CACHE:
-            items = getattr(self.real, list_method)(**kwargs)
-            results = [item for item in items
-                       if key(item) == term]
-            count = len(results)
-            if count != 1:
-                raise ValueError("{} returned {} results for '{}'".format(
-                        list_method, count, term))
-            self.SEARCH_CACHE[cache_key] = results[0]
+            self.SEARCH_CACHE[cache_key] = self.search_for_now(
+                term, list_method, key, **kwargs)
         return self.SEARCH_CACHE[cache_key]
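
A usage sketch of the split above (the driver instance, size name, and node name are hypothetical): static objects such as sizes go through the cached search_for(), while anything that can change, like the node list, should use search_for_now():

    size = driver.search_for('Standard_D3', 'list_sizes')             # cached after the first call
    node = driver.search_for_now('compute-000000000000001-zzzzz',
                                 'list_nodes', key=driver._name_key)  # always fetched fresh
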
 
-    def list_nodes(self):
-        return self.real.list_nodes(**self.list_kwargs)
+    def list_nodes(self, **kwargs):
+        l = self.list_kwargs.copy()
+        l.update(kwargs)
+        return self.real.list_nodes(**l)
+
+    def create_cloud_name(self, arvados_node):
+        """Return a cloud node name for the given Arvados node record.
+
+        Subclasses must override this method.  It should return a string
+        that can be used as the name for a newly-created cloud node,
+        based on identifying information in the Arvados node record.
+
+        Arguments:
+        * arvados_node: The Arvados node record that seeds the new cloud node.
+        """
+        raise NotImplementedError("BaseComputeNodeDriver.create_cloud_name")
 
     def arvados_create_kwargs(self, size, arvados_node):
         """Return dynamic keyword arguments for create_node.
@@ -141,19 +168,17 @@ class BaseComputeNodeDriver(RetryMixin):
             kwargs.update(self.arvados_create_kwargs(size, arvados_node))
             kwargs['size'] = size
             return self.real.create_node(**kwargs)
-        except self.CLOUD_ERRORS:
+        except self.CLOUD_ERRORS as create_error:
             # Workaround for bug #6702: sometimes the create node request
             # succeeds but times out and raises an exception instead of
             # returning a result.  If this happens, we get stuck in a retry
             # loop forever because subsequent create_node attempts will fail
             # due to node name collision.  So check if the node we intended to
             # create shows up in the cloud node list and return it if found.
-            node = self.search_for(kwargs['name'], 'list_nodes', self._name_key)
-            if node:
-                return node
-            else:
-                # something else went wrong, re-raise the exception
-                raise
+            try:
+                return self.search_for_now(kwargs['name'], 'list_nodes', self._name_key)
+            except ValueError:
+                raise create_error
 
     def post_create_node(self, cloud_node):
         # ComputeNodeSetupActor calls this method after the cloud node is
index 167d8b3210937acc226eaa1b5d41e333225b4176..e293d1bebeb5a479b69ff3e22784b9a467b17dd2 100644 (file)
@@ -38,15 +38,18 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             auth_kwargs, list_kwargs, create_kwargs,
             driver_class)
 
+    def create_cloud_name(self, arvados_node):
+        uuid_parts = arvados_node['uuid'].split('-', 2)
+        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
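
Worked example of the name derivation above, using a hypothetical Arvados node UUID:

    uuid_parts = 'zzzzz-yyyyy-000000000000001'.split('-', 2)
    #  -> ['zzzzz', 'yyyyy', '000000000000001']
    name = 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
    #  -> 'compute-000000000000001-zzzzz'
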
+
     def arvados_create_kwargs(self, size, arvados_node):
-        cluster_id, _, node_id = arvados_node['uuid'].split('-')
-        name = 'compute-{}-{}'.format(node_id, cluster_id)
         tags = {
             'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
             'arv-ping-url': self._make_ping_url(arvados_node)
         }
         tags.update(self.tags)
 
+        name = self.create_cloud_name(arvados_node)
         customdata = """#!/bin/sh
 mkdir -p    /var/tmp/arv-node-data/meta-data
 echo %s > /var/tmp/arv-node-data/arv-ping-url
index d314d38986e0df62a3c79624c28e77c7630cdbb6..8deabbd50a6163da537193d0df39ac720f1d04d0 100644 (file)
@@ -64,8 +64,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
     def _init_subnet_id(self, subnet_id):
         return 'ex_subnet', self.search_for(subnet_id, 'ex_list_subnets')
 
+    create_cloud_name = staticmethod(arvados_node_fqdn)
+
     def arvados_create_kwargs(self, size, arvados_node):
-        return {'name': arvados_node_fqdn(arvados_node),
+        return {'name': self.create_cloud_name(arvados_node),
                 'ex_userdata': self._make_ping_url(arvados_node)}
 
     def post_create_node(self, cloud_node):
index be9988333b60ae4a9bae2711fe8c50b4f65831d2..b853f00a6728693cce4b855021e18bb35c869087 100644 (file)
@@ -60,9 +60,12 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             self.create_kwargs['ex_metadata']['sshKeys'] = (
                 'root:' + ssh_file.read().strip())
 
+    def create_cloud_name(self, arvados_node):
+        uuid_parts = arvados_node['uuid'].split('-', 2)
+        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
+
     def arvados_create_kwargs(self, size, arvados_node):
-        cluster_id, _, node_id = arvados_node['uuid'].split('-')
-        name = 'compute-{}-{}'.format(node_id, cluster_id)
+        name = self.create_cloud_name(arvados_node)
         disks = [
             {'autoDelete': True,
              'boot': True,
index 15891a92bcf8a7fdd5365595489e5efe96a5e0cd..b54461c47d81b9f87bc1096da254cdf60e3f8aa9 100644 (file)
@@ -44,7 +44,8 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
                        'poll_stale_after': '600',
                        'max_total_price': '0',
                        'boot_fail_after': str(sys.maxint),
-                       'node_stale_after': str(60 * 60 * 2)},
+                       'node_stale_after': str(60 * 60 * 2),
+                       'watchdog': '600'},
             'Logging': {'file': '/dev/stderr',
                         'level': 'WARNING'},
         }.iteritems():
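
The new 'watchdog' default is read like the other Daemon settings; a minimal sketch of consuming it with ConfigParser (section and option names as above, value in seconds):

    import ConfigParser

    config = ConfigParser.SafeConfigParser()
    config.add_section('Daemon')
    config.set('Daemon', 'watchdog', '600')                  # default added above
    watchdog_timeout = config.getint('Daemon', 'watchdog')   # -> 600
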
index 7976f21f1a11b8083593a8e7ac9a68c494a47e16..a809148cdf8cadf37ae6da24a01603ca93fbf2da 100644 (file)
@@ -19,7 +19,7 @@ class _ComputeNodeRecord(object):
         self.cloud_node = cloud_node
         self.arvados_node = arvados_node
         self.assignment_time = assignment_time
-
+        self.shutdown_actor = None
 
 class _BaseNodeTracker(object):
     def __init__(self):
@@ -140,9 +140,7 @@ class NodeManagerDaemonActor(actor_class):
         self.cloud_nodes = _CloudNodeTracker()
         self.arvados_nodes = _ArvadosNodeTracker()
         self.booting = {}       # Actor IDs to ComputeNodeSetupActors
-        self.booted = {}        # Cloud node IDs to _ComputeNodeRecords
-        self.shutdowns = {}     # Cloud node IDs to ComputeNodeShutdownActors
-        self.sizes_booting_shutdown = {} # Actor IDs or Cloud node IDs to node size
+        self.sizes_booting = {} # Actor IDs to node size
 
     def on_start(self):
         self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
@@ -182,127 +180,133 @@ class NodeManagerDaemonActor(actor_class):
         record = _ComputeNodeRecord(actor.proxy(), cloud_node)
         return record
 
+    def _register_cloud_node(self, node):
+        rec = self.cloud_nodes.get(node.id)
+        if rec is None:
+            self._logger.info("Registering new cloud node %s", node.id)
+            record = self._new_node(node)
+            self.cloud_nodes.add(record)
+        else:
+            rec.cloud_node = node
+
     def update_cloud_nodes(self, nodelist):
         self._update_poll_time('cloud_nodes')
-        for key, node in self.cloud_nodes.update_from(nodelist):
-            self._logger.info("Registering new cloud node %s", key)
-            if key in self.booted:
-                record = self.booted.pop(key)
-            else:
-                record = self._new_node(node)
-            self.cloud_nodes.add(record)
-            for arv_rec in self.arvados_nodes.unpaired():
-                if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
-                    self._pair_nodes(record, arv_rec.arvados_node)
-                    break
-        for key, record in self.cloud_nodes.orphans.iteritems():
-            if key in self.shutdowns:
+        for _, node in self.cloud_nodes.update_from(nodelist):
+            self._register_cloud_node(node)
+
+        self.try_pairing()
+
+        for record in self.cloud_nodes.orphans.itervalues():
+            if record.shutdown_actor:
                 try:
-                    self.shutdowns[key].stop().get()
+                    record.shutdown_actor.stop()
                 except pykka.ActorDeadError:
                     pass
-                del self.shutdowns[key]
-                del self.sizes_booting_shutdown[key]
-            record.actor.stop()
-            record.cloud_node = None
+                record.shutdown_actor = None
+
+            # A recently booted node is one whose setup actor completed
+            # successfully but which has not yet appeared in the cloud node
+            # list.  Such a node carries the _nodemanager_recently_booted
+            # attribute, which means (unless we're shutting it down) we want to
+            # put it back into the cloud node list.  Once it really appears in
+            # the cloud list, the object in record.cloud_node is replaced by a
+            # new one that lacks the "_nodemanager_recently_booted" attribute.
+            if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
+                self.cloud_nodes.add(record)
+            else:
+                record.actor.stop()
+                record.cloud_node = None
+
+    def _register_arvados_node(self, key, arv_node):
+        self._logger.info("Registering new Arvados node %s", key)
+        record = _ComputeNodeRecord(arvados_node=arv_node)
+        self.arvados_nodes.add(record)
 
     def update_arvados_nodes(self, nodelist):
         self._update_poll_time('arvados_nodes')
         for key, node in self.arvados_nodes.update_from(nodelist):
-            self._logger.info("Registering new Arvados node %s", key)
-            record = _ComputeNodeRecord(arvados_node=node)
-            self.arvados_nodes.add(record)
-        for arv_rec in self.arvados_nodes.unpaired():
-            arv_node = arv_rec.arvados_node
-            for cloud_rec in self.cloud_nodes.unpaired():
-                if cloud_rec.actor.offer_arvados_pair(arv_node).get():
-                    self._pair_nodes(cloud_rec, arv_node)
+            self._register_arvados_node(key, node)
+        self.try_pairing()
+
+    def try_pairing(self):
+        for record in self.cloud_nodes.unpaired():
+            for arv_rec in self.arvados_nodes.unpaired():
+                if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
+                    self._pair_nodes(record, arv_rec.arvados_node)
                     break
 
     def _nodes_booting(self, size):
         s = sum(1
                 for c in self.booting.iterkeys()
-                if size is None or self.sizes_booting_shutdown[c].id == size.id)
-        s += sum(1
-                 for c in self.booted.itervalues()
-                 if size is None or c.cloud_node.size.id == size.id)
+                if size is None or self.sizes_booting[c].id == size.id)
         return s
 
-    def _nodes_unpaired(self, size):
-        return sum(1
-                   for c in self.cloud_nodes.unpaired()
-                   if size is None or c.cloud_node.size.id == size.id)
-
-    def _nodes_booted(self, size):
-        return sum(1
-                  for c in self.cloud_nodes.nodes.itervalues()
-                  if size is None or c.cloud_node.size.id == size.id)
-
-    def _nodes_up(self, size):
-        up = self._nodes_booting(size) + self._nodes_booted(size)
+    def _node_states(self, size):
+        states = pykka.get_all(rec.actor.get_state()
+                               for rec in self.cloud_nodes.nodes.itervalues()
+                               if ((size is None or rec.cloud_node.size.id == size.id) and
+                                   rec.shutdown_actor is None))
+        states += ['shutdown' for rec in self.cloud_nodes.nodes.itervalues()
+                   if ((size is None or rec.cloud_node.size.id == size.id) and
+                       rec.shutdown_actor is not None)]
+        return states
+
+    def _state_counts(self, size):
+        states = self._node_states(size)
+        counts = {
+            "booting": self._nodes_booting(size),
+            "unpaired": 0,
+            "busy": 0,
+            "idle": 0,
+            "down": 0,
+            "shutdown": 0
+        }
+        for s in states:
+            counts[s] = counts[s] + 1
+        return counts
+
+    def _nodes_up(self, counts):
+        up = counts["booting"] + counts["unpaired"] + counts["idle"] + counts["busy"]
         return up
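
A worked example of the accounting in _nodes_up() above, with hypothetical counts; nodes that are down or being shut down do not count as "up":

    counts = {"booting": 1, "unpaired": 0, "busy": 3, "idle": 2,
              "down": 1, "shutdown": 1}
    up = counts["booting"] + counts["unpaired"] + counts["idle"] + counts["busy"]  # -> 6
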
 
     def _total_price(self):
         cost = 0
-        cost += sum(self.server_calculator.find_size(self.sizes_booting_shutdown[c].id).price
+        cost += sum(self.server_calculator.find_size(self.sizes_booting[c].id).price
                   for c in self.booting.iterkeys())
         cost += sum(self.server_calculator.find_size(c.cloud_node.size.id).price
-                    for i in (self.booted, self.cloud_nodes.nodes)
-                    for c in i.itervalues())
+                    for c in self.cloud_nodes.nodes.itervalues())
         return cost
 
-    def _nodes_busy(self, size):
-        return sum(1 for busy in
-                   pykka.get_all(rec.actor.in_state('busy') for rec in
-                                 self.cloud_nodes.nodes.itervalues()
-                                 if rec.cloud_node.size.id == size.id)
-                   if busy)
-
-    def _nodes_missing(self, size):
-        return sum(1 for arv_node in
-                   pykka.get_all(rec.actor.arvados_node for rec in
-                                 self.cloud_nodes.nodes.itervalues()
-                                 if rec.cloud_node.size.id == size.id and rec.actor.cloud_node.get().id not in self.shutdowns)
-                   if arv_node and cnode.arvados_node_missing(arv_node, self.node_stale_after))
-
     def _size_wishlist(self, size):
         return sum(1 for c in self.last_wishlist if c.id == size.id)
 
-    def _size_shutdowns(self, size):
-        sh = 0
-        for c in self.shutdowns.iterkeys():
-            try:
-                if self.sizes_booting_shutdown[c].id == size.id:
-                    sh += 1
-            except pykka.ActorDeadError:
-                pass
-        return sh
-
     def _nodes_wanted(self, size):
-        total_up_count = self._nodes_up(None)
-        under_min = self.min_nodes - total_up_count
-        over_max = total_up_count - self.max_nodes
+        total_node_count = self._nodes_booting(None) + len(self.cloud_nodes)
+        under_min = self.min_nodes - total_node_count
+        over_max = total_node_count - self.max_nodes
         total_price = self._total_price()
 
-        if over_max >= 0:
-            return -over_max
-        elif under_min > 0 and size.id == self.min_cloud_size.id:
-            return under_min
+        counts = self._state_counts(size)
 
-        booting_count = self._nodes_booting(size) + self._nodes_unpaired(size)
-        shutdown_count = self._size_shutdowns(size)
-        busy_count = self._nodes_busy(size)
-        up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size))
+        up_count = self._nodes_up(counts)
+        busy_count = counts["busy"]
 
-        self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name,
+        self._logger.info("%s: wishlist %i, up %i (booting %i, unpaired %i, idle %i, busy %i), down %i, shutdown %i", size.name,
                           self._size_wishlist(size),
-                          up_count + busy_count,
-                          booting_count,
-                          up_count - booting_count,
+                          up_count,
+                          counts["booting"],
+                          counts["unpaired"],
+                          counts["idle"],
                           busy_count,
-                          shutdown_count)
+                          counts["down"],
+                          counts["shutdown"])
+
+        if over_max >= 0:
+            return -over_max
+        elif under_min > 0 and size.id == self.min_cloud_size.id:
+            return under_min
 
-        wanted = self._size_wishlist(size) - up_count
+        wanted = self._size_wishlist(size) - (up_count - busy_count)
         if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price):
             can_boot = int((self.max_total_price - total_price) / size.price)
             if can_boot == 0:
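
Putting hypothetical numbers through the wishlist math above:

    size_wishlist = 3
    up_count, busy_count = 5, 3        # e.g. booting 1 + unpaired 0 + idle 1 + busy 3
    wanted = size_wishlist - (up_count - busy_count)   # -> 1 more node of this size
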
@@ -313,10 +317,11 @@ class NodeManagerDaemonActor(actor_class):
             return wanted
 
     def _nodes_excess(self, size):
-        up_count = self._nodes_up(size) - self._size_shutdowns(size)
+        counts = self._state_counts(size)
+        up_count = self._nodes_up(counts)
         if size.id == self.min_cloud_size.id:
             up_count -= self.min_nodes
-        return up_count - self._nodes_busy(size) - self._size_wishlist(size)
+        return up_count - (counts["busy"] + self._size_wishlist(size))
 
     def update_server_wishlist(self, wishlist):
         self._update_poll_time('server_wishlist')
@@ -363,7 +368,7 @@ class NodeManagerDaemonActor(actor_class):
             cloud_client=self._new_cloud(),
             cloud_size=cloud_size).proxy()
         self.booting[new_setup.actor_ref.actor_urn] = new_setup
-        self.sizes_booting_shutdown[new_setup.actor_ref.actor_urn] = cloud_size
+        self.sizes_booting[new_setup.actor_ref.actor_urn] = cloud_size
 
         if arvados_node is not None:
             self.arvados_nodes[arvados_node['uuid']].assignment_time = (
@@ -376,18 +381,19 @@ class NodeManagerDaemonActor(actor_class):
         return pykka.get_all([getattr(actor, name) for name in attr_names])
 
     def node_up(self, setup_proxy):
-        cloud_node = setup_proxy.cloud_node.get()
-        del self.booting[setup_proxy.actor_ref.actor_urn]
-        del self.sizes_booting_shutdown[setup_proxy.actor_ref.actor_urn]
-
+        # Called when a SetupActor has completed.
+        cloud_node, arvados_node = self._get_actor_attrs(
+            setup_proxy, 'cloud_node', 'arvados_node')
         setup_proxy.stop()
+
+        # If cloud_node is None, then node creation wasn't successful and
+        # there isn't anything to do.
         if cloud_node is not None:
-            record = self.cloud_nodes.get(cloud_node.id)
-            if record is None:
-                record = self._new_node(cloud_node)
-                self.booted[cloud_node.id] = record
-            self._timer.schedule(time.time() + self.boot_fail_after,
-                                 self._later.shutdown_unpaired_node, cloud_node.id)
+            # Node creation succeeded.  Update cloud node list.
+            cloud_node._nodemanager_recently_booted = True
+            self._register_cloud_node(cloud_node)
+        del self.booting[setup_proxy.actor_ref.actor_urn]
+        del self.sizes_booting[setup_proxy.actor_ref.actor_urn]
 
     @_check_poll_freshness
     def stop_booting_node(self, size):
@@ -395,9 +401,9 @@ class NodeManagerDaemonActor(actor_class):
         if (nodes_excess < 1) or not self.booting:
             return None
         for key, node in self.booting.iteritems():
-            if node.cloud_size.get().id == size.id and node.stop_if_no_cloud_node().get():
+            if node and node.cloud_size.get().id == size.id and node.stop_if_no_cloud_node().get():
                 del self.booting[key]
-                del self.sizes_booting_shutdown[key]
+                del self.sizes_booting[key]
 
                 if nodes_excess > 1:
                     self._later.stop_booting_node(size)
@@ -406,43 +412,49 @@ class NodeManagerDaemonActor(actor_class):
     def _begin_node_shutdown(self, node_actor, cancellable):
         cloud_node_obj = node_actor.cloud_node.get()
         cloud_node_id = cloud_node_obj.id
-        if cloud_node_id in self.shutdowns:
+        record = self.cloud_nodes[cloud_node_id]
+        if record.shutdown_actor is not None:
             return None
         shutdown = self._node_shutdown.start(
             timer_actor=self._timer, cloud_client=self._new_cloud(),
             arvados_client=self._new_arvados(),
             node_monitor=node_actor.actor_ref, cancellable=cancellable)
-        self.shutdowns[cloud_node_id] = shutdown.proxy()
-        self.sizes_booting_shutdown[cloud_node_id] = cloud_node_obj.size
+        record.shutdown_actor = shutdown.proxy()
         shutdown.tell_proxy().subscribe(self._later.node_finished_shutdown)
 
     @_check_poll_freshness
     def node_can_shutdown(self, node_actor):
         if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
             self._begin_node_shutdown(node_actor, cancellable=True)
-
-    def shutdown_unpaired_node(self, cloud_node_id):
-        for record_dict in [self.cloud_nodes, self.booted]:
-            if cloud_node_id in record_dict:
-                record = record_dict[cloud_node_id]
-                break
-        else:
-            return None
-        if not record.actor.in_state('idle', 'busy').get():
-            self._begin_node_shutdown(record.actor, cancellable=False)
+        elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
+            # Node is unpaired, which means it probably exceeded its booting
+            # grace period without a ping, so shut it down so we can boot a new
+            # node in its place.
+            self._begin_node_shutdown(node_actor, cancellable=False)
+        elif node_actor.in_state('down').get():
+            # Node is down and unlikely to come back.
+            self._begin_node_shutdown(node_actor, cancellable=False)
 
     def node_finished_shutdown(self, shutdown_actor):
-        cloud_node, success, cancel_reason = self._get_actor_attrs(
-            shutdown_actor, 'cloud_node', 'success', 'cancel_reason')
-        shutdown_actor.stop()
+        try:
+            cloud_node, success, cancel_reason = self._get_actor_attrs(
+                shutdown_actor, 'cloud_node', 'success', 'cancel_reason')
+        except pykka.ActorDeadError:
+            return
         cloud_node_id = cloud_node.id
+        record = self.cloud_nodes[cloud_node_id]
+        shutdown_actor.stop()
         if not success:
             if cancel_reason == self._node_shutdown.NODE_BROKEN:
                 self.cloud_nodes.blacklist(cloud_node_id)
-        elif cloud_node_id in self.booted:
-            self.booted.pop(cloud_node_id).actor.stop()
-        del self.shutdowns[cloud_node_id]
-        del self.sizes_booting_shutdown[cloud_node_id]
+            record.shutdown_actor = None
+        else:
+            # If the node went from being booted to being shut down without ever
+            # appearing in the cloud node list, it will have the
+            # _nodemanager_recently_booted attribute; remove it so that the
+            # node can be forgotten completely.
+            if hasattr(self.cloud_nodes[cloud_node_id].cloud_node, "_nodemanager_recently_booted"):
+                del self.cloud_nodes[cloud_node_id].cloud_node._nodemanager_recently_booted
 
     def shutdown(self):
         self._logger.info("Shutting down after signal.")
index 78bd2db5cc05fe9516c10e718506ef11734055db..1be7e46387ff6c5bfe38d4e4805694fb7986cfa7 100644 (file)
@@ -12,6 +12,7 @@ import daemon
 import pykka
 
 from . import config as nmconfig
+from .baseactor import WatchdogActor
 from .daemon import NodeManagerDaemonActor
 from .jobqueue import JobQueueMonitorActor, ServerCalculator
 from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
@@ -125,6 +126,12 @@ def main(args=None):
             node_setup, node_shutdown, node_monitor,
             max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
 
+        WatchdogActor.start(config.getint('Daemon', 'watchdog'),
+                            cloud_node_poller.actor_ref,
+                            arvados_node_poller.actor_ref,
+                            job_queue_poller.actor_ref,
+                            node_daemon.actor_ref)
+
         signal.pause()
         daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
         while not daemon_stopped():
index 95b1329fa603009dd65eb7420981c0d1cb1ed5b2..227b5e5f3471ba4cf2e484461cd2c651f26a96e1 100644 (file)
@@ -205,7 +205,9 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         cloud_node = testutil.cloud_node_mock(61)
         arv_node = testutil.arvados_node_mock(61)
         self.make_mocks(cloud_node, arv_node, shutdown_open=False)
+        self.cloud_client.destroy_node.return_value = False
         self.make_actor(cancellable=True)
+        self.shutdown_actor.cancel_shutdown("test")
         self.check_success_flag(False, 2)
         self.assertFalse(self.arvados_client.nodes().update.called)
 
@@ -219,14 +221,6 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         self.check_success_flag(True)
         self.assertTrue(self.cloud_client.destroy_node.called)
 
-    def test_shutdown_cancelled_when_window_closes(self):
-        self.make_mocks(shutdown_open=False)
-        self.make_actor()
-        self.check_success_flag(False, 2)
-        self.assertFalse(self.cloud_client.destroy_node.called)
-        self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED,
-                         self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
-
     def test_shutdown_retries_when_cloud_fails(self):
         self.make_mocks()
         self.cloud_client.destroy_node.return_value = False
@@ -267,6 +261,16 @@ class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
         self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
         self.driver().sync_node.assert_called_with(cloud_node, arv_node)
 
+    @testutil.no_sleep
+    def test_node_sync_error(self):
+        self.make_actor()
+        cloud_node = testutil.cloud_node_mock()
+        arv_node = testutil.arvados_node_mock()
+        self.driver().sync_node.side_effect = (IOError, Exception, True)
+        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+        self.driver().sync_node.assert_called_with(cloud_node, arv_node)
 
 class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                       unittest.TestCase):
@@ -296,17 +300,24 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
 
     def test_in_state_when_unpaired(self):
         self.make_actor()
-        self.assertIsNone(self.node_state('idle', 'busy'))
+        self.assertTrue(self.node_state('unpaired'))
 
     def test_in_state_when_pairing_stale(self):
         self.make_actor(arv_node=testutil.arvados_node_mock(
                 job_uuid=None, age=90000))
-        self.assertIsNone(self.node_state('idle', 'busy'))
+        self.assertTrue(self.node_state('down'))
 
     def test_in_state_when_no_state_available(self):
         self.make_actor(arv_node=testutil.arvados_node_mock(
                 crunch_worker_state=None))
-        self.assertIsNone(self.node_state('idle', 'busy'))
+        print(self.node_actor.get_state().get())
+        self.assertTrue(self.node_state('idle'))
+
+    def test_in_state_when_no_state_available_old(self):
+        self.make_actor(arv_node=testutil.arvados_node_mock(
+                crunch_worker_state=None, age=90000))
+        print(self.node_actor.get_state().get())
+        self.assertTrue(self.node_state('down'))
 
     def test_in_idle_state(self):
         self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None))
@@ -346,28 +357,32 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_no_shutdown_booting(self):
         self.make_actor()
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is still booting"))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
 
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertEquals((True, "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
-    def test_no_shutdown_missing(self):
+    def test_shutdown_missing(self):
         arv_node = testutil.arvados_node_mock(10, job_uuid=None,
                                               crunch_worker_state="down",
                                               last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
-    def test_no_shutdown_running_broken(self):
+    def test_shutdown_running_broken(self):
         arv_node = testutil.arvados_node_mock(12, job_uuid=None,
                                               crunch_worker_state="down")
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_missing_broken(self):
         arv_node = testutil.arvados_node_mock(11, job_uuid=None,
@@ -376,27 +391,31 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(11, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("shutdown window is not open."))
+        self.assertEquals((False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_node_running_job(self):
         self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
-    def test_no_shutdown_when_node_state_unknown(self):
+    def test_shutdown_when_node_state_unknown(self):
         self.make_actor(5, testutil.arvados_node_mock(
             5, crunch_worker_state=None))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals((True, "node state is ('idle', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_node_state_stale(self):
         self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals((False, "node state is stale"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_arvados_node_match(self):
         self.make_actor(2)
index 8648783bac5889f11a328af3b277bd1f21da5665..85a40ceeb25964e15e3696141b373d9ef87cee4d 100644 (file)
@@ -3,6 +3,7 @@
 from __future__ import absolute_import, print_function
 
 import subprocess
+import time
 import unittest
 
 import mock
@@ -40,11 +41,11 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
             self.check_success_after_reset(proc_mock, end_state)
         return test
 
-    for wait_state in ['alloc\n', 'drng\n', 'idle*\n']:
+    for wait_state in ['alloc\n', 'drng\n']:
         locals()['test_wait_while_' + wait_state.strip()
                  ] = make_wait_state_test(start_state=wait_state)
 
-    for end_state in ['down\n', 'down*\n', 'drain\n', 'fail\n']:
+    for end_state in ['idle*\n', 'down\n', 'down*\n', 'drain\n', 'fail\n']:
         locals()['test_wait_until_' + end_state.strip()
                  ] = make_wait_state_test(end_state=end_state)
 
@@ -54,27 +55,30 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
 
     def test_slurm_bypassed_when_no_arvados_node(self, proc_mock):
         # Test we correctly handle a node that failed to bootstrap.
-        proc_mock.return_value = 'idle\n'
+        proc_mock.return_value = 'down\n'
         self.make_actor(start_time=0)
         self.check_success_flag(True)
         self.assertFalse(proc_mock.called)
 
-    def test_node_undrained_when_shutdown_window_closes(self, proc_mock):
-        proc_mock.side_effect = iter(['drng\n', 'idle\n'])
-        self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
-        self.make_actor()
-        self.check_success_flag(False, 2)
-        self.check_slurm_got_args(proc_mock, 'NodeName=compute99', 'State=RESUME')
-
-    def test_alloc_node_undrained_when_shutdown_window_closes(self, proc_mock):
-        proc_mock.side_effect = iter(['alloc\n'])
-        self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
-        self.make_actor()
-        self.check_success_flag(False, 2)
-        self.check_slurm_got_args(proc_mock, 'sinfo', '--noheader', '-o', '%t', '-n', 'compute99')
+    def test_node_undrained_when_shutdown_cancelled(self, proc_mock):
+        try:
+            proc_mock.side_effect = iter(['', 'drng\n', 'drng\n', ''])
+            self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
+            self.timer = testutil.MockTimer(False)
+            self.make_actor()
+            self.busywait(lambda: proc_mock.call_args is not None)
+            self.shutdown_actor.cancel_shutdown("test").get(self.TIMEOUT)
+            self.check_success_flag(False, 2)
+            self.assertEqual(proc_mock.call_args_list,
+                             [mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=DRAIN', 'Reason=Node Manager shutdown']),
+                              mock.call(['sinfo', '--noheader', '-o', '%t', '-n', 'compute99']),
+                              mock.call(['sinfo', '--noheader', '-o', '%t', '-n', 'compute99']),
+                              mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=RESUME'])])
+        finally:
+            self.shutdown_actor.actor_ref.stop()
 
     def test_cancel_shutdown_retry(self, proc_mock):
-        proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n'])
+        proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n', 'idle\n'])
         self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
         self.make_actor()
         self.check_success_flag(False, 2)
diff --git a/services/nodemanager/tests/test_computenode_driver.py b/services/nodemanager/tests/test_computenode_driver.py
new file mode 100644 (file)
index 0000000..71a39a6
--- /dev/null
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import unittest
+
+import libcloud.common.types as cloud_types
+import mock
+
+import arvnodeman.computenode.driver as driver_base
+from . import testutil
+
+class ComputeNodeDriverTestCase(unittest.TestCase):
+    def setUp(self):
+        self.driver_mock = mock.MagicMock(name='driver_mock')
+        driver_base.BaseComputeNodeDriver.SEARCH_CACHE = {}
+
+    def test_search_for_now_uses_public_method(self):
+        image = testutil.cloud_object_mock(1)
+        self.driver_mock().list_images.return_value = [image]
+        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+        self.assertIs(image, driver.search_for_now('id_1', 'list_images'))
+        self.assertEqual(1, self.driver_mock().list_images.call_count)
+
+    def test_search_for_now_uses_private_method(self):
+        net = testutil.cloud_object_mock(1)
+        self.driver_mock().ex_list_networks.return_value = [net]
+        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+        self.assertIs(net, driver.search_for_now('id_1', 'ex_list_networks'))
+        self.assertEqual(1, self.driver_mock().ex_list_networks.call_count)
+
+    def test_search_for_now_raises_ValueError_on_zero_results(self):
+        self.driver_mock().list_images.return_value = []
+        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+        with self.assertRaises(ValueError) as test:
+            driver.search_for_now('id_1', 'list_images')
+
+    def test_search_for_now_raises_ValueError_on_extra_results(self):
+        image = testutil.cloud_object_mock(1)
+        self.driver_mock().list_images.return_value = [image, image]
+        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+        with self.assertRaises(ValueError) as test:
+            driver.search_for_now('id_1', 'list_images')
+
+    def test_search_for_now_does_not_cache_results(self):
+        image1 = testutil.cloud_object_mock(1)
+        image2 = testutil.cloud_object_mock(1)
+        self.driver_mock().list_images.side_effect = [[image1], [image2]]
+        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+        self.assertIsNot(driver.search_for_now('id_1', 'list_images'),
+                         driver.search_for_now('id_1', 'list_images'))
+        self.assertEqual(2, self.driver_mock().list_images.call_count)
+
+    def test_search_for_returns_cached_results(self):
+        image1 = testutil.cloud_object_mock(1)
+        image2 = testutil.cloud_object_mock(1)
+        self.driver_mock().list_images.side_effect = [[image1], [image2]]
+        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+        self.assertIs(driver.search_for('id_1', 'list_images'),
+                      driver.search_for('id_1', 'list_images'))
+        self.assertEqual(1, self.driver_mock().list_images.call_count)
index 5721abc5f87efeaf029c2eb476bb0fcdf6a14f2a..59fc503128aef69be02a6a45aabf86f80fde6540 100644 (file)
@@ -110,3 +110,27 @@ echo z1.test > /var/tmp/arv-node-data/meta-data/instance-type
         self.driver_mock().create_node.side_effect = IOError
         n = driver.create_node(testutil.MockSize(1), arv_node)
         self.assertEqual('compute-000000000000001-zzzzz', n.name)
+
+    def test_ex_fetch_nic_false(self):
+        arv_node = testutil.arvados_node_mock(1, hostname=None)
+        driver = self.new_driver(create_kwargs={"tag_arvados-class": "dynamic-compute"})
+        nodelist = [testutil.cloud_node_mock(1, tags={"arvados-class": "dynamic-compute"})]
+        nodelist[0].name = 'compute-000000000000001-zzzzz'
+        self.driver_mock().list_nodes.return_value = nodelist
+        n = driver.list_nodes()
+        self.assertEqual(nodelist, n)
+        self.driver_mock().list_nodes.assert_called_with(ex_fetch_nic=False, ex_resource_group='TestResourceGroup')
+
+    def test_create_can_find_node_after_timeout(self):
+        super(AzureComputeNodeDriverTestCase,
+              self).test_create_can_find_node_after_timeout(
+                  create_kwargs={'tag_arvados-class': 'test'},
+                  node_extra={'tags': {'arvados-class': 'test'}})
+
+    def test_node_found_after_timeout_has_fixed_size(self):
+        size = testutil.MockSize(4)
+        node_props = {'hardwareProfile': {'vmSize': size.id}}
+        cloud_node = testutil.cloud_node_mock(
+            size=None, tags={'arvados-class': 'test'}, properties=node_props)
+        self.check_node_found_after_timeout_has_fixed_size(
+            size, cloud_node, {'tag_arvados-class': 'test'})
index e8b2fa36c582876359fa6e667f80e9a7cb1f3013..84e061d867ff42033fd526e92440695702a3dd8c 100644 (file)
@@ -231,6 +231,11 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
         self.assertIs(node, nodelist[0])
         self.assertIs(size, nodelist[0].size)
 
+    def test_node_found_after_timeout_has_fixed_size(self):
+        size = testutil.MockSize(4)
+        cloud_node = testutil.cloud_node_mock(size=size.id)
+        self.check_node_found_after_timeout_has_fixed_size(size, cloud_node)
+
     def test_list_empty_nodes(self):
         self.driver_mock().list_nodes.return_value = []
         self.assertEqual([], self.new_driver().list_nodes())
index 2daca08ecf7eb114173725bb88deff2969ad5bf3..fe7b0fe2c5cfeee879273ff4daaac821ca0ed2e8 100644 (file)
@@ -55,6 +55,11 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         for name in ['cloud_nodes', 'arvados_nodes', 'server_wishlist']:
             setattr(self, name + '_poller', mock.MagicMock(name=name + '_mock'))
         self.arv_factory = mock.MagicMock(name='arvados_mock')
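+        # Simulate the API server handing back a distinct Arvados node record
+        # for each of the first two nodes().create().execute() calls.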
+        api_client = mock.MagicMock(name='api_client')
+        api_client.nodes().create().execute.side_effect = [testutil.arvados_node_mock(1),
+                                                           testutil.arvados_node_mock(2)]
+        self.arv_factory.return_value = api_client
+
         self.cloud_factory = mock.MagicMock(name='cloud_mock')
         self.cloud_factory().node_start_time.return_value = time.time()
         self.cloud_updates = mock.MagicMock(name='updates_mock')
@@ -76,10 +81,10 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
             min_nodes, max_nodes, 600, 1800, 3600,
             self.node_setup, self.node_shutdown,
             max_total_price=max_total_price).proxy()
-        if cloud_nodes is not None:
-            self.daemon.update_cloud_nodes(cloud_nodes).get(self.TIMEOUT)
         if arvados_nodes is not None:
             self.daemon.update_arvados_nodes(arvados_nodes).get(self.TIMEOUT)
+        if cloud_nodes is not None:
+            self.daemon.update_cloud_nodes(cloud_nodes).get(self.TIMEOUT)
         if want_sizes is not None:
             self.daemon.update_server_wishlist(want_sizes).get(self.TIMEOUT)
 
@@ -167,7 +172,8 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
                                       testutil.cloud_node_mock(2)],
                          arvados_nodes=[testutil.arvados_node_mock(1),
-                                      testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
+                                      testutil.arvados_node_mock(2,
+                                                                 last_ping_at='1970-01-01T01:02:03.04050607Z')],
                          want_sizes=[size, size])
         self.stop_proxy(self.daemon)
         self.assertTrue(self.node_setup.start.called)
@@ -209,8 +215,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         mock_node_monitor.proxy.return_value = mock.NonCallableMock(cloud_node=get_cloud_node)
         mock_shutdown = self.node_shutdown.start(node_monitor=mock_node_monitor)
 
-        self.daemon.shutdowns.get()[cloud_nodes[1].id] = mock_shutdown.proxy()
-        self.daemon.sizes_booting_shutdown.get()[cloud_nodes[1].id] = size
+        self.daemon.cloud_nodes.get()[cloud_nodes[1].id].shutdown_actor = mock_shutdown.proxy()
 
         self.assertEqual(2, self.alive_monitor_count())
         for mon_ref in self.monitor_list():
@@ -233,6 +238,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         arv_node = testutil.arvados_node_mock(2, job_uuid=True)
         self.make_daemon([testutil.cloud_node_mock(2, size=size)], [arv_node],
                          [size], avail_sizes=[(size, {"cores":1})])
+        self.busywait(lambda: self.node_setup.start.called)
         self.stop_proxy(self.daemon)
         self.assertTrue(self.node_setup.start.called)
 
@@ -279,18 +285,19 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.last_setup.arvados_node.get.return_value = arv_node
         return self.last_setup
 
-    def test_no_new_node_when_booted_node_not_usable(self):
+    def test_new_node_when_booted_node_not_usable(self):
         cloud_node = testutil.cloud_node_mock(4)
         arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
         setup = self.start_node_boot(cloud_node, arv_node)
         self.daemon.node_up(setup).get(self.TIMEOUT)
         self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([cloud_node])
         self.daemon.update_arvados_nodes([arv_node])
+        self.daemon.update_cloud_nodes([cloud_node])
+        self.monitor_list()[0].proxy().cloud_node_start_time = time.time()-1801
         self.daemon.update_server_wishlist(
             [testutil.MockSize(1)]).get(self.TIMEOUT)
         self.stop_proxy(self.daemon)
-        self.assertEqual(1, self.node_setup.start.call_count)
+        self.assertEqual(2, self.node_setup.start.call_count)
 
     def test_no_duplication_when_booting_node_listed_fast(self):
         # Test that we don't start two ComputeNodeMonitorActors when
@@ -350,6 +357,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         shutdown = self.node_shutdown.start().proxy()
         shutdown.cloud_node.get.return_value = cloud_node
         self.daemon.node_finished_shutdown(shutdown).get(self.TIMEOUT)
+        self.daemon.update_cloud_nodes([])
         self.assertTrue(shutdown.stop.called,
                         "shutdown actor not stopped after finishing")
         self.assertTrue(monitor.actor_ref.actor_stopped.wait(self.TIMEOUT),
@@ -362,20 +370,25 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
 
     def test_booted_node_shut_down_when_never_listed(self):
         setup = self.start_node_boot()
+        self.cloud_factory().node_start_time.return_value = time.time() - 3601
         self.daemon.node_up(setup).get(self.TIMEOUT)
         self.assertEqual(1, self.alive_monitor_count())
         self.assertFalse(self.node_shutdown.start.called)
-        self.timer.deliver()
+        now = time.time()
+        self.monitor_list()[0].tell_proxy().consider_shutdown()
+        self.busywait(lambda: self.node_shutdown.start.called)
         self.stop_proxy(self.daemon)
         self.assertShutdownCancellable(False)
 
     def test_booted_node_shut_down_when_never_paired(self):
         cloud_node = testutil.cloud_node_mock(2)
         setup = self.start_node_boot(cloud_node)
+        self.cloud_factory().node_start_time.return_value = time.time() - 3601
         self.daemon.node_up(setup).get(self.TIMEOUT)
         self.assertEqual(1, self.alive_monitor_count())
         self.daemon.update_cloud_nodes([cloud_node])
-        self.timer.deliver()
+        self.monitor_list()[0].tell_proxy().consider_shutdown()
+        self.busywait(lambda: self.node_shutdown.start.called)
         self.stop_proxy(self.daemon)
         self.assertShutdownCancellable(False)
 
@@ -383,11 +396,12 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         cloud_node = testutil.cloud_node_mock(4)
         arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
         setup = self.start_node_boot(cloud_node, arv_node)
+        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
         self.daemon.node_up(setup).get(self.TIMEOUT)
         self.assertEqual(1, self.alive_monitor_count())
+        self.monitor_list()[0].proxy().cloud_node_start_time = time.time()-3601
         self.daemon.update_cloud_nodes([cloud_node])
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.timer.deliver()
+        self.busywait(lambda: self.node_shutdown.start.called)
         self.stop_proxy(self.daemon)
         self.assertShutdownCancellable(False)
 
@@ -441,8 +455,9 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
 
     def test_shutdown_declined_at_wishlist_capacity(self):
         cloud_node = testutil.cloud_node_mock(1)
+        arv_node = testutil.arvados_node_mock(1)
         size = testutil.MockSize(1)
-        self.make_daemon(cloud_nodes=[cloud_node], want_sizes=[size])
+        self.make_daemon(cloud_nodes=[cloud_node], arvados_nodes=[arv_node], want_sizes=[size])
         self.assertEqual(1, self.alive_monitor_count())
         monitor = self.monitor_list()[0].proxy()
         self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
@@ -451,7 +466,8 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
 
     def test_shutdown_declined_below_min_nodes(self):
         cloud_node = testutil.cloud_node_mock(1)
-        self.make_daemon(cloud_nodes=[cloud_node], min_nodes=1)
+        arv_node = testutil.arvados_node_mock(1)
+        self.make_daemon(cloud_nodes=[cloud_node], arvados_nodes=[arv_node], min_nodes=1)
         self.assertEqual(1, self.alive_monitor_count())
         monitor = self.monitor_list()[0].proxy()
         self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
@@ -522,7 +538,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
     def test_nodes_shutting_down_replaced_below_max_nodes(self):
         size = testutil.MockSize(6)
         cloud_node = testutil.cloud_node_mock(6, size=size)
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(6)],
+        self.make_daemon([cloud_node], [testutil.arvados_node_mock(6, crunch_worker_state='down')],
                          avail_sizes=[(size, {"cores":1})])
         self.assertEqual(1, self.alive_monitor_count())
         monitor = self.monitor_list()[0].proxy()
@@ -602,13 +618,6 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.stop_proxy(self.daemon)
         self.assertEqual(1, self.last_shutdown.stop.call_count)
 
-    def busywait(self, f):
-        n = 0
-        while not f() and n < 10:
-            time.sleep(.1)
-            n += 1
-        self.assertTrue(f())
-
     def test_node_create_two_sizes(self):
         small = testutil.MockSize(1)
         big = testutil.MockSize(2)
@@ -670,7 +679,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
             self.daemon.node_can_shutdown(c.actor)
 
         booting = self.daemon.booting.get()
-        shutdowns = self.daemon.shutdowns.get()
+        cloud_nodes = self.daemon.cloud_nodes.get()
 
         self.stop_proxy(self.daemon)
 
@@ -686,8 +695,9 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
 
         # shutting down a small node
         sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in shutdowns.itervalues():
-            sizecounts[b.cloud_node.get().size.id] += 1
+        for b in cloud_nodes.nodes.itervalues():
+            if b.shutdown_actor is not None:
+                sizecounts[b.cloud_node.size.id] += 1
         self.assertEqual(1, sizecounts[small.id])
         self.assertEqual(0, sizecounts[big.id])
 
index 35605fcd8c564ef910bfcd352d90e36d0680e064..dea7230dd3abdfc4133841027f8b41370953346f 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import absolute_import, print_function
 
 import errno
 import logging
+import time
 import threading
 import unittest
 
@@ -22,18 +23,33 @@ class BogusActor(arvnodeman.baseactor.BaseNodeManagerActor):
     def doStuff(self):
         raise self.exp
 
+    def ping(self):
+        # Called by WatchdogActorTest. This delay is longer than the watchdog's
+        # 1-second timeout, so the ping should fail and trigger the watchdog.
+        time.sleep(2)
+        return True
+
 class ActorUnhandledExceptionTest(unittest.TestCase):
     def test_fatal_error(self):
         for e in (MemoryError(), threading.ThreadError(), OSError(errno.ENOMEM, "")):
-            with mock.patch('os.killpg') as killpg_mock:
+            with mock.patch('os.kill') as kill_mock:
                 act = BogusActor.start(e).tell_proxy()
                 act.doStuff()
                 act.actor_ref.stop(block=True)
-                self.assertTrue(killpg_mock.called)
-
-    def test_nonfatal_error(self):
-        with mock.patch('os.killpg') as killpg_mock:
-            act = BogusActor.start(OSError(errno.ENOENT, "")).tell_proxy()
-            act.doStuff()
-            act.actor_ref.stop(block=True)
-            self.assertFalse(killpg_mock.called)
+                self.assertTrue(kill_mock.called)
+
+    @mock.patch('os.kill')
+    def test_nonfatal_error(self, kill_mock):
+        act = BogusActor.start(OSError(errno.ENOENT, "")).tell_proxy()
+        act.doStuff()
+        act.actor_ref.stop(block=True)
+        self.assertFalse(kill_mock.called)
+
+class WatchdogActorTest(unittest.TestCase):
+    @mock.patch('os.kill')
+    def test_watchdog_timeout(self, kill_mock):
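+        # WatchdogActor.start(1, act) sets a 1-second ping timeout; BogusActor.ping()
+        # sleeps for 2 seconds, so the watchdog should give up and call os.kill (mocked here).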
+        act = BogusActor.start(OSError(errno.ENOENT, ""))
+        watch = arvnodeman.baseactor.WatchdogActor.start(1, act)
+        watch.stop(block=True)
+        act.stop(block=True)
+        self.assertTrue(kill_mock.called)
index b9e7beabb5ca1237cc1b64619c9a412872c2923b..1b6aab3cafed16cfc0960d1a39a32d669fe53ffb 100644 (file)
@@ -6,6 +6,7 @@ import datetime
 import threading
 import time
 
+import libcloud.common.types as cloud_types
 import mock
 import pykka
 
@@ -118,6 +119,13 @@ class ActorTestMixin(object):
             if result is not unassigned:
                 return result
 
+    def busywait(self, f):
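+        """Poll f() up to 10 times, 0.1s apart, and assert it eventually returns true."""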
+        n = 0
+        while not f() and n < 10:
+            time.sleep(.1)
+            n += 1
+        self.assertTrue(f())
+
 
 class DriverTestMixin(object):
     def setUp(self):
@@ -142,6 +150,44 @@ class DriverTestMixin(object):
             self.assertTrue(self.driver_mock.called)
             self.assertIs(driver.real, driver_mock2)
 
+    def test_create_can_find_node_after_timeout(self, create_kwargs={}, node_extra={}):
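+        # A LibcloudError from create_node may just mean the request timed out after
+        # the node was created; list_nodes is arranged to return a node with the
+        # expected cloud name, and the driver should return that node rather than fail.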
+        driver = self.new_driver(create_kwargs=create_kwargs)
+        arv_node = arvados_node_mock()
+        cloud_node = cloud_node_mock(**node_extra)
+        cloud_node.name = driver.create_cloud_name(arv_node)
+        create_method = self.driver_mock().create_node
+        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
+        list_method = self.driver_mock().list_nodes
+        list_method.return_value = [cloud_node]
+        actual = driver.create_node(MockSize(1), arv_node)
+        self.assertIs(cloud_node, actual)
+
+    def test_create_can_raise_exception_after_timeout(self):
+        driver = self.new_driver()
+        arv_node = arvados_node_mock()
+        create_method = self.driver_mock().create_node
+        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
+        list_method = self.driver_mock().list_nodes
+        list_method.return_value = []
+        with self.assertRaises(cloud_types.LibcloudError) as exc_test:
+            driver.create_node(MockSize(1), arv_node)
+        self.assertIs(create_method.side_effect, exc_test.exception)
+
+    def check_node_found_after_timeout_has_fixed_size(self, size, cloud_node,
+                                                      create_kwargs={}):
+        # This method needs to be called explicitly by driver test suites
+        # that need it.
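+        # It simulates a create_node timeout where the node later reported by
+        # list_nodes carries no usable size (None or just a size id), and checks
+        # that the driver fills in the full NodeSize object from list_sizes.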
+        self.driver_mock().list_sizes.return_value = [size]
+        driver = self.new_driver(create_kwargs=create_kwargs)
+        arv_node = arvados_node_mock()
+        cloud_node.name = driver.create_cloud_name(arv_node)
+        create_method = self.driver_mock().create_node
+        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
+        self.driver_mock().list_nodes.return_value = [cloud_node]
+        actual = driver.create_node(size, arv_node)
+        self.assertIs(size, actual.size)
+
+
 class RemotePollLoopActorTestMixin(ActorTestMixin):
     def build_monitor(self, *args, **kwargs):
         self.timer = mock.MagicMock(name='timer_mock')
index 88726a46836e636dd895288d9244e15be34b8515..991ccec674e1bd65350e3e59ecbf79ee8eebd8c1 100755 (executable)
@@ -39,10 +39,6 @@ if test -z "$ARVADOS_ROOT" ; then
     ARVADOS_ROOT="$ARVBOX_DATA/arvados"
 fi
 
-if test -z "$ARVADOS_DEV_ROOT" ; then
-    ARVADOS_DEV_ROOT="$ARVBOX_DATA/arvados-dev"
-fi
-
 if test -z "$SSO_ROOT" ; then
     SSO_ROOT="$ARVBOX_DATA/sso-devise-omniauth-provider"
 fi
@@ -51,6 +47,8 @@ PG_DATA="$ARVBOX_DATA/postgres"
 VAR_DATA="$ARVBOX_DATA/var"
 PASSENGER="$ARVBOX_DATA/passenger"
 GEMS="$ARVBOX_DATA/gems"
+PIPCACHE="$ARVBOX_DATA/pip"
+GOSTUFF="$ARVBOX_DATA/gopath"
 
 getip() {
     docker inspect $ARVBOX_CONTAINER | grep \"IPAddress\" | head -n1 | tr -d ' ":,\n' | cut -c10-
@@ -103,8 +101,13 @@ wait_for_arvbox() {
 }
 
 run() {
+    if docker ps -a --filter "status=running" | grep -E "$ARVBOX_CONTAINER$" -q ; then
+        echo "Container $ARVBOX_CONTAINER is already running"
+        exit 0
+    fi
+
     if docker ps -a | grep -E "$ARVBOX_CONTAINER$" -q ; then
-        echo "Container $ARVBOX_CONTAINER is already running, use stop, restart or rebuild"
+        echo "Container $ARVBOX_CONTAINER already exists but is not running; use restart or rebuild"
         exit 1
     fi
 
@@ -127,7 +130,8 @@ run() {
               --publish=25100:25100
               --publish=25107:25107
               --publish=25108:25108
-              --publish=8001:8001"
+              --publish=8001:8001
+              --publish=8002:8002"
     else
         PUBLIC=""
     fi
@@ -153,7 +157,8 @@ run() {
         updateconf
         wait_for_arvbox
     else
-        mkdir -p "$PG_DATA" "$VAR_DATA" "$PASSENGER" "$GEMS"
+        mkdir -p "$PG_DATA" "$VAR_DATA" "$PASSENGER" "$GEMS" "$PIPCACHE" "$GOSTUFF"
+
 
         if ! test -d "$ARVADOS_ROOT" ; then
             git clone https://github.com/curoverse/arvados.git "$ARVADOS_ROOT"
@@ -165,10 +170,6 @@ run() {
         if test "$1" = test ; then
             shift
 
-            if ! test -d "$ARVADOS_DEV_ROOT" ; then
-                git clone https://github.com/curoverse/arvados-dev.git "$ARVADOS_DEV_ROOT"
-            fi
-
             mkdir -p $VAR_DATA/test
 
             docker run \
@@ -176,12 +177,13 @@ run() {
                    --name=$ARVBOX_CONTAINER \
                    --privileged \
                    "--volume=$ARVADOS_ROOT:/usr/src/arvados:rw" \
-                   "--volume=$ARVADOS_DEV_ROOT:/usr/src/arvados-dev:rw" \
                    "--volume=$SSO_ROOT:/usr/src/sso:rw" \
                    "--volume=$PG_DATA:/var/lib/postgresql:rw" \
                    "--volume=$VAR_DATA:/var/lib/arvados:rw" \
                    "--volume=$PASSENGER:/var/lib/passenger:rw" \
                    "--volume=$GEMS:/var/lib/gems:rw" \
+                   "--volume=$PIPCACHE:/var/lib/pip:rw" \
+                   "--volume=$GOSTUFF:/var/lib/gopath:rw" \
                    arvados/arvbox-dev \
                    /usr/local/bin/runsvinit -svdir=/etc/test-service
 
@@ -203,7 +205,7 @@ run() {
             docker exec -ti \
                    $ARVBOX_CONTAINER \
                    /usr/local/lib/arvbox/runsu.sh \
-                   /usr/src/arvados-dev/jenkins/run-tests.sh \
+                   /usr/src/arvados/build/run-tests.sh \
                    --temp /var/lib/arvados/test \
                    WORKSPACE=/usr/src/arvados \
                    GEM_HOME=/var/lib/gems \
@@ -219,6 +221,8 @@ run() {
                    "--volume=$VAR_DATA:/var/lib/arvados:rw" \
                    "--volume=$PASSENGER:/var/lib/passenger:rw" \
                    "--volume=$GEMS:/var/lib/gems:rw" \
+                   "--volume=$PIPCACHE:/var/lib/pip:rw" \
+                   "--volume=$GOSTUFF:/var/lib/gopath:rw" \
                    $PUBLIC \
                    arvados/arvbox-dev
             updateconf
@@ -249,11 +253,11 @@ build() {
         echo "Could not find Dockerfile (expected it at $ARVBOX_DOCKER/Dockerfile.base)"
         exit 1
     fi
-    docker build -t arvados/arvbox-base -f "$ARVBOX_DOCKER/Dockerfile.base" "$ARVBOX_DOCKER"
+    docker build $NO_CACHE -t arvados/arvbox-base -f "$ARVBOX_DOCKER/Dockerfile.base" "$ARVBOX_DOCKER"
     if test "$1" = localdemo -o "$1" = publicdemo ; then
-        docker build -t arvados/arvbox-demo -f "$ARVBOX_DOCKER/Dockerfile.demo" "$ARVBOX_DOCKER"
+        docker build $NO_CACHE -t arvados/arvbox-demo -f "$ARVBOX_DOCKER/Dockerfile.demo" "$ARVBOX_DOCKER"
     else
-        docker build -t arvados/arvbox-dev -f "$ARVBOX_DOCKER/Dockerfile.dev" "$ARVBOX_DOCKER"
+        docker build $NO_CACHE -t arvados/arvbox-dev -f "$ARVBOX_DOCKER/Dockerfile.dev" "$ARVBOX_DOCKER"
     fi
 }
 
@@ -279,6 +283,11 @@ case "$subcmd" in
         build $@
         ;;
 
+    rebuild)
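+        # e.g. "arvbox rebuild dev" rebuilds the dev image without Docker's layer cache (illustrative usage)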
+        check $@
+        NO_CACHE=--no-cache build $@
+        ;;
+
     start|run)
         check $@
         run $@
@@ -302,7 +311,7 @@ case "$subcmd" in
         run $@
         ;;
 
-    rebuild)
+    reboot)
         check $@
         stop
         build $@
@@ -415,9 +424,8 @@ case "$subcmd" in
     *)
         echo "Arvados-in-a-box                      http://arvados.org"
         echo
-        echo "$(basename $0) (build|start|run|open|shell|ip|stop|rebuild|reset|destroy|log|svrestart)"
-        echo
-        echo "build <config>      build arvbox Docker image"
+        echo "build   <config>      build arvbox Docker image"
+        echo "rebuild <config>      build arvbox Docker image, no layer cache"
         echo "start|run <config>  start $ARVBOX_CONTAINER container"
         echo "open       open arvbox workbench in a web browser"
         echo "shell      enter arvbox shell"
@@ -426,7 +434,7 @@ case "$subcmd" in
         echo "status     print some information about current arvbox"
         echo "stop       stop arvbox container"
         echo "restart <config>  stop, then run again"
-        echo "rebuild <config>  stop, build arvbox Docker image, run"
+        echo "reboot  <config>  stop, build arvbox Docker image, run"
         echo "reset      delete arvbox arvados data (be careful!)"
         echo "destroy    delete all arvbox code and data (be careful!)"
         echo "log <service> tail log of specified service"
index 280ac6854e1540f8ff82905a29d5572953731d50..2da80685ba292d7890ddd4ca2512731c96ad5806 100644 (file)
@@ -2,7 +2,7 @@ FROM debian:8
 
 RUN apt-get update && \
     DEBIAN_FRONTEND=noninteractive apt-get -yq install \
-    postgresql-9.4 git gcc golang-go runit \
+    postgresql-9.4 git gcc runit \
     ruby rake bundler curl libpq-dev \
     libcurl4-openssl-dev libssl-dev zlib1g-dev libpcre3-dev \
     openssh-server python-setuptools netcat-traditional \
@@ -12,6 +12,13 @@ RUN apt-get update && \
     libjson-perl nginx gitolite3 lsof python-epydoc graphviz \
     apt-transport-https ca-certificates slurm-wlm
 
+RUN cd /usr/local && \
+    curl -O http://storage.googleapis.com/golang/go1.6.2.linux-amd64.tar.gz && \
+    tar -xzf go1.6.2.linux-amd64.tar.gz && \
+    rm go1.6.2.linux-amd64.tar.gz && \
+    cd bin && \
+    ln -s /usr/local/go/bin/* .
+
 VOLUME /var/lib/docker
 VOLUME /var/log/nginx
 VOLUME /etc/ssl/private
@@ -35,7 +42,7 @@ ADD fuse.conf /etc/
 ADD crunch-setup.sh gitolite.rc \
     keep-setup.sh common.sh createusers.sh \
     logger runsu.sh waitforpostgres.sh \
-    application_yml_override.py \
+    application_yml_override.py api-setup.sh \
     /usr/local/lib/arvbox/
 
 # Start the supervisor.
index 1f134159f7236f4e7d3c5aaddab13b0a7146b0fe..a04c06da44292f7643edbeaa6973371b5d05def8 100644 (file)
@@ -14,4 +14,8 @@ RUN sudo -u arvbox /var/lib/arvbox/service/api/run-service --only-deps
 RUN sudo -u arvbox /var/lib/arvbox/service/workbench/run-service --only-deps
 RUN sudo -u arvbox /var/lib/arvbox/service/doc/run-service --only-deps
 RUN sudo -u arvbox /var/lib/arvbox/service/vm/run-service --only-deps
+RUN sudo -u arvbox /var/lib/arvbox/service/keep-web/run-service --only-deps
+RUN sudo -u arvbox /var/lib/arvbox/service/keepproxy/run-service --only-deps
+RUN sudo -u arvbox /var/lib/arvbox/service/arv-git-httpd/run-service --only-deps
+RUN sudo -u arvbox /usr/local/lib/arvbox/keep-setup.sh --only-deps
 RUN sudo -u arvbox /var/lib/arvbox/service/sdk/run-service
diff --git a/tools/arvbox/lib/arvbox/docker/api-setup.sh b/tools/arvbox/lib/arvbox/docker/api-setup.sh
new file mode 100755 (executable)
index 0000000..67c43b4
--- /dev/null
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+exec 2>&1
+set -ex -o pipefail
+
+. /usr/local/lib/arvbox/common.sh
+
+cd /usr/src/arvados/services/api
+export RAILS_ENV=development
+
+set -u
+
+if ! test -s /var/lib/arvados/api_uuid_prefix ; then
+    ruby -e 'puts "#{rand(2**64).to_s(36)[0,5]}"' > /var/lib/arvados/api_uuid_prefix
+fi
+uuid_prefix=$(cat /var/lib/arvados/api_uuid_prefix)
+
+if ! test -s /var/lib/arvados/api_secret_token ; then
+    ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/api_secret_token
+fi
+secret_token=$(cat /var/lib/arvados/api_secret_token)
+
+if ! test -s /var/lib/arvados/blob_signing_key ; then
+    ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/blob_signing_key
+fi
+blob_signing_key=$(cat /var/lib/arvados/blob_signing_key)
+
+# The self-signed key is created by the SSO server setup script.
+test -s /var/lib/arvados/self-signed.key
+
+sso_app_secret=$(cat /var/lib/arvados/sso_app_secret)
+
+if test -s /var/lib/arvados/vm-uuid ; then
+    vm_uuid=$(cat /var/lib/arvados/vm-uuid)
+else
+    vm_uuid=$uuid_prefix-2x53u-$(ruby -e 'puts rand(2**400).to_s(36)[0,15]')
+    echo $vm_uuid > /var/lib/arvados/vm-uuid
+fi
+
+cat >config/application.yml <<EOF
+development:
+  uuid_prefix: $uuid_prefix
+  secret_token: $secret_token
+  blob_signing_key: $blob_signing_key
+  sso_app_secret: $sso_app_secret
+  sso_app_id: arvados-server
+  sso_provider_url: "https://$localip:${services[sso]}"
+  sso_insecure: true
+  workbench_address: "http://$localip/"
+  websocket_address: "ws://$localip:${services[websockets]}/websocket"
+  git_repo_ssh_base: "git@$localip:"
+  git_repo_https_base: "http://$localip:${services[arv-git-httpd]}/"
+  new_users_are_active: true
+  auto_admin_first_user: true
+  auto_setup_new_users: true
+  auto_setup_new_users_with_vm_uuid: $vm_uuid
+  auto_setup_new_users_with_repository: true
+  default_collection_replication: 1
+EOF
+
+(cd config && /usr/local/lib/arvbox/application_yml_override.py)
+
+if ! test -f /var/lib/arvados/api_database_pw ; then
+    ruby -e 'puts rand(2**128).to_s(36)' > /var/lib/arvados/api_database_pw
+fi
+database_pw=$(cat /var/lib/arvados/api_database_pw)
+
+if ! (psql postgres -c "\du" | grep "^ arvados ") >/dev/null ; then
+    psql postgres -c "create user arvados with password '$database_pw'"
+    psql postgres -c "ALTER USER arvados CREATEDB;"
+fi
+
+sed "s/password:.*/password: $database_pw/" <config/database.yml.example >config/database.yml
+
+if ! test -f /var/lib/arvados/api_database_setup ; then
+   bundle exec rake db:setup
+   touch /var/lib/arvados/api_database_setup
+fi
+
+if ! test -s /var/lib/arvados/superuser_token ; then
+    bundle exec ./script/create_superuser_token.rb > /var/lib/arvados/superuser_token
+fi
+
+rm -rf tmp
+
+bundle exec rake db:migrate
index 4c2de4798c2d0731d92b1db64dc00753c243925e..3733fa2ecb187b7a1daa5ce7851546faf5856603 100644 (file)
@@ -21,6 +21,7 @@ services=(
   [keepstore1]=25108
   [ssh]=22
   [doc]=8001
+  [websockets]=8002
 )
 
 if test "$(id arvbox -u 2>/dev/null)" = 0 ; then
@@ -37,13 +38,13 @@ run_bundler() {
     else
         frozen=""
     fi
-    if ! flock /var/lib/arvados/gems.lock bundle install --path $GEM_HOME --local --no-deployment $frozen "$@" ; then
-        flock /var/lib/arvados/gems.lock bundle install --path $GEM_HOME --no-deployment $frozen "$@"
+    if ! flock /var/lib/gems/gems.lock bundle install --path $GEM_HOME --local --no-deployment $frozen "$@" ; then
+        flock /var/lib/gems/gems.lock bundle install --path $GEM_HOME --no-deployment $frozen "$@"
     fi
 }
 
 pip_install() {
-    pushd /var/lib/arvados/pip
+    pushd /var/lib/pip
     for p in $(ls http*.tar.gz) ; do
         if test -f $p ; then
             ln -sf $p $(echo $p | sed 's/.*%2F\(.*\)/\1/')
@@ -56,7 +57,7 @@ pip_install() {
     done
     popd
 
-    if ! pip install --no-index --find-links /var/lib/arvados/pip $1 ; then
+    if ! pip install --no-index --find-links /var/lib/pip $1 ; then
         pip install $1
     fi
 }
index b77c9c27071021d40af771e61a12e0907a51bc98..9ef37921ec18f01b85f9b77bd91fc69f24b48415 100755 (executable)
@@ -7,7 +7,8 @@ if ! grep "^arvbox:" /etc/passwd >/dev/null 2>/dev/null ; then
     HOSTGID=$(ls -nd /usr/src/arvados | sed 's/ */ /' | cut -d' ' -f5)
     FUSEGID=$(ls -nd /dev/fuse | sed 's/ */ /' | cut -d' ' -f5)
 
-    mkdir -p /var/lib/arvados/git /var/lib/gems /var/lib/passenger
+    mkdir -p /var/lib/arvados/git /var/lib/gems \
+          /var/lib/passenger /var/lib/gopath /var/lib/pip
 
     groupadd --gid $HOSTGID --non-unique arvbox
     groupadd --gid $FUSEGID --non-unique fuse
@@ -22,7 +23,8 @@ if ! grep "^arvbox:" /etc/passwd >/dev/null 2>/dev/null ; then
 
     chown arvbox:arvbox -R /usr/local /var/lib/arvados /var/lib/gems \
           /var/lib/passenger /var/lib/postgresql \
-          /var/lib/nginx /var/log/nginx /etc/ssl/private
+          /var/lib/nginx /var/log/nginx /etc/ssl/private \
+          /var/lib/gopath /var/lib/pip
 
     mkdir -p /var/lib/gems/ruby/2.1.0
     chown arvbox:arvbox -R /var/lib/gems/ruby/2.1.0
index 178fec175945ed2095341b5400f294f71052f8a1..9efe1164c3c44a7b24d1c9b14316b34eeccc883d 100755 (executable)
@@ -5,14 +5,15 @@ set -eux -o pipefail
 
 . /usr/local/lib/arvbox/common.sh
 
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
 
 export GOPATH=$PWD
 mkdir -p "$GOPATH/src/git.curoverse.com"
 ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/crunchstat"
-install bin/crunchstat /usr/local/bin
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/crunchstat"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/sdk/go/crunchrunner"
+install bin/crunchstat bin/crunchrunner /usr/local/bin
 
 export ARVADOS_API_HOST=$localip:${services[api]}
 export ARVADOS_API_HOST_INSECURE=1
@@ -25,4 +26,8 @@ export CRUNCH_JOB_DOCKER_BIN=docker
 export HOME=/tmp/$1
 
 cd /usr/src/arvados/services/api
-exec bundle exec ./script/crunch-dispatch.rb development
+if test "$1" = "crunch0" ; then
+    exec bundle exec ./script/crunch-dispatch.rb development --jobs --pipelines
+else
+    exec bundle exec ./script/crunch-dispatch.rb development --jobs
+fi
index b66463f1c3e363c1f96913928ca00d851885e07a..f77de10ce97faa5838eb2d0348252852f247d947 100755 (executable)
@@ -6,15 +6,19 @@ set -eux -o pipefail
 
 . /usr/local/lib/arvbox/common.sh
 
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
 
 export GOPATH=$PWD
 mkdir -p "$GOPATH/src/git.curoverse.com"
 ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/keepstore"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/keepstore"
 install bin/keepstore /usr/local/bin
 
+if test "$1" = "--only-deps" ; then
+    exit
+fi
+
 mkdir -p /var/lib/arvados/$1
 
 export ARVADOS_API_HOST=$localip:${services[api]}
@@ -47,5 +51,6 @@ exec /usr/local/bin/keepstore \
      -listen=:$2 \
      -enforce-permissions=true \
      -blob-signing-key-file=/var/lib/arvados/blob_signing_key \
+     -data-manager-token-file=/var/lib/arvados/superuser_token \
      -max-buffers=20 \
      -volume=/var/lib/arvados/$1
index 058939c477723d703960b19ccb4331641b1d56eb..a36205c9678d4e67063bbec141072df1a737cea9 100755 (executable)
@@ -15,88 +15,14 @@ if test "$1" = "--only-deps" ; then
     exit
 fi
 
-set -u
-
-if ! test -s /var/lib/arvados/api_uuid_prefix ; then
-    ruby -e 'puts "#{rand(2**64).to_s(36)[0,5]}"' > /var/lib/arvados/api_uuid_prefix
-fi
-uuid_prefix=$(cat /var/lib/arvados/api_uuid_prefix)
-
-if ! test -s /var/lib/arvados/api_secret_token ; then
-    ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/api_secret_token
-fi
-secret_token=$(cat /var/lib/arvados/api_secret_token)
-
-if ! test -s /var/lib/arvados/blob_signing_key ; then
-    ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/blob_signing_key
-fi
-blob_signing_key=$(cat /var/lib/arvados/blob_signing_key)
-
-# self signed key will be created by SSO server script.
-test -s /var/lib/arvados/self-signed.key
-
-sso_app_secret=$(cat /var/lib/arvados/sso_app_secret)
-
-if test -s /var/lib/arvados/vm-uuid ; then
-    vm_uuid=$(cat /var/lib/arvados/vm-uuid)
-else
-    vm_uuid=$uuid_prefix-2x53u-$(ruby -e 'puts rand(2**400).to_s(36)[0,15]')
-    echo $vm_uuid > /var/lib/arvados/vm-uuid
-fi
-
-cat >config/application.yml <<EOF
-development:
-  uuid_prefix: $uuid_prefix
-  secret_token: $secret_token
-  blob_signing_key: $blob_signing_key
-  sso_app_secret: $sso_app_secret
-  sso_app_id: arvados-server
-  sso_provider_url: "https://$localip:${services[sso]}"
-  sso_insecure: true
-  workbench_address: "http://$localip/"
-  git_repo_ssh_base: "git@$localip:"
-  git_repo_https_base: "http://$localip:${services[arv-git-httpd]}/"
-  new_users_are_active: true
-  auto_admin_first_user: true
-  auto_setup_new_users: true
-  auto_setup_new_users_with_vm_uuid: $vm_uuid
-  auto_setup_new_users_with_repository: true
-  default_collection_replication: 1
-EOF
-
-(cd config && /usr/local/lib/arvbox/application_yml_override.py)
-
-if ! test -f /var/lib/arvados/api_database_pw ; then
-    ruby -e 'puts rand(2**128).to_s(36)' > /var/lib/arvados/api_database_pw
-fi
-database_pw=$(cat /var/lib/arvados/api_database_pw)
-
-if ! (psql postgres -c "\du" | grep "^ arvados ") >/dev/null ; then
-    psql postgres -c "create user arvados with password '$database_pw'"
-    psql postgres -c "ALTER USER arvados CREATEDB;"
-fi
-
-sed "s/password:.*/password: $database_pw/" <config/database.yml.example >config/database.yml
-
-if ! test -f /var/lib/arvados/api_database_setup ; then
-   bundle exec rake db:setup
-   touch /var/lib/arvados/api_database_setup
-fi
-
-if ! test -s /var/lib/arvados/superuser_token ; then
-    bundle exec ./script/create_superuser_token.rb > /var/lib/arvados/superuser_token
-fi
-
-rm -rf tmp
-
-bundle exec rake db:migrate
+flock /var/lib/arvados/api.lock /usr/local/lib/arvbox/api-setup.sh
 
 set +u
 if test "$1" = "--only-setup" ; then
     exit
 fi
 
-ARVADOS_WEBSOCKETS=1 exec bundle exec passenger start --port=${services[api]} \
+exec bundle exec passenger start --port=${services[api]} \
                   --runtime-dir=/var/lib/passenger \
                   --ssl --ssl-certificate=/var/lib/arvados/self-signed.pem \
                   --ssl-certificate-key=/var/lib/arvados/self-signed.key
index 854464efd0fc20f7e611f7c8f11576f0d761310c..518fe33d049a753cd9cece8b416c46dc4045e483 100755 (executable)
@@ -1,19 +1,23 @@
 #!/bin/bash
 
 exec 2>&1
-set -eux -o pipefail
+set -ex -o pipefail
 
 . /usr/local/lib/arvbox/common.sh
 
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
 
 export GOPATH=$PWD
 mkdir -p "$GOPATH/src/git.curoverse.com"
 ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/arv-git-httpd"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/arv-git-httpd"
 install bin/arv-git-httpd /usr/local/bin
 
+if test "$1" = "--only-deps" ; then
+    exit
+fi
+
 export ARVADOS_API_HOST=$localip:${services[api]}
 export ARVADOS_API_HOST_INSECURE=1
 export GITOLITE_HTTP_HOME=/var/lib/arvados/git
index 211b43885d6e49c5585ee57c359f1d29b5a55b90..c2d2cb88ebd1e269ff0c96603e2a456ca196149d 100755 (executable)
@@ -5,14 +5,14 @@ set -eux -o pipefail
 
 . /usr/local/lib/arvbox/common.sh
 
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
 
 export GOPATH=$PWD
 mkdir -p "$GOPATH/src/git.curoverse.com"
 ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/crunch-run"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/crunch-dispatch-local"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/crunch-run"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/crunch-dispatch-local"
 install bin/crunch-run bin/crunch-dispatch-local /usr/local/bin
 
 export ARVADOS_API_HOST=$localip:${services[api]}
index a2c6aa195fbed303eb7ffcf7261a10bb79090f50..fe53725228d91bc1181a2bae6a1045d188f98694 100755 (executable)
@@ -1,19 +1,23 @@
 #!/bin/bash
 
 exec 2>&1
-set -eux -o pipefail
+set -ex -o pipefail
 
 . /usr/local/lib/arvbox/common.sh
 
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
 
 export GOPATH=$PWD
 mkdir -p "$GOPATH/src/git.curoverse.com"
 ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/keep-web"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/keep-web"
 install bin/keep-web /usr/local/bin
 
+if test "$1" = "--only-deps" ; then
+    exit
+fi
+
 export ARVADOS_API_HOST=$localip:${services[api]}
 export ARVADOS_API_HOST_INSECURE=1
 export ARVADOS_API_TOKEN=$(cat /var/lib/arvados/superuser_token)
index 413a67ed5640907f1b1809b497328f42c129e7cb..00b2e01f8305a4d21c17a48cfe79ba7fa31049c8 100755 (executable)
@@ -2,19 +2,23 @@
 
 exec 2>&1
 sleep 2
-set -eux -o pipefail
+set -ex -o pipefail
 
 . /usr/local/lib/arvbox/common.sh
 
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
 
 export GOPATH=$PWD
 mkdir -p "$GOPATH/src/git.curoverse.com"
 ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/keepproxy"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/keepproxy"
 install bin/keepproxy /usr/local/bin
 
+if test "$1" = "--only-deps" ; then
+    exit
+fi
+
 export ARVADOS_API_HOST=$localip:${services[api]}
 export ARVADOS_API_HOST_INSECURE=1
 export ARVADOS_API_TOKEN=$(cat /var/lib/arvados/superuser_token)
index 3ee6f2a04265103f4dcf14fcc33742d26e636b22..29452ab9943c7853da919fa130de0b5690e249ba 100755 (executable)
@@ -5,10 +5,10 @@ set -eux -o pipefail
 
 . /usr/local/lib/arvbox/common.sh
 
-mkdir -p ~/.pip /var/lib/arvados/pip
+mkdir -p ~/.pip /var/lib/pip
 cat > ~/.pip/pip.conf <<EOF
 [global]
-download_cache = /var/lib/arvados/pip
+download_cache = /var/lib/pip
 EOF
 
 cd /usr/src/arvados/sdk/cli
diff --git a/tools/arvbox/lib/arvbox/docker/service/websockets/log/main/.gitstub b/tools/arvbox/lib/arvbox/docker/service/websockets/log/main/.gitstub
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/tools/arvbox/lib/arvbox/docker/service/websockets/log/run b/tools/arvbox/lib/arvbox/docker/service/websockets/log/run
new file mode 120000 (symlink)
index 0000000..d6aef4a
--- /dev/null
@@ -0,0 +1 @@
+/usr/local/lib/arvbox/logger
\ No newline at end of file
diff --git a/tools/arvbox/lib/arvbox/docker/service/websockets/run b/tools/arvbox/lib/arvbox/docker/service/websockets/run
new file mode 120000 (symlink)
index 0000000..a388c8b
--- /dev/null
@@ -0,0 +1 @@
+/usr/local/lib/arvbox/runsu.sh
\ No newline at end of file
diff --git a/tools/arvbox/lib/arvbox/docker/service/websockets/run-service b/tools/arvbox/lib/arvbox/docker/service/websockets/run-service
new file mode 100755 (executable)
index 0000000..d0c0b5d
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+exec 2>&1
+set -ex -o pipefail
+
+. /usr/local/lib/arvbox/common.sh
+
+cd /usr/src/arvados/services/api
+export RAILS_ENV=development
+
+run_bundler --without=development
+
+if test "$1" = "--only-deps" ; then
+    exit
+fi
+
+flock /var/lib/arvados/api.lock /usr/local/lib/arvbox/api-setup.sh
+
+set +u
+if test "$1" = "--only-setup" ; then
+    exit
+fi
+
+export ARVADOS_WEBSOCKETS=ws-only
+
+# Serving SSL directly doesn't work; puma fails with:
+# Rack app error: #<TypeError: no implicit conversion of Puma::MiniSSL::Socket into Integer>
+#exec bundle exec puma -b "ssl://0.0.0.0:${services[websockets]}?cert=/var/lib/arvados/self-signed.pem&key=/var/lib/arvados/self-signed.key"
+
+exec bundle exec puma -p${services[websockets]}
index 2b6ebce16dc55ff5ad63af14d0924c4cd245a545..f6b5b586ef8217775ed180ea5c51fd5ee501b821 100644 (file)
@@ -3,6 +3,7 @@ from __future__ import print_function
 import arvados
 import Queue
 import threading
+import _strptime
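+# Imported up front, presumably to avoid the lazy-import race in datetime.strptime
+# when it is first called from a worker thread.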
 
 from crunchstat_summary import logger
 
index 2ac12abcba23e381073589cf209915b88a9d8cef..a88e4d5c41f06f017863965d43a9b60818426130 100644 (file)
@@ -11,6 +11,7 @@ import math
 import re
 import sys
 import threading
+import _strptime
 
 from arvados.api import OrderedJsonModel
 from crunchstat_summary import logger
diff --git a/tools/keep-block-check/.gitignore b/tools/keep-block-check/.gitignore
new file mode 100644 (file)
index 0000000..97eb5da
--- /dev/null
@@ -0,0 +1 @@
+keep-block-check
diff --git a/tools/keep-block-check/keep-block-check.go b/tools/keep-block-check/keep-block-check.go
new file mode 100644 (file)
index 0000000..6cf11a7
--- /dev/null
@@ -0,0 +1,239 @@
+package main
+
+import (
+       "crypto/tls"
+       "errors"
+       "flag"
+       "fmt"
+       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/keepclient"
+       "io/ioutil"
+       "log"
+       "net/http"
+       "os"
+       "regexp"
+       "strings"
+       "time"
+)
+
+func main() {
+       err := doMain(os.Args[1:])
+       if err != nil {
+               log.Fatalf("%v", err)
+       }
+}
+
+func doMain(args []string) error {
+       flags := flag.NewFlagSet("keep-block-check", flag.ExitOnError)
+
+       configFile := flags.String(
+               "config",
+               "",
+               "Configuration filename. May be either a pathname to a config file, or (for example) 'foo' as shorthand for $HOME/.config/arvados/foo.conf file. This file is expected to specify the values for ARVADOS_API_TOKEN, ARVADOS_API_HOST, ARVADOS_API_HOST_INSECURE, and ARVADOS_BLOB_SIGNING_KEY for the source.")
+
+       keepServicesJSON := flags.String(
+               "keep-services-json",
+               "",
+               "An optional list of available keepservices. "+
+                       "If not provided, this list is obtained from api server configured in config-file.")
+
+       locatorFile := flags.String(
+               "block-hash-file",
+               "",
+               "Filename containing the block hashes to be checked. This is required. "+
+                       "This file contains the block hashes one per line.")
+
+       prefix := flags.String(
+               "prefix",
+               "",
+               "Block hash prefix. When a prefix is specified, only hashes listed in the file with this prefix will be checked.")
+
+       blobSignatureTTLFlag := flags.Duration(
+               "blob-signature-ttl",
+               0,
+               "Lifetime of blob permission signatures on the keepservers. If not provided, this will be retrieved from the API server's discovery document.")
+
+       verbose := flags.Bool(
+               "v",
+               false,
+               "Log progress of each block verification")
+
+       // Parse the flag arguments (the caller has already stripped the command name)
+       flags.Parse(args)
+
+       config, blobSigningKey, err := loadConfig(*configFile)
+       if err != nil {
+               return fmt.Errorf("Error loading configuration from file: %s", err.Error())
+       }
+
+       // get list of block locators to be checked
+       blockLocators, err := getBlockLocators(*locatorFile, *prefix)
+       if err != nil {
+               return fmt.Errorf("Error reading block hashes to be checked from file: %s", err.Error())
+       }
+
+       // setup keepclient
+       kc, blobSignatureTTL, err := setupKeepClient(config, *keepServicesJSON, *blobSignatureTTLFlag)
+       if err != nil {
+               return fmt.Errorf("Error configuring keepclient: %s", err.Error())
+       }
+
+       return performKeepBlockCheck(kc, blobSignatureTTL, blobSigningKey, blockLocators, *verbose)
+}
+
+type apiConfig struct {
+       APIToken        string
+       APIHost         string
+       APIHostInsecure bool
+       ExternalClient  bool
+}
+
+// Load config from given file
+func loadConfig(configFile string) (config apiConfig, blobSigningKey string, err error) {
+       if configFile == "" {
+               err = errors.New("Client config file not specified")
+               return
+       }
+
+       config, blobSigningKey, err = readConfigFromFile(configFile)
+       return
+}
+
+var matchTrue = regexp.MustCompile("^(?i:1|yes|true)$")
+
+// Read config from file
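+// The file is parsed as simple KEY=VALUE lines; for example (illustrative values):
+//   ARVADOS_API_HOST=zzzzz.arvadosapi.com
+//   ARVADOS_API_TOKEN=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+//   ARVADOS_API_HOST_INSECURE=false
+//   ARVADOS_BLOB_SIGNING_KEY=abcdefg
+// Unrecognized lines are ignored.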
+func readConfigFromFile(filename string) (config apiConfig, blobSigningKey string, err error) {
+       if !strings.Contains(filename, "/") {
+               filename = os.Getenv("HOME") + "/.config/arvados/" + filename + ".conf"
+       }
+
+       content, err := ioutil.ReadFile(filename)
+
+       if err != nil {
+               return
+       }
+
+       lines := strings.Split(string(content), "\n")
+       for _, line := range lines {
+               if line == "" {
+                       continue
+               }
+
+               kv := strings.SplitN(line, "=", 2)
+               if len(kv) == 2 {
+                       key := strings.TrimSpace(kv[0])
+                       value := strings.TrimSpace(kv[1])
+
+                       switch key {
+                       case "ARVADOS_API_TOKEN":
+                               config.APIToken = value
+                       case "ARVADOS_API_HOST":
+                               config.APIHost = value
+                       case "ARVADOS_API_HOST_INSECURE":
+                               config.APIHostInsecure = matchTrue.MatchString(value)
+                       case "ARVADOS_EXTERNAL_CLIENT":
+                               config.ExternalClient = matchTrue.MatchString(value)
+                       case "ARVADOS_BLOB_SIGNING_KEY":
+                               blobSigningKey = value
+                       }
+               }
+       }
+
+       return
+}
+
+// setup keepclient using the config provided
+func setupKeepClient(config apiConfig, keepServicesJSON string, blobSignatureTTL time.Duration) (kc *keepclient.KeepClient, ttl time.Duration, err error) {
+       arv := arvadosclient.ArvadosClient{
+               ApiToken:    config.APIToken,
+               ApiServer:   config.APIHost,
+               ApiInsecure: config.APIHostInsecure,
+               Client: &http.Client{Transport: &http.Transport{
+                       TLSClientConfig: &tls.Config{InsecureSkipVerify: config.APIHostInsecure}}},
+               External: config.ExternalClient,
+       }
+
+       // if keepServicesJSON is provided, use it to load services; else, use DiscoverKeepServers
+       if keepServicesJSON == "" {
+               kc, err = keepclient.MakeKeepClient(&arv)
+               if err != nil {
+                       return
+               }
+       } else {
+               kc = keepclient.New(&arv)
+               err = kc.LoadKeepServicesFromJSON(keepServicesJSON)
+               if err != nil {
+                       return
+               }
+       }
+
+       // If blobSignatureTTL was not provided, fetch blobSignatureTtl from the API discovery document
+       ttl = blobSignatureTTL
+       if blobSignatureTTL == 0 {
+               value, err := arv.Discovery("blobSignatureTtl")
+               if err == nil {
+                       ttl = time.Duration(int(value.(float64))) * time.Second
+               } else {
+                       return nil, 0, err
+               }
+       }
+
+       return
+}
+
+// Get list of unique block locators from the given file
+func getBlockLocators(locatorFile, prefix string) (locators []string, err error) {
+       if locatorFile == "" {
+               err = errors.New("block-hash-file not specified")
+               return
+       }
+
+       content, err := ioutil.ReadFile(locatorFile)
+       if err != nil {
+               return
+       }
+
+       locatorMap := make(map[string]bool)
+       for _, line := range strings.Split(string(content), "\n") {
+               line = strings.TrimSpace(line)
+               if line == "" || !strings.HasPrefix(line, prefix) || locatorMap[line] {
+                       continue
+               }
+               locators = append(locators, line)
+               locatorMap[line] = true
+       }
+
+       return
+}
+
+// Request the header of each block from Keep, logging any errors. Returns an
+// error if any block could not be verified.
+func performKeepBlockCheck(kc *keepclient.KeepClient, blobSignatureTTL time.Duration, blobSigningKey string, blockLocators []string, verbose bool) error {
+       totalBlocks := len(blockLocators)
+       notFoundBlocks := 0
+       current := 0
+       for _, locator := range blockLocators {
+               current++
+               if verbose {
+                       log.Printf("Verifying block %d of %d: %v", current, totalBlocks, locator)
+               }
+               getLocator := locator
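+               // When a blob signing key is supplied, sign the locator locally (with a
+               // one-day expiry) so permission-enforcing keepstores will accept the request.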
+               if blobSigningKey != "" {
+                       expiresAt := time.Now().AddDate(0, 0, 1)
+                       getLocator = keepclient.SignLocator(locator, kc.Arvados.ApiToken, expiresAt, blobSignatureTTL, []byte(blobSigningKey))
+               }
+
+               _, _, err := kc.Ask(getLocator)
+               if err != nil {
+                       notFoundBlocks++
+                       log.Printf("Error verifying block %v: %v", locator, err)
+               }
+       }
+
+       log.Printf("Verify block totals: %d attempts, %d successes, %d errors", totalBlocks, totalBlocks-notFoundBlocks, notFoundBlocks)
+
+       if notFoundBlocks > 0 {
+               return fmt.Errorf("Block verification failed for %d out of %d blocks with matching prefix.", notFoundBlocks, totalBlocks)
+       }
+
+       return nil
+}
diff --git a/tools/keep-block-check/keep-block-check_test.go b/tools/keep-block-check/keep-block-check_test.go
new file mode 100644 (file)
index 0000000..e49fe68
--- /dev/null
@@ -0,0 +1,352 @@
+package main
+
+import (
+       "bytes"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "log"
+       "os"
+       "regexp"
+       "strings"
+       "testing"
+       "time"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvadostest"
+       "git.curoverse.com/arvados.git/sdk/go/keepclient"
+
+       . "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate
+func Test(t *testing.T) {
+       TestingT(t)
+}
+
+// Gocheck boilerplate
+var _ = Suite(&ServerRequiredSuite{})
+var _ = Suite(&DoMainTestSuite{})
+
+type ServerRequiredSuite struct{}
+type DoMainTestSuite struct{}
+
+var kc *keepclient.KeepClient
+var logBuffer bytes.Buffer
+
+var TestHash = "aaaa09c290d0fb1ca068ffaddf22cbd0"
+var TestHash2 = "aaaac516f788aec4f30932ffb6395c39"
+
+var blobSignatureTTL = time.Duration(2*7*24) * time.Hour
+
+func (s *ServerRequiredSuite) SetUpSuite(c *C) {
+       arvadostest.StartAPI()
+}
+
+func (s *ServerRequiredSuite) TearDownSuite(c *C) {
+       arvadostest.StopAPI()
+       arvadostest.ResetEnv()
+}
+
+func (s *ServerRequiredSuite) SetUpTest(c *C) {
+       logOutput := io.MultiWriter(&logBuffer)
+       log.SetOutput(logOutput)
+}
+
+func (s *ServerRequiredSuite) TearDownTest(c *C) {
+       arvadostest.StopKeep(2)
+       log.SetOutput(os.Stdout)
+       log.Printf("%v", logBuffer.String())
+}
+
+func (s *DoMainTestSuite) SetUpSuite(c *C) {
+}
+
+func (s *DoMainTestSuite) SetUpTest(c *C) {
+       logOutput := io.MultiWriter(&logBuffer)
+       log.SetOutput(logOutput)
+}
+
+func (s *DoMainTestSuite) TearDownTest(c *C) {
+       log.SetOutput(os.Stdout)
+       log.Printf("%v", logBuffer.String())
+}
+
+func setupKeepBlockCheck(c *C, enforcePermissions bool, keepServicesJSON string) {
+       setupKeepBlockCheckWithTTL(c, enforcePermissions, keepServicesJSON, blobSignatureTTL)
+}
+
+func setupKeepBlockCheckWithTTL(c *C, enforcePermissions bool, keepServicesJSON string, ttl time.Duration) {
+       var config apiConfig
+       config.APIHost = os.Getenv("ARVADOS_API_HOST")
+       config.APIToken = arvadostest.DataManagerToken
+       config.APIHostInsecure = matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE"))
+
+       // Start Keep servers
+       arvadostest.StartKeep(2, enforcePermissions)
+
+       // setup keepclients
+       var err error
+       kc, ttl, err = setupKeepClient(config, keepServicesJSON, ttl)
+       c.Assert(ttl, Equals, blobSignatureTTL)
+       c.Check(err, IsNil)
+}
+
+// Setup test data
+func setupTestData(c *C) []string {
+       allLocators := []string{}
+
+       // Put a few blocks
+       for i := 0; i < 5; i++ {
+               hash, _, err := kc.PutB([]byte(fmt.Sprintf("keep-block-check-test-data-%d", i)))
+               c.Check(err, IsNil)
+               allLocators = append(allLocators, strings.Split(hash, "+A")[0])
+       }
+
+       return allLocators
+}
+
+func setupConfigFile(c *C, fileName string) string {
+       // Setup a config file
+       file, err := ioutil.TempFile(os.TempDir(), fileName)
+       c.Check(err, IsNil)
+
+       // Write the config to the file, adding extra whitespace and a bogus line to exercise the parser
+       fileContent := "ARVADOS_API_HOST=" + os.Getenv("ARVADOS_API_HOST") + "\n"
+       fileContent += "ARVADOS_API_TOKEN=" + arvadostest.DataManagerToken + "\n"
+       fileContent += "\n"
+       fileContent += "ARVADOS_API_HOST_INSECURE=" + os.Getenv("ARVADOS_API_HOST_INSECURE") + "\n"
+       fileContent += " ARVADOS_EXTERNAL_CLIENT = false \n"
+       fileContent += " NotANameValuePairAndShouldGetIgnored \n"
+       fileContent += "ARVADOS_BLOB_SIGNING_KEY=abcdefg\n"
+
+       _, err = file.Write([]byte(fileContent))
+       c.Check(err, IsNil)
+
+       return file.Name()
+}
+
+func setupBlockHashFile(c *C, name string, blocks []string) string {
+       // Setup a block hash file
+       file, err := ioutil.TempFile(os.TempDir(), name)
+       c.Check(err, IsNil)
+
+       // Write the hashes to the file, padded with extra whitespace to exercise the parser
+       fileContent := ""
+       for _, hash := range blocks {
+               fileContent += fmt.Sprintf(" %s \n", hash)
+       }
+       fileContent += "\n"
+       _, err = file.Write([]byte(fileContent))
+       c.Check(err, IsNil)
+
+       return file.Name()
+}
+
+func checkErrorLog(c *C, blocks []string, prefix, suffix string) {
+       for _, hash := range blocks {
+               expected := prefix + `.*` + hash + `.*` + suffix
+               match, _ := regexp.MatchString(expected, logBuffer.String())
+               c.Assert(match, Equals, true)
+       }
+}
+
+func checkNoErrorsLogged(c *C, prefix, suffix string) {
+       expected := prefix + `.*` + suffix
+       match, _ := regexp.MatchString(expected, logBuffer.String())
+       c.Assert(match, Equals, false)
+}
+
+func (s *ServerRequiredSuite) TestBlockCheck(c *C) {
+       setupKeepBlockCheck(c, false, "")
+       allLocators := setupTestData(c)
+       err := performKeepBlockCheck(kc, blobSignatureTTL, "", allLocators, true)
+       c.Check(err, IsNil)
+       checkNoErrorsLogged(c, "Error verifying block", "Block not found")
+}
+
+func (s *ServerRequiredSuite) TestBlockCheckWithBlobSigning(c *C) {
+       setupKeepBlockCheck(c, true, "")
+       allLocators := setupTestData(c)
+       err := performKeepBlockCheck(kc, blobSignatureTTL, arvadostest.BlobSigningKey, allLocators, true)
+       c.Check(err, IsNil)
+       checkNoErrorsLogged(c, "Error verifying block", "Block not found")
+}
+
+func (s *ServerRequiredSuite) TestBlockCheckWithBlobSigningAndTTLFromDiscovery(c *C) {
+       setupKeepBlockCheckWithTTL(c, true, "", 0)
+       allLocators := setupTestData(c)
+       err := performKeepBlockCheck(kc, blobSignatureTTL, arvadostest.BlobSigningKey, allLocators, true)
+       c.Check(err, IsNil)
+       checkNoErrorsLogged(c, "Error verifying block", "Block not found")
+}
+
+func (s *ServerRequiredSuite) TestBlockCheck_NoSuchBlock(c *C) {
+       setupKeepBlockCheck(c, false, "")
+       allLocators := setupTestData(c)
+       allLocators = append(allLocators, TestHash)
+       allLocators = append(allLocators, TestHash2)
+       err := performKeepBlockCheck(kc, blobSignatureTTL, "", allLocators, true)
+       c.Check(err, NotNil)
+       c.Assert(err.Error(), Equals, "Block verification failed for 2 out of 7 blocks with matching prefix.")
+       checkErrorLog(c, []string{TestHash, TestHash2}, "Error verifying block", "Block not found")
+}
+
+func (s *ServerRequiredSuite) TestBlockCheck_NoSuchBlock_WithMatchingPrefix(c *C) {
+       setupKeepBlockCheck(c, false, "")
+       allLocators := setupTestData(c)
+       allLocators = append(allLocators, TestHash)
+       allLocators = append(allLocators, TestHash2)
+       locatorFile := setupBlockHashFile(c, "block-hash", allLocators)
+       defer os.Remove(locatorFile)
+       locators, err := getBlockLocators(locatorFile, "aaa")
+       c.Check(err, IsNil)
+       err = performKeepBlockCheck(kc, blobSignatureTTL, "", locators, true)
+       c.Check(err, NotNil)
+       // Of the 7 blocks in allLocators, only two match the prefix and hence only those are checked
+       c.Assert(err.Error(), Equals, "Block verification failed for 2 out of 2 blocks with matching prefix.")
+       checkErrorLog(c, []string{TestHash, TestHash2}, "Error verifying block", "Block not found")
+}
+
+func (s *ServerRequiredSuite) TestBlockCheck_NoSuchBlock_WithPrefixMismatch(c *C) {
+       setupKeepBlockCheck(c, false, "")
+       allLocators := setupTestData(c)
+       allLocators = append(allLocators, TestHash)
+       allLocators = append(allLocators, TestHash2)
+       locatorFile := setupBlockHashFile(c, "block-hash", allLocators)
+       defer os.Remove(locatorFile)
+       locators, err := getBlockLocators(locatorFile, "999")
+       c.Check(err, IsNil)
+       err = performKeepBlockCheck(kc, blobSignatureTTL, "", locators, true)
+       c.Check(err, IsNil) // there were no matching locators in the file, so nothing was checked
+}
+
+func (s *ServerRequiredSuite) TestBlockCheck_BadSignature(c *C) {
+       setupKeepBlockCheck(c, true, "")
+       setupTestData(c)
+       err := performKeepBlockCheck(kc, blobSignatureTTL, "badblobsigningkey", []string{TestHash, TestHash2}, false)
+       c.Assert(err.Error(), Equals, "Block verification failed for 2 out of 2 blocks with matching prefix.")
+       checkErrorLog(c, []string{TestHash, TestHash2}, "Error verifying block", "HTTP 403")
+       // verbose logging not requested
+       c.Assert(strings.Contains(logBuffer.String(), "Verifying block 1 of 2"), Equals, false)
+}
+
+var testKeepServicesJSON = `{
+  "kind":"arvados#keepServiceList",
+  "etag":"",
+  "self_link":"",
+  "offset":null, "limit":null,
+  "items":[
+    {"href":"/keep_services/zzzzz-bi6l4-123456789012340",
+     "kind":"arvados#keepService",
+     "uuid":"zzzzz-bi6l4-123456789012340",
+     "service_host":"keep0.zzzzz.arvadosapi.com",
+     "service_port":25107,
+     "service_ssl_flag":false,
+     "service_type":"disk",
+     "read_only":false },
+    {"href":"/keep_services/zzzzz-bi6l4-123456789012341",
+     "kind":"arvados#keepService",
+     "uuid":"zzzzz-bi6l4-123456789012341",
+     "service_host":"keep0.zzzzz.arvadosapi.com",
+     "service_port":25108,
+     "service_ssl_flag":false,
+     "service_type":"disk",
+     "read_only":false }
+    ],
+  "items_available":2 }`
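+
+// The fixture above lists two fake "disk" keep services. It can be handed to a
+// keepclient the same way the command does it; a sketch using the keepclient
+// calls exercised elsewhere in this change:
+//
+//     kc = keepclient.New(&arv)
+//     err = kc.LoadKeepServicesFromJSON(testKeepServicesJSON)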
+
+// Set up block-check using keepServicesJSON with fake keepservers.
+// Expect error during performKeepBlockCheck due to unreachable keepservers.
+func (s *ServerRequiredSuite) TestErrorDuringKeepBlockCheck_FakeKeepservers(c *C) {
+       setupKeepBlockCheck(c, false, testKeepServicesJSON)
+       err := performKeepBlockCheck(kc, blobSignatureTTL, "", []string{TestHash, TestHash2}, true)
+       c.Assert(err.Error(), Equals, "Block verification failed for 2 out of 2 blocks with matching prefix.")
+       checkErrorLog(c, []string{TestHash, TestHash2}, "Error verifying block", "")
+}
+
+// Test keep-block-check initialization with keepServicesJSON
+func (s *ServerRequiredSuite) TestKeepBlockCheck_InitializeWithKeepServicesJSON(c *C) {
+       setupKeepBlockCheck(c, false, testKeepServicesJSON)
+       found := 0
+       for k := range kc.LocalRoots() {
+               if k == "zzzzz-bi6l4-123456789012340" || k == "zzzzz-bi6l4-123456789012341" {
+                       found++
+               }
+       }
+       c.Check(found, Equals, 2)
+}
+
+// Test the loadConfig function
+func (s *ServerRequiredSuite) TestLoadConfig(c *C) {
+       // Set up a config file
+       configFile := setupConfigFile(c, "config")
+       defer os.Remove(configFile)
+
+       // load configuration from the file
+       config, blobSigningKey, err := loadConfig(configFile)
+       c.Check(err, IsNil)
+
+       c.Assert(config.APIHost, Equals, os.Getenv("ARVADOS_API_HOST"))
+       c.Assert(config.APIToken, Equals, arvadostest.DataManagerToken)
+       c.Assert(config.APIHostInsecure, Equals, matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE")))
+       c.Assert(config.ExternalClient, Equals, false)
+       c.Assert(blobSigningKey, Equals, "abcdefg")
+}
+
+func (s *DoMainTestSuite) Test_doMain_WithNoConfig(c *C) {
+       args := []string{"-prefix", "a"}
+       err := doMain(args)
+       c.Check(err, NotNil)
+       c.Assert(strings.Contains(err.Error(), "config file not specified"), Equals, true)
+}
+
+func (s *DoMainTestSuite) Test_doMain_WithNoSuchConfigFile(c *C) {
+       args := []string{"-config", "no-such-file"}
+       err := doMain(args)
+       c.Check(err, NotNil)
+       c.Assert(strings.Contains(err.Error(), "no such file or directory"), Equals, true)
+}
+
+func (s *DoMainTestSuite) Test_doMain_WithNoBlockHashFile(c *C) {
+       config := setupConfigFile(c, "config")
+       defer os.Remove(config)
+
+       // Start keepservers.
+       arvadostest.StartKeep(2, false)
+       defer arvadostest.StopKeep(2)
+
+       args := []string{"-config", config}
+       err := doMain(args)
+       c.Assert(strings.Contains(err.Error(), "block-hash-file not specified"), Equals, true)
+}
+
+func (s *DoMainTestSuite) Test_doMain_WithNoSuchBlockHashFile(c *C) {
+       config := setupConfigFile(c, "config")
+       defer os.Remove(config)
+
+       arvadostest.StartKeep(2, false)
+       defer arvadostest.StopKeep(2)
+
+       args := []string{"-config", config, "-block-hash-file", "no-such-file"}
+       err := doMain(args)
+       c.Assert(strings.Contains(err.Error(), "no such file or directory"), Equals, true)
+}
+
+func (s *DoMainTestSuite) Test_doMain(c *C) {
+       // Start keepservers.
+       arvadostest.StartKeep(2, false)
+       defer arvadostest.StopKeep(2)
+
+       config := setupConfigFile(c, "config")
+       defer os.Remove(config)
+
+       locatorFile := setupBlockHashFile(c, "block-hash", []string{TestHash, TestHash2})
+       defer os.Remove(locatorFile)
+
+       args := []string{"-config", config, "-block-hash-file", locatorFile, "-v"}
+       err := doMain(args)
+       c.Check(err, NotNil)
+       c.Assert(err.Error(), Equals, "Block verification failed for 2 out of 2 blocks with matching prefix.")
+       checkErrorLog(c, []string{TestHash, TestHash2}, "Error verifying block", "Block not found")
+       c.Assert(strings.Contains(logBuffer.String(), "Verifying block 1 of 2"), Equals, true)
+}
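+
+// For reference, the flags exercised by the doMain tests above correspond to a
+// command-line invocation roughly like the following (file paths are
+// placeholders, not real files):
+//
+//     keep-block-check -config /path/to/config \
+//         -block-hash-file /path/to/block-hashes -prefix a -v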
index 820772eb5b6a040d15683549b1e617c5d9e718e0..c6e7665caa2a312c327b8a603159a7da07941450 100644 (file)
@@ -60,6 +60,11 @@ func doMain() error {
                "",
                "Index prefix")
 
+       srcBlobSignatureTTLFlag := flags.Duration(
+               "src-blob-signature-ttl",
+               0,
+               "Lifetime of blob permission signatures on source keepservers. If not provided, this will be retrieved from the API server's discovery document.")
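+       // (a flag.Duration, so the command line accepts Go duration syntax,
+       // e.g. "-src-blob-signature-ttl 336h"; the zero default means the TTL
+       // is looked up in the discovery document instead)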
+
        // Parse args; omit the first arg which is the command name
        flags.Parse(os.Args[1:])
 
@@ -74,18 +79,18 @@ func doMain() error {
        }
 
        // setup src and dst keepclients
-       kcSrc, err := setupKeepClient(srcConfig, *srcKeepServicesJSON, false, 0)
+       kcSrc, srcBlobSignatureTTL, err := setupKeepClient(srcConfig, *srcKeepServicesJSON, false, 0, *srcBlobSignatureTTLFlag)
        if err != nil {
                return fmt.Errorf("Error configuring src keepclient: %s", err.Error())
        }
 
-       kcDst, err := setupKeepClient(dstConfig, *dstKeepServicesJSON, true, *replications)
+       kcDst, _, err := setupKeepClient(dstConfig, *dstKeepServicesJSON, true, *replications, 0)
        if err != nil {
                return fmt.Errorf("Error configuring dst keepclient: %s", err.Error())
        }
 
        // Copy blocks not found in dst from src
-       err = performKeepRsync(kcSrc, kcDst, srcBlobSigningKey, *prefix)
+       err = performKeepRsync(kcSrc, kcDst, srcBlobSignatureTTL, srcBlobSigningKey, *prefix)
        if err != nil {
                return fmt.Errorf("Error while syncing data: %s", err.Error())
        }
@@ -155,7 +160,7 @@ func readConfigFromFile(filename string) (config apiConfig, blobSigningKey strin
 }
 
 // setup keepclient using the config provided
-func setupKeepClient(config apiConfig, keepServicesJSON string, isDst bool, replications int) (kc *keepclient.KeepClient, err error) {
+func setupKeepClient(config apiConfig, keepServicesJSON string, isDst bool, replications int, srcBlobSignatureTTL time.Duration) (kc *keepclient.KeepClient, blobSignatureTTL time.Duration, err error) {
        arv := arvadosclient.ArvadosClient{
                ApiToken:    config.APIToken,
                ApiServer:   config.APIHost,
@@ -169,13 +174,13 @@ func setupKeepClient(config apiConfig, keepServicesJSON string, isDst bool, repl
        if keepServicesJSON == "" {
                kc, err = keepclient.MakeKeepClient(&arv)
                if err != nil {
-                       return nil, err
+                       return nil, 0, err
                }
        } else {
                kc = keepclient.New(&arv)
                err = kc.LoadKeepServicesFromJSON(keepServicesJSON)
                if err != nil {
-                       return kc, err
+                       return kc, 0, err
                }
        }
 
@@ -186,19 +191,30 @@ func setupKeepClient(config apiConfig, keepServicesJSON string, isDst bool, repl
                        if err == nil {
                                replications = int(value.(float64))
                        } else {
-                               return nil, err
+                               return nil, 0, err
                        }
                }
 
                kc.Want_replicas = replications
        }
 
-       return kc, nil
+       // If srcBlobSignatureTTL is not provided, get it from the API server's discovery document
+       blobSignatureTTL = srcBlobSignatureTTL
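+       // (blobSignatureTtl in the discovery document is a number of seconds;
+       // convert it to a time.Duration)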
+       if !isDst && srcBlobSignatureTTL == 0 {
+               value, err := arv.Discovery("blobSignatureTtl")
+               if err == nil {
+                       blobSignatureTTL = time.Duration(int(value.(float64))) * time.Second
+               } else {
+                       return nil, 0, err
+               }
+       }
+
+       return kc, blobSignatureTTL, nil
 }
 
 // Get unique block locators from src and dst
 // Copy any blocks missing in dst
-func performKeepRsync(kcSrc, kcDst *keepclient.KeepClient, blobSigningKey, prefix string) error {
+func performKeepRsync(kcSrc, kcDst *keepclient.KeepClient, srcBlobSignatureTTL time.Duration, blobSigningKey, prefix string) error {
        // Get unique locators from src
        srcIndex, err := getUniqueLocators(kcSrc, prefix)
        if err != nil {
@@ -218,7 +234,7 @@ func performKeepRsync(kcSrc, kcDst *keepclient.KeepClient, blobSigningKey, prefi
        log.Printf("Before keep-rsync, there are %d blocks in src and %d blocks in dst. Start copying %d blocks from src not found in dst.",
                len(srcIndex), len(dstIndex), len(toBeCopied))
 
-       err = copyBlocksToDst(toBeCopied, kcSrc, kcDst, blobSigningKey)
+       err = copyBlocksToDst(toBeCopied, kcSrc, kcDst, srcBlobSignatureTTL, blobSigningKey)
 
        return err
 }
@@ -254,7 +270,7 @@ func getMissingLocators(srcLocators, dstLocators map[string]bool) []string {
 }
 
 // Copy blocks from src to dst; only those that are missing in dst are copied
-func copyBlocksToDst(toBeCopied []string, kcSrc, kcDst *keepclient.KeepClient, blobSigningKey string) error {
+func copyBlocksToDst(toBeCopied []string, kcSrc, kcDst *keepclient.KeepClient, srcBlobSignatureTTL time.Duration, blobSigningKey string) error {
        total := len(toBeCopied)
 
        startedAt := time.Now()
@@ -271,7 +287,7 @@ func copyBlocksToDst(toBeCopied []string, kcSrc, kcDst *keepclient.KeepClient, b
                getLocator := locator
                expiresAt := time.Now().AddDate(0, 0, 1)
                if blobSigningKey != "" {
-                       getLocator = keepclient.SignLocator(getLocator, kcSrc.Arvados.ApiToken, expiresAt, []byte(blobSigningKey))
+                       getLocator = keepclient.SignLocator(getLocator, kcSrc.Arvados.ApiToken, expiresAt, srcBlobSignatureTTL, []byte(blobSigningKey))
                }
 
                reader, len, _, err := kcSrc.Get(getLocator)
index 94281fa8bcbb89f5432614adf715376ad666beab..09609eb7498bb8dc28d95bc41892f4cca9ec8563 100644 (file)
@@ -49,6 +49,7 @@ func (s *DoMainTestSuite) SetUpSuite(c *C) {
 
 var kcSrc, kcDst *keepclient.KeepClient
 var srcKeepServicesJSON, dstKeepServicesJSON, blobSigningKey string
+var blobSignatureTTL = time.Duration(2*7*24) * time.Hour // two weeks, the blobSignatureTtl reported by the test API server
 
 func (s *ServerRequiredSuite) SetUpTest(c *C) {
        // reset all variables between tests
@@ -91,7 +92,7 @@ func setupRsync(c *C, enforcePermissions bool, replications int) {
        dstConfig.APIHostInsecure = matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE"))
 
        if enforcePermissions {
-               blobSigningKey = "zfhgfenhffzltr9dixws36j1yhksjoll2grmku38mi7yxd66h5j4q9w4jzanezacp8s6q0ro3hxakfye02152hncy6zml2ed0uc"
+               blobSigningKey = arvadostest.BlobSigningKey
        }
 
        // Start Keep servers
@@ -99,10 +100,10 @@ func setupRsync(c *C, enforcePermissions bool, replications int) {
 
        // setup keepclients
        var err error
-       kcSrc, err = setupKeepClient(srcConfig, srcKeepServicesJSON, false, 0)
+       kcSrc, _, err = setupKeepClient(srcConfig, srcKeepServicesJSON, false, 0, blobSignatureTTL)
        c.Check(err, IsNil)
 
-       kcDst, err = setupKeepClient(dstConfig, dstKeepServicesJSON, true, replications)
+       kcDst, _, err = setupKeepClient(dstConfig, dstKeepServicesJSON, true, replications, 0)
        c.Check(err, IsNil)
 
        for uuid := range kcSrc.LocalRoots() {
@@ -174,7 +175,7 @@ func testNoCrosstalk(c *C, testData string, kc1, kc2 *keepclient.KeepClient) {
        c.Assert(err, Equals, nil)
 
        locator = strings.Split(locator, "+")[0]
-       _, _, _, err = kc2.Get(keepclient.SignLocator(locator, kc2.Arvados.ApiToken, time.Now().AddDate(0, 0, 1), []byte(blobSigningKey)))
+       _, _, _, err = kc2.Get(keepclient.SignLocator(locator, kc2.Arvados.ApiToken, time.Now().AddDate(0, 0, 1), blobSignatureTTL, []byte(blobSigningKey)))
        c.Assert(err, NotNil)
        c.Check(err.Error(), Equals, "Block not found")
 }
@@ -256,7 +257,7 @@ func testKeepRsync(c *C, enforcePermissions bool, prefix string) {
        // setupTestData
        setupTestData(c, prefix)
 
-       err := performKeepRsync(kcSrc, kcDst, blobSigningKey, prefix)
+       err := performKeepRsync(kcSrc, kcDst, blobSignatureTTL, blobSigningKey, prefix)
        c.Check(err, IsNil)
 
        // Now GetIndex from dst and verify that all 5 from src and the 2 extra blocks are found
@@ -327,7 +328,7 @@ func (s *ServerRequiredSuite) TestErrorDuringRsync_FakeSrcKeepservers(c *C) {
 
        setupRsync(c, false, 1)
 
-       err := performKeepRsync(kcSrc, kcDst, "", "")
+       err := performKeepRsync(kcSrc, kcDst, blobSignatureTTL, "", "")
        log.Printf("Err = %v", err)
        c.Check(strings.Contains(err.Error(), "no such host"), Equals, true)
 }
@@ -339,7 +340,7 @@ func (s *ServerRequiredSuite) TestErrorDuringRsync_FakeDstKeepservers(c *C) {
 
        setupRsync(c, false, 1)
 
-       err := performKeepRsync(kcSrc, kcDst, "", "")
+       err := performKeepRsync(kcSrc, kcDst, blobSignatureTTL, "", "")
        log.Printf("Err = %v", err)
        c.Check(strings.Contains(err.Error(), "no such host"), Equals, true)
 }
@@ -354,7 +355,7 @@ func (s *ServerRequiredSuite) TestErrorDuringRsync_ErrorGettingBlockFromSrc(c *C
        // Change blob signing key to a fake key, so that Get from src fails
        blobSigningKey = "thisisfakeblobsigningkey"
 
-       err := performKeepRsync(kcSrc, kcDst, blobSigningKey, "")
+       err := performKeepRsync(kcSrc, kcDst, blobSignatureTTL, blobSigningKey, "")
        c.Check(strings.Contains(err.Error(), "HTTP 403 \"Forbidden\""), Equals, true)
 }
 
@@ -368,7 +369,7 @@ func (s *ServerRequiredSuite) TestErrorDuringRsync_ErrorPuttingBlockInDst(c *C)
        // Increase Want_replicas on dst to result in insufficient replicas error during Put
        kcDst.Want_replicas = 2
 
-       err := performKeepRsync(kcSrc, kcDst, blobSigningKey, "")
+       err := performKeepRsync(kcSrc, kcDst, blobSignatureTTL, blobSigningKey, "")
        c.Check(strings.Contains(err.Error(), "Could not write sufficient replicas"), Equals, true)
 }
 
@@ -416,6 +417,18 @@ func (s *ServerNotRequiredSuite) TestLoadConfig_ErrorLoadingSrcConfig(c *C) {
        c.Assert(strings.Contains(err.Error(), "no such file or directory"), Equals, true)
 }
 
+func (s *ServerNotRequiredSuite) TestSetupKeepClient_NoBlobSignatureTTL(c *C) {
+       var srcConfig apiConfig
+       srcConfig.APIHost = os.Getenv("ARVADOS_API_HOST")
+       srcConfig.APIToken = arvadostest.DataManagerToken
+       srcConfig.APIHostInsecure = matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE"))
+       arvadostest.StartKeep(2, false)
+       defer arvadostest.StopKeep(2)
+
+       _, ttl, err := setupKeepClient(srcConfig, srcKeepServicesJSON, false, 0, 0)
+       c.Check(err, IsNil)
+       c.Assert(ttl, Equals, blobSignatureTTL)
+}
+
 func setupConfigFile(c *C, name string) *os.File {
        // Setup a config file
        file, err := ioutil.TempFile(os.TempDir(), name)