Merge branch 'master' into 3618-column-ordering
author Phil Hodgson <bitbucket@philhodgson.net>
Fri, 24 Oct 2014 12:06:45 +0000 (14:06 +0200)
committer Phil Hodgson <bitbucket@philhodgson.net>
Fri, 24 Oct 2014 12:06:45 +0000 (14:06 +0200)
144 files changed:
apps/workbench/Gemfile
apps/workbench/Gemfile.lock
apps/workbench/app/assets/javascripts/dates.js
apps/workbench/app/assets/javascripts/infinite_scroll.js
apps/workbench/app/assets/stylesheets/application.css.scss
apps/workbench/app/controllers/actions_controller.rb
apps/workbench/app/controllers/collections_controller.rb
apps/workbench/app/controllers/pipeline_instances_controller.rb
apps/workbench/app/controllers/projects_controller.rb
apps/workbench/app/helpers/application_helper.rb
apps/workbench/app/helpers/pipeline_instances_helper.rb
apps/workbench/app/models/api_client_authorization.rb
apps/workbench/app/models/arvados_base.rb
apps/workbench/app/models/authorized_key.rb
apps/workbench/app/models/collection.rb
apps/workbench/app/models/job.rb
apps/workbench/app/models/pipeline_instance.rb
apps/workbench/app/models/user.rb
apps/workbench/app/models/virtual_machine.rb
apps/workbench/app/views/application/_job_progress.html.erb
apps/workbench/app/views/application/_job_status_label.html.erb [deleted file]
apps/workbench/app/views/application/_pipeline_status_label.html.erb
apps/workbench/app/views/application/_title_and_buttons.html.erb
apps/workbench/app/views/collections/hash_matches.html.erb [new file with mode: 0644]
apps/workbench/app/views/collections/show.html.erb
apps/workbench/app/views/jobs/_show_recent.html.erb
apps/workbench/app/views/pipeline_instances/_running_component.html.erb
apps/workbench/app/views/pipeline_instances/_show_components_running.html.erb
apps/workbench/app/views/pipeline_instances/_show_recent.html.erb
apps/workbench/app/views/projects/_show_dashboard.html.erb
apps/workbench/app/views/projects/_show_sharing.html.erb
apps/workbench/app/views/users/_tables.html.erb
apps/workbench/test/functional/collections_controller_test.rb
apps/workbench/test/functional/projects_controller_test.rb
apps/workbench/test/integration/collections_test.rb
apps/workbench/test/integration/pipeline_instances_test.rb
apps/workbench/test/integration/projects_test.rb
apps/workbench/test/integration/users_test.rb
apps/workbench/test/test_helper.rb
apps/workbench/test/unit/collection_test.rb
apps/workbench/test/unit/group_test.rb
apps/workbench/test/unit/job_test.rb
apps/workbench/test/unit/pipeline_instance_test.rb
crunch_scripts/crunchutil/subst.py
crunch_scripts/run-command
doc/_config.yml
doc/_includes/_example_docker.liquid [new file with mode: 0644]
doc/_includes/_run_command_foreach_example.liquid [new file with mode: 0644]
doc/_includes/_run_command_simple_example.liquid [new file with mode: 0644]
doc/_includes/_skip_sso_server_install.liquid [new file with mode: 0644]
doc/_includes/_tutorial_submit_job.liquid [new file with mode: 0644]
doc/api/methods/nodes.html.textile.liquid
doc/api/schema/Group.html.textile.liquid
doc/api/schema/User.html.textile.liquid
doc/images/workbench-dashboard.png
doc/install/install-api-server.html.textile.liquid
doc/install/install-keep.html.textile.liquid
doc/install/install-sso.html.textile.liquid
doc/install/install-workbench-app.html.textile.liquid
doc/user/reference/job-pipeline-ref.html.textile.liquid [new file with mode: 0644]
doc/user/topics/arv-docker.html.textile.liquid [new file with mode: 0644]
doc/user/topics/run-command.html.textile.liquid [new file with mode: 0644]
doc/user/tutorials/running-external-program.html.textile.liquid
doc/user/tutorials/tutorial-keep-mount.html.textile.liquid
doc/user/tutorials/tutorial-new-pipeline.html.textile.liquid [deleted file]
doc/user/tutorials/tutorial-pipeline-workbench.html.textile.liquid
doc/user/tutorials/tutorial-submit-job.html.textile.liquid
sdk/cli/bin/arv
sdk/cli/bin/arv-run-pipeline-instance
sdk/cli/bin/crunch-job
sdk/python/arvados/api.py
sdk/python/arvados/commands/keepdocker.py
sdk/python/arvados/commands/put.py
sdk/python/arvados/commands/ws.py [new file with mode: 0644]
sdk/python/arvados/config.py
sdk/python/arvados/errors.py
sdk/python/arvados/events.py
sdk/python/bin/arv-ws
sdk/python/tests/run_test_server.py
sdk/python/tests/test_api.py
sdk/python/tests/test_websockets.py
services/api/Gemfile
services/api/Gemfile.lock
services/api/app/controllers/arvados/v1/jobs_controller.rb
services/api/app/controllers/arvados/v1/nodes_controller.rb
services/api/app/models/arvados_model.rb
services/api/app/models/job.rb
services/api/app/models/log.rb
services/api/app/models/node.rb
services/api/app/models/user.rb
services/api/config/application.default.yml
services/api/config/initializers/eventbus.rb
services/api/lib/eventbus.rb
services/api/script/crunch-dispatch.rb
services/api/test/fixtures/api_client_authorizations.yml
services/api/test/fixtures/collections.yml
services/api/test/fixtures/groups.yml
services/api/test/fixtures/jobs.yml
services/api/test/fixtures/links.yml
services/api/test/fixtures/pipeline_instances.yml
services/api/test/fixtures/users.yml
services/api/test/functional/arvados/v1/jobs_controller_test.rb
services/api/test/functional/arvados/v1/nodes_controller_test.rb
services/api/test/functional/arvados/v1/users_controller_test.rb
services/api/test/integration/websocket_test.rb
services/api/test/test_helper.rb
services/crunchstat/crunchstat.go
services/crunchstat/crunchstat_test.go [new file with mode: 0644]
services/fuse/arvados_fuse/__init__.py
services/keepproxy/keepproxy.go
services/keepproxy/keepproxy_test.go
services/keepstore/handler_test.go
services/keepstore/handlers.go
services/keepstore/perms.go
services/keepstore/perms_test.go
services/keepstore/volume_unix.go
services/keepstore/volume_unix_test.go
services/nodemanager/.gitignore [new file with mode: 0644]
services/nodemanager/README.rst [new file with mode: 0644]
services/nodemanager/arvnodeman/__init__.py [new file with mode: 0644]
services/nodemanager/arvnodeman/clientactor.py [new file with mode: 0644]
services/nodemanager/arvnodeman/computenode/__init__.py [new file with mode: 0644]
services/nodemanager/arvnodeman/computenode/dummy.py [new file with mode: 0644]
services/nodemanager/arvnodeman/computenode/ec2.py [new file with mode: 0644]
services/nodemanager/arvnodeman/config.py [new file with mode: 0644]
services/nodemanager/arvnodeman/daemon.py [new file with mode: 0644]
services/nodemanager/arvnodeman/jobqueue.py [new file with mode: 0644]
services/nodemanager/arvnodeman/launcher.py [new file with mode: 0644]
services/nodemanager/arvnodeman/nodelist.py [new file with mode: 0644]
services/nodemanager/arvnodeman/timedcallback.py [new file with mode: 0644]
services/nodemanager/bin/arvados-node-manager [new file with mode: 0644]
services/nodemanager/doc/ec2.example.cfg [new file with mode: 0644]
services/nodemanager/doc/local.example.cfg [new file with mode: 0644]
services/nodemanager/setup.py [new file with mode: 0644]
services/nodemanager/tests/__init__.py [new file with mode: 0644]
services/nodemanager/tests/test_clientactor.py [new file with mode: 0644]
services/nodemanager/tests/test_computenode.py [new file with mode: 0644]
services/nodemanager/tests/test_computenode_ec2.py [new file with mode: 0644]
services/nodemanager/tests/test_config.py [new file with mode: 0644]
services/nodemanager/tests/test_daemon.py [new file with mode: 0644]
services/nodemanager/tests/test_jobqueue.py [new file with mode: 0644]
services/nodemanager/tests/test_nodelist.py [new file with mode: 0644]
services/nodemanager/tests/test_timedcallback.py [new file with mode: 0644]
services/nodemanager/tests/testutil.py [new file with mode: 0644]

diff --git a/apps/workbench/Gemfile b/apps/workbench/Gemfile
index a20e4e6ccf79dfea4c64669b4715ef6b08cbf28f..68f609d8db2bd60d21c88d14e692f9623e9e7a55 100644 (file)
@@ -76,7 +76,7 @@ gem 'andand'
 gem 'RedCloth'
 
 gem 'piwik_analytics'
-gem 'httpclient'
+gem 'httpclient', '~> 2.5.0'
 
 # This fork has Rails 4 compatible routes
 gem 'themes_for_rails', git: 'https://github.com/holtkampw/themes_for_rails', ref: '1fd2d7897d75ae0d6375f4c390df87b8e91ad417'
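
The '~> 2.5.0' requirement added above is Bundler's pessimistic version constraint: it permits any httpclient 2.5.x patch release while excluding 2.6.0 and later. A minimal Gemfile sketch of the equivalent pair of requirements:

    # '~> 2.5.0' is shorthand for '>= 2.5.0', '< 2.6.0': patch
    # releases are allowed, the next minor version is not.
    source 'https://rubygems.org'
    gem 'httpclient', '~> 2.5.0'
    # equivalently:
    # gem 'httpclient', '>= 2.5.0', '< 2.6.0'
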
diff --git a/apps/workbench/Gemfile.lock b/apps/workbench/Gemfile.lock
index 301ef926669ad9f6f00e15a7a581570b96c0121e..1e882a9af76165c7c47af7eafb7160742f7cb429 100644 (file)
@@ -100,7 +100,7 @@ GEM
     headless (1.0.1)
     highline (1.6.20)
     hike (1.2.3)
-    httpclient (2.3.4.1)
+    httpclient (2.5.0)
     i18n (0.6.9)
     jquery-rails (3.0.4)
       railties (>= 3.0, < 5.0)
@@ -242,7 +242,7 @@ DEPENDENCIES
   coffee-rails
   deep_merge
   headless
-  httpclient
+  httpclient (~> 2.5.0)
   jquery-rails
   less
   less-rails
diff --git a/apps/workbench/app/assets/javascripts/dates.js b/apps/workbench/app/assets/javascripts/dates.js
index 903fb01cb503d670fc989e92e5e9ed37e7b53522..5e4b804a2d83e0b977f208dd99be588d9ecc8271 100644 (file)
@@ -14,7 +14,7 @@ $(document).on('ajax:complete arv:pane:loaded ready', function() {
             if ($(elm).attr('data-utc-date-opts') && $(elm).attr('data-utc-date-opts').match(/noseconds/)) {
                 $(elm).text((ts.getHours() > 12 ? (ts.getHours()-12) : ts.getHours())
                             + ":" + (ts.getMinutes() < 10 ? '0' : '') + ts.getMinutes()
-                            + (ts.getHours() > 12 ? " PM " : " AM ")
+                            + (ts.getHours() >= 12 ? " PM " : " AM ")
                             + ts.toLocaleDateString());
             } else {
                 $(elm).text(ts.toLocaleTimeString() + " " + ts.toLocaleDateString());
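
The one-character change above fixes the noon hour: Date#getHours() returns 12 from 12:00 through 12:59, so the old test `> 12` labeled that whole hour "AM". A minimal Ruby sketch (not part of the commit; helper name hypothetical) of the corrected 12-hour conversion:

    # Hours at or after noon are "PM", so the comparison must be >= 12.
    def twelve_hour_clock(hours, minutes)
      meridiem = hours >= 12 ? 'PM' : 'AM'
      display = hours % 12
      display = 12 if display.zero?
      format('%d:%02d %s', display, minutes, meridiem)
    end

    twelve_hour_clock(12, 30)  # => "12:30 PM" (labeled "AM" before the fix)
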
diff --git a/apps/workbench/app/assets/javascripts/infinite_scroll.js b/apps/workbench/app/assets/javascripts/infinite_scroll.js
index 1f947e1c14c8232a13a022dd791d472859d0de3b..2ca45484baaa974b842c74bb3c35b5f926d17f31 100644 (file)
@@ -41,45 +41,26 @@ function maybe_load_more_content(event) {
         $container.append(spinner);
         $container.attr('data-infinite-serial', serial);
 
-        // Combine infiniteContentParams from multiple sources. This
-        // mechanism allows each of several components to set and
-        // update its own set of filters, without having to worry
-        // about stomping on some other component's filters.
-        //
-        // For example, filterable.js writes filters in
-        // infiniteContentParamsFilterable ("search for text foo")
-        // without worrying about clobbering the filters set up by the
-        // tab pane ("only show jobs and pipelines in this tab").
-        params = {};
-        $.each($container.data(), function(datakey, datavalue) {
-            // Note: We attach these data to DOM elements using
-            // <element data-foo-bar="baz">. We store/retrieve them
-            // using $('element').data('foo-bar'), although
-            // .data('fooBar') would also work. The "all data" hash
-            // returned by $('element').data(), however, always has
-            // keys like 'fooBar'. In other words, where we have a
-            // choice, we stick with the 'foo-bar' style to be
-            // consistent with HTML. Here, our only option is
-            // 'fooBar'.
-            if (/^infiniteContentParams/.exec(datakey)) {
-                if (datavalue instanceof Object) {
-                    $.each(datavalue, function(hkey, hvalue) {
-                        if (hvalue instanceof Array) {
-                            params[hkey] = (params[hkey] || []).concat(hvalue);
-                        } else if (hvalue instanceof Object) {
-                            $.extend(params[hkey], hvalue);
-                        } else {
-                            params[hkey] = hvalue;
-                        }
-                    });
+        if (src == $container.attr('data-infinite-content-href0')) {
+            // If we're loading the first page, collect filters from
+            // various sources.
+            params = mergeInfiniteContentParams($container);
+            $.each(params, function(k,v) {
+                if (v instanceof Object) {
+                    params[k] = JSON.stringify(v);
                 }
-            }
-        });
-        $.each(params, function(k,v) {
-            if (v instanceof Object) {
-                params[k] = JSON.stringify(v);
-            }
-        });
+            });
+        } else {
+            // If we're loading page >1, ignore other filtering
+            // mechanisms and just use the "next page" URI from the
+            // previous page's response. Aside from avoiding race
+            // conditions (where page 2 could have different filters
+            // than page 1), this allows the server to use filters in
+            // the "next page" URI to achieve paging. (To apply any
+            // new filters effectively, we need to load page 1 again
+            // anyway.)
+            params = {};
+        }
 
         $.ajax(src,
                {dataType: 'json',
@@ -127,6 +108,45 @@ function ping_all_scrollers() {
     $('.infinite-scroller').add(window).trigger('scroll');
 }
 
+function mergeInfiniteContentParams($container) {
+    var params = {};
+    // Combine infiniteContentParams from multiple sources. This
+    // mechanism allows each of several components to set and
+    // update its own set of filters, without having to worry
+    // about stomping on some other component's filters.
+    //
+    // For example, filterable.js writes filters in
+    // infiniteContentParamsFilterable ("search for text foo")
+    // without worrying about clobbering the filters set up by the
+    // tab pane ("only show jobs and pipelines in this tab").
+    $.each($container.data(), function(datakey, datavalue) {
+        // Note: We attach these data to DOM elements using
+        // <element data-foo-bar="baz">. We store/retrieve them
+        // using $('element').data('foo-bar'), although
+        // .data('fooBar') would also work. The "all data" hash
+        // returned by $('element').data(), however, always has
+        // keys like 'fooBar'. In other words, where we have a
+        // choice, we stick with the 'foo-bar' style to be
+        // consistent with HTML. Here, our only option is
+        // 'fooBar'.
+        if (/^infiniteContentParams/.exec(datakey)) {
+            if (datavalue instanceof Object) {
+                $.each(datavalue, function(hkey, hvalue) {
+                    if (hvalue instanceof Array) {
+                        params[hkey] = (params[hkey] || []).
+                            concat(hvalue);
+                    } else if (hvalue instanceof Object) {
+                        $.extend(params[hkey], hvalue);
+                    } else {
+                        params[hkey] = hvalue;
+                    }
+                });
+            }
+        }
+    });
+    return params;
+}
+
 $(document).
     on('click', 'div.infinite-retry button', function() {
         var $retry_div = $(this).closest('.infinite-retry');
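
The extracted mergeInfiniteContentParams above combines per-component filter hashes so that, as its comments describe, Array values concatenate, nested Objects merge, and scalars overwrite. A minimal Ruby sketch of the same merge semantics, with hypothetical inputs:

    # Merge several parameter hashes the way mergeInfiniteContentParams
    # does: concatenate Arrays, shallow-merge Hashes, overwrite scalars.
    def merge_content_params(sources)
      sources.each_with_object({}) do |src, params|
        src.each do |key, value|
          case value
          when Array then params[key] = (params[key] || []) + value
          when Hash  then params[key] = (params[key] || {}).merge(value)
          else            params[key] = value
          end
        end
      end
    end

    # e.g. tab-pane filters and filterable.js filters combine:
    merge_content_params([
      { filters: [['uuid', 'is_a', 'arvados#job']] },
      { filters: [['name', 'ilike', '%foo%']], limit: 50 },
    ])
    # => { filters: [<both filter clauses>], limit: 50 }
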
diff --git a/apps/workbench/app/assets/stylesheets/application.css.scss b/apps/workbench/app/assets/stylesheets/application.css.scss
index 77fe7126c05e0ea9b64c368d226dad63e36462f0..7007d8c348657c11dc08b19d8c051db360b603d5 100644 (file)
@@ -261,7 +261,9 @@ span.editable-textile {
   min-width: 1em;
   padding: 0px 2px 0px 0px;
 }
-
+.task-summary-status {
+  font-size: 80%;
+}
 #page-wrapper > div > h2 {
   margin-top: 0px;
 }
diff --git a/apps/workbench/app/controllers/actions_controller.rb b/apps/workbench/app/controllers/actions_controller.rb
index 62533d81b4ab0a50927379401443dab54f4169e8..59dcbb92bb9c57db69fc15277233a0072fd73dac 100644 (file)
@@ -150,7 +150,7 @@ class ActionsController < ApplicationController
     files.each do |m|
       mt = chash[m[1]+m[2]].andand.manifest_text
       if not m[4].nil? and m[4].size > 1
-        combined += arv_normalize mt, '--extract', m[4][1..-1]
+        combined += arv_normalize mt, '--extract', ".#{m[4]}"
       else
         combined += mt
       end
diff --git a/apps/workbench/app/controllers/collections_controller.rb b/apps/workbench/app/controllers/collections_controller.rb
index 3c6c9d7a8571842ce55fc594b7242e8ce5d193b8..4e0008d93cf63887926c7d3ef7907d4f1f377bd3 100644 (file)
@@ -195,51 +195,63 @@ class CollectionsController < ApplicationController
     end
   end
 
+  def find_object_by_uuid
+    if not Keep::Locator.parse params[:id]
+      super
+    end
+  end
+
   def show
     return super if !@object
     if current_user
-      jobs_with = lambda do |conds|
-        Job.limit(RELATION_LIMIT).where(conds)
-          .results.sort_by { |j| j.finished_at || j.created_at }
-      end
-      @output_of = jobs_with.call(output: @object.portable_data_hash)
-      @log_of = jobs_with.call(log: @object.portable_data_hash)
-      @project_links = Link.limit(RELATION_LIMIT).order("modified_at DESC")
-        .where(head_uuid: @object.uuid, link_class: 'name').results
-      project_hash = Group.where(uuid: @project_links.map(&:tail_uuid)).to_hash
-      @projects = project_hash.values
-
-      if @object.uuid.match /[0-9a-f]{32}/
-        @same_pdh = Collection.filter([["portable_data_hash", "=", @object.portable_data_hash]])
-        owners = @same_pdh.map {|s| s.owner_uuid}.to_a
+      if Keep::Locator.parse params["uuid"]
+        @same_pdh = Collection.filter([["portable_data_hash", "=", @object.portable_data_hash]]).limit(1000)
+        if @same_pdh.results.size == 1
+          redirect_to collection_path(@same_pdh[0]["uuid"])
+          return
+        end
+        owners = @same_pdh.map(&:owner_uuid).to_a.uniq
         preload_objects_for_dataclass Group, owners
         preload_objects_for_dataclass User, owners
+        render 'hash_matches'
+        return
+      else
+        jobs_with = lambda do |conds|
+          Job.limit(RELATION_LIMIT).where(conds)
+            .results.sort_by { |j| j.finished_at || j.created_at }
+        end
+        @output_of = jobs_with.call(output: @object.portable_data_hash)
+        @log_of = jobs_with.call(log: @object.portable_data_hash)
+        @project_links = Link.limit(RELATION_LIMIT).order("modified_at DESC")
+          .where(head_uuid: @object.uuid, link_class: 'name').results
+        project_hash = Group.where(uuid: @project_links.map(&:tail_uuid)).to_hash
+        @projects = project_hash.values
+
+        @permissions = Link.limit(RELATION_LIMIT).order("modified_at DESC")
+          .where(head_uuid: @object.uuid, link_class: 'permission',
+                 name: 'can_read').results
+        @logs = Log.limit(RELATION_LIMIT).order("created_at DESC")
+          .where(object_uuid: @object.uuid).results
+        @is_persistent = Link.limit(1)
+          .where(head_uuid: @object.uuid, tail_uuid: current_user.uuid,
+                 link_class: 'resources', name: 'wants')
+          .results.any?
+        @search_sharing = search_scopes
+
+        if params["tab_pane"] == "Provenance_graph"
+          @prov_svg = ProvenanceHelper::create_provenance_graph(@object.provenance, "provenance_svg",
+                                                                {:request => request,
+                                                                  :direction => :bottom_up,
+                                                                  :combine_jobs => :script_only}) rescue nil
+        end
+        if params["tab_pane"] == "Used_by"
+          @used_by_svg = ProvenanceHelper::create_provenance_graph(@object.used_by, "used_by_svg",
+                                                                   {:request => request,
+                                                                     :direction => :top_down,
+                                                                     :combine_jobs => :script_only,
+                                                                     :pdata_only => true}) rescue nil
+        end
       end
-
-      @permissions = Link.limit(RELATION_LIMIT).order("modified_at DESC")
-        .where(head_uuid: @object.uuid, link_class: 'permission',
-               name: 'can_read').results
-      @logs = Log.limit(RELATION_LIMIT).order("created_at DESC")
-        .where(object_uuid: @object.uuid).results
-      @is_persistent = Link.limit(1)
-        .where(head_uuid: @object.uuid, tail_uuid: current_user.uuid,
-               link_class: 'resources', name: 'wants')
-        .results.any?
-      @search_sharing = search_scopes
-    end
-
-    if params["tab_pane"] == "Provenance_graph"
-      @prov_svg = ProvenanceHelper::create_provenance_graph(@object.provenance, "provenance_svg",
-                                                            {:request => request,
-                                                              :direction => :bottom_up,
-                                                              :combine_jobs => :script_only}) rescue nil
-    end
-    if params["tab_pane"] == "Used_by"
-      @used_by_svg = ProvenanceHelper::create_provenance_graph(@object.used_by, "used_by_svg",
-                                                               {:request => request,
-                                                                 :direction => :top_down,
-                                                                 :combine_jobs => :script_only,
-                                                                 :pdata_only => true}) rescue nil
     end
     super
   end
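
The rewritten show action above branches on whether the requested id is a portable data hash rather than a collection UUID: a single match redirects to the canonical UUID page, and multiple matches render the new hash_matches view. A minimal standalone sketch of that dispatch, using a hypothetical regexp in place of Keep::Locator.parse:

    # Hypothetical stand-in for Keep::Locator.parse: a portable data
    # hash is a 32-digit hex MD5, optionally followed by '+<size>'.
    PDH_RE = /\A[0-9a-f]{32}(\+\d+)?\z/

    # Decide how to answer GET /collections/:id, given how many
    # collections share the requested content hash.
    def collection_show_action(id, match_count)
      return :show_by_uuid unless PDH_RE.match?(id)
      match_count == 1 ? :redirect_to_uuid : :render_hash_matches
    end
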
diff --git a/apps/workbench/app/controllers/pipeline_instances_controller.rb b/apps/workbench/app/controllers/pipeline_instances_controller.rb
index 27e8f5b51b0e12eea2fc24fe523f7b4f88365f84..a618d435173b2a67db2cfcba3d5ef797db4cf8c0 100644 (file)
@@ -47,6 +47,13 @@ class PipelineInstancesController < ApplicationController
       component.delete :job
     end
     @object.state = 'New'
+
+    # set owner_uuid to that of the source, provided it is a project and writable by the current user
+    current_project = Group.find(source.owner_uuid) rescue nil
+    if (current_project && current_project.writable_by.andand.include?(current_user.uuid))
+      @object.owner_uuid = source.owner_uuid
+    end
+
     super
   end
 
diff --git a/apps/workbench/app/controllers/projects_controller.rb b/apps/workbench/app/controllers/projects_controller.rb
index 435e0cd5d68f9086a1cf83ea8d5377d360daedac..b77a48973f5a32f3a33685e6d0dc749f3c6c5027 100644 (file)
@@ -192,7 +192,7 @@ class ProjectsController < ApplicationController
                                      limit: @limit,
                                      include_linked: true,
                                      filters: (@filters - kind_filters + [['uuid', 'is_a', type]]),
-                                     offset: @offset)
+                                    )
           objects.each do |object|
             @name_link_for[object.andand.uuid] = objects.links_for(object, 'name').first
           end
diff --git a/apps/workbench/app/helpers/application_helper.rb b/apps/workbench/app/helpers/application_helper.rb
index d343249eaf7bef019fdd5efda4d28ec9521451d2..66b7ed662abd86416f13e4b45a10faa5a786d8e0 100644 (file)
@@ -150,9 +150,7 @@ module ApplicationHelper
 
   def render_editable_attribute(object, attr, attrvalue=nil, htmloptions={})
     attrvalue = object.send(attr) if attrvalue.nil?
-    if !object.attribute_editable?(attr, :ever) or
-        (!object.editable? and
-         !object.owner_uuid.in?(my_projects.collect(&:uuid)))
+    if not object.attribute_editable?(attr)
       if attrvalue && attrvalue.length > 0
         return render_attribute_as_textile( object, attr, attrvalue, false )
       else
@@ -241,10 +239,7 @@ module ApplicationHelper
       preconfigured_search_str = value_info[:search_for]
     end
 
-    if !object or
-        !object.attribute_editable?(attr, :ever) or
-        (!object.editable? and
-         !object.owner_uuid.in?(my_projects.collect(&:uuid)))
+    if not object.andand.attribute_editable?(attr)
       return link_to_if_arvados_object attrvalue
     end
 
diff --git a/apps/workbench/app/helpers/pipeline_instances_helper.rb b/apps/workbench/app/helpers/pipeline_instances_helper.rb
index 06d61e033f12caeb36fc4ed01596c1952e439fbb..b0d5216efd1588069050d5b0d5aa371abc07492d 100644 (file)
@@ -18,12 +18,12 @@ module PipelineInstancesHelper
   def render_pipeline_job pj
     pj[:progress_bar] = render partial: 'job_progress', locals: {:j => pj[:job]}
     pj[:output_link] = link_to_if_arvados_object pj[:output]
-    pj[:job_link] = link_to_if_arvados_object pj[:job][:uuid]
+    pj[:job_link] = link_to_if_arvados_object pj[:job][:uuid] if pj[:job]
     pj
   end
 
   # Merge (started_at, finished_at) time range into the list of time ranges in
-  # timestamps (timestamps must be sorted and non-overlapping).  
+  # timestamps (timestamps must be sorted and non-overlapping).
   # return the updated timestamps list.
   def merge_range timestamps, started_at, finished_at
     # in the comments below, 'i' is the entry in the timestamps array and 'j'
@@ -62,7 +62,7 @@ module PipelineInstancesHelper
 
     timestamps << [started_at, finished_at]
   end
-  
+
   # Accept a list of objects with [:started_at] and [:finished_at] keys and
   # merge overlapping ranges to compute the time spent running after periods of
   # overlapping execution are factored out.
@@ -269,7 +269,7 @@ module PipelineInstancesHelper
 
     if round_to_min and seconds >= 30
       minutes += 1
-    end    
+    end
 
     if use_words
       s = []
diff --git a/apps/workbench/app/models/api_client_authorization.rb b/apps/workbench/app/models/api_client_authorization.rb
index ac3a9bf8ed53998adcb1ebfdcc6dc038fdc84136..6d1558cc6eb28438003b2599571679370eb6b845 100644 (file)
@@ -1,6 +1,6 @@
 class ApiClientAuthorization < ArvadosBase
-  def attribute_editable? attr, *args
-    ['expires_at', 'default_owner_uuid'].index attr
+  def editable_attributes
+    %w(expires_at default_owner_uuid)
   end
   def self.creatable?
     false
diff --git a/apps/workbench/app/models/arvados_base.rb b/apps/workbench/app/models/arvados_base.rb
index e0e93b9e2d0828cef0149f95fe51c817260ef3c7..f5be0e1edcba20ddfb1f80f4ece1912eac0d5dfd 100644 (file)
@@ -329,11 +329,20 @@ class ArvadosBase < ActiveRecord::Base
      (current_user.is_admin or
       current_user.uuid == self.owner_uuid or
       new_record? or
-      (writable_by.include? current_user.uuid rescue false))) or false
+      (respond_to?(:writable_by) ?
+       writable_by.include?(current_user.uuid) :
+       (ArvadosBase.find(owner_uuid).writable_by.include? current_user.uuid rescue false)))) or false
+  end
+
+  # Array of strings that are the names of attributes that can be edited
+  # with X-Editable.
+  def editable_attributes
+    self.class.columns.map(&:name) -
+      %w(created_at modified_at modified_by_user_uuid modified_by_client_uuid updated_at)
   end
 
   def attribute_editable?(attr, ever=nil)
-    if %w(created_at modified_at modified_by_user_uuid modified_by_client_uuid updated_at).include? attr.to_s
+    if not editable_attributes.include?(attr.to_s)
       false
     elsif not (current_user.andand.is_active)
       false
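
The refactor above replaces per-attribute attribute_editable? overrides with an editable_attributes whitelist: the base class exposes every column except the audit columns, and attribute_editable? reduces to a membership test. A minimal standalone sketch of the pattern (class and column names hypothetical):

    class ModelBase
      AUDIT_COLUMNS = %w(created_at modified_at modified_by_user_uuid
                         modified_by_client_uuid updated_at)

      def columns
        %w(name description manifest_text created_at modified_at updated_at)
      end

      # Default: everything except the audit columns is editable.
      def editable_attributes
        columns - AUDIT_COLUMNS
      end

      def attribute_editable?(attr, ever=nil)
        editable_attributes.include?(attr.to_s)
      end
    end

    class CollectionLike < ModelBase
      # Subclasses simply return (or subtract from) a whitelist, as
      # Collection and VirtualMachine do in the hunks below.
      def editable_attributes
        %w(name description manifest_text)
      end
    end
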
diff --git a/apps/workbench/app/models/authorized_key.rb b/apps/workbench/app/models/authorized_key.rb
index 724c996e4a4ef173f432514349921ab249eddf2a..2d804e1a5345743957b71aa7389eb15ab223312c 100644 (file)
@@ -1,7 +1,7 @@
 class AuthorizedKey < ArvadosBase
-  def attribute_editable? attr, *args
-    if attr.to_s == 'authorized_user_uuid'
-      current_user and current_user.is_admin
+  def attribute_editable?(attr, ever=nil)
+    if (attr.to_s == 'authorized_user_uuid') and (not ever)
+      current_user.andand.is_admin
     else
       super
     end
diff --git a/apps/workbench/app/models/collection.rb b/apps/workbench/app/models/collection.rb
index 87a083e24be4ee62d3e52cc5336bb6516ef2d5b2..b5347dce00f24c64a388baac1dd9011935ab99c9 100644 (file)
@@ -66,12 +66,8 @@ class Collection < ArvadosBase
     dir_to_tree.call('.')
   end
 
-  def attribute_editable? attr, *args
-    if %w(name description manifest_text).include? attr.to_s
-      true
-    else
-      super
-    end
+  def editable_attributes
+    %w(name description manifest_text)
   end
 
   def self.creatable?
diff --git a/apps/workbench/app/models/job.rb b/apps/workbench/app/models/job.rb
index 977eef91bf278ac5e9a528e935262dce478e4fa5..c59bb89fe851306c80278b4b96ce192c9e064ea7 100644 (file)
@@ -7,12 +7,8 @@ class Job < ArvadosBase
     "#{script} job"
   end
 
-  def attribute_editable? attr, *args
-    if attr.to_sym == :description
-      super && attr.to_sym == :description
-    else
-      false
-    end
+  def editable_attributes
+    %w(description)
   end
 
   def self.creatable?
@@ -42,7 +38,7 @@ class Job < ArvadosBase
     arvados_api_client.api("jobs/", "queue_size", {"_method"=> "GET"})[:queue_size] rescue 0
   end
 
-  def self.queue 
+  def self.queue
     arvados_api_client.unpack_api_response arvados_api_client.api("jobs/", "queue", {"_method"=> "GET"})
   end
 
diff --git a/apps/workbench/app/models/pipeline_instance.rb b/apps/workbench/app/models/pipeline_instance.rb
index 936905713e44891f22261bf6e1c0ba19e598ae75..83328b9e52ce31dd126812ba874de766282fb93c 100644 (file)
@@ -47,10 +47,12 @@ class PipelineInstance < ArvadosBase
     end
   end
 
-  def attribute_editable? attr, *args
-    super && (attr.to_sym == :name || attr.to_sym == :description ||
-              (attr.to_sym == :components and
-               (self.state == 'New' || self.state == 'Ready')))
+  def editable_attributes
+    %w(name description components)
+  end
+
+  def attribute_editable?(name, ever=nil)
+    (ever or %w(New Ready).include?(state)) and super
   end
 
   def attributes_for_display
diff --git a/apps/workbench/app/models/user.rb b/apps/workbench/app/models/user.rb
index 967ea2ad7d238aebfc9f8df1d4b6e171949479cf..7aaa4fe93951ca831add8b7bae6778e251b8b871 100644 (file)
@@ -35,8 +35,9 @@ class User < ArvadosBase
     super.reject { |k,v| %w(owner_uuid default_owner_uuid identity_url prefs).index k }
   end
 
-  def attribute_editable? attr, *args
-    (not (self.uuid.andand.match(/000000000000000$/) and self.is_admin)) and super
+  def attribute_editable?(attr, ever=nil)
+    (ever or not (self.uuid.andand.match(/000000000000000$/) and
+                  self.is_admin)) and super
   end
 
   def friendly_link_name lookup=nil
diff --git a/apps/workbench/app/models/virtual_machine.rb b/apps/workbench/app/models/virtual_machine.rb
index 083aae31ecb10c0f7b5b97e0e6d7b9c5129cff2e..3b44397df5459efb7074f46bd094826192e061eb 100644 (file)
@@ -6,8 +6,8 @@ class VirtualMachine < ArvadosBase
   def attributes_for_display
     super.append ['current_user_logins', @current_user_logins]
   end
-  def attribute_editable? attr, *args
-    attr != 'current_user_logins' and super
+  def editable_attributes
+    super - %w(current_user_logins)
   end
   def self.attribute_info
     merger = ->(k,a,b) { a.merge(b, &merger) }
diff --git a/apps/workbench/app/views/application/_job_progress.html.erb b/apps/workbench/app/views/application/_job_progress.html.erb
index 5c19779cccfb29dc418597e60b1a498d5af61bcd..efe1819ebd6d5cf73512046b2fcefcbb084a44b1 100644 (file)
@@ -1,43 +1,51 @@
-<%
-   failed = j[:tasks_summary][:failed] || 0 rescue 0
-   done = j[:tasks_summary][:done] || 0 rescue 0
-   running = j[:tasks_summary][:running] || 0 rescue 0
-   todo = j[:tasks_summary][:todo] || 0 rescue 0
-
-   if j[:success] == false and done + running + failed == 0
-     # The job failed but no tasks were ever started (i.e. crunch-dispatch
-     # was unable to start the job). Display a full 100% failed progress bar.
-     failed_percent = 100
-     success_percent = 0
-     running_percent = 0
-   elsif done + running + failed + todo == 0
-     # No tasks were ever created for this job;
-     # render an empty progress bar.
-     failed_percent = 0
-     success_percent = 0
-     running_percent = 0
-   else
-     percent_total_tasks = 100.0 / (done + running + failed + todo)
-     if defined? scaleby
-       percent_total_tasks *= scaleby
-     end
-     failed_percent = (failed * percent_total_tasks).ceil
-     success_percent = (done * percent_total_tasks).ceil
-     running_percent = (running * percent_total_tasks).ceil
-   end
-%>
-
-<% if not defined? scaleby %>
-  <div class="progress">
-<% end %>
+<% if (j.andand[:state] == "Running" or defined? scaleby) and (not defined? show_progress_bar or show_progress_bar) %>
+  <%
+    failed = j[:tasks_summary][:failed] || 0 rescue 0
+    done = j[:tasks_summary][:done] || 0 rescue 0
+    running = j[:tasks_summary][:running] || 0 rescue 0
+    todo = j[:tasks_summary][:todo] || 0 rescue 0
+
+    if done + running + failed + todo == 0
+      # No tasks were ever created for this job;
+      # render an empty progress bar.
+      done_percent = 0
+    else
+      percent_total_tasks = 100.0 / (done + running + failed + todo)
+      if defined? scaleby
+        percent_total_tasks *= scaleby
+      end
+      done_percent = (done+failed) * percent_total_tasks
+    end
+    %>
+
+  <% if not defined? scaleby %>
+    <div class="progress" style="margin-bottom: 0px">
+  <% end %>
+
+  <span class="progress-bar <%= if failed == 0 then 'progress-bar-success' else 'progress-bar-warning' end %>" style="width: <%= done_percent %>%;">
+  </span>
+
+  <% if not defined? scaleby %>
+  </div>
+  <% end %>
+
+<% else %>
+
+<% to_label = {
+     "Cancelled" => "danger",
+     "Complete" => "success",
+     "Running" => "info",
+     "Failed" => "danger",
+     "Queued" => "default",
+     nil => "default"
+   } %>
 
-<span class="progress-bar progress-bar-success" style="width: <%= success_percent %>%;">
-</span>
-<span class="progress-bar progress-bar-danger" style="width: <%= failed_percent %>%;">
-</span>
-<span class="progress-bar" style="width: <%= running_percent %>%;">
-</span>
+  <span class="label label-<%= to_label[j.andand[:state]] %>">
+    <%= if defined? title
+          title
+        else
+          if j.andand[:state] then j[:state].downcase else "Not ready" end
+        end
+        %></span>
 
-<% if not defined? scaleby %>
-</div>
 <% end %>
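
The rewritten partial above draws a single progress bar whose width is the share of tasks that have finished (done plus failed), colored "warning" when any task failed, and falls back to a state label (absorbing the deleted _job_status_label partial below) when the job is not running. A minimal Ruby sketch of the width computation, including the scaleby factor used when several components share one bar:

    # Sketch of the math in _job_progress.html.erb: bar width is the
    # finished share of all tasks, optionally scaled so that several
    # jobs can share a single 100%-wide bar.
    def progress_bar(done:, running:, failed:, todo:, scaleby: nil)
      total = done + running + failed + todo
      return { width: 0, style: 'success' } if total.zero?
      percent = 100.0 / total
      percent *= scaleby if scaleby
      { width: (done + failed) * percent,
        style: failed.zero? ? 'success' : 'warning' }
    end

    progress_bar(done: 3, running: 1, failed: 0, todo: 4)
    # => { width: 37.5, style: 'success' }
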
diff --git a/apps/workbench/app/views/application/_job_status_label.html.erb b/apps/workbench/app/views/application/_job_status_label.html.erb
deleted file mode 100644 (file)
index 17073fe..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-<% to_label = {
-     "Cancelled" => "danger",
-     "Complete" => "success",
-     "Running" => "info",
-     "Failed" => "danger",
-     "Queued" => "default",
-     nil => "default"
-   } %>
-
-  <span class="label label-<%= to_label[j[:state]] %>"><%= if defined? title then title else j[:state].downcase end %></span>
diff --git a/apps/workbench/app/views/application/_pipeline_status_label.html.erb b/apps/workbench/app/views/application/_pipeline_status_label.html.erb
index 9e5b71047b3147daf54fe9478429d26555d03e8a..88722726dc61169f3468cc00b2b2d72af6bfac17 100644 (file)
@@ -1,5 +1,5 @@
 <% if p.state == 'Complete' %>
-  <span class="label label-success">finished</span>
+  <span class="label label-success">complete</span>
 <% elsif p.state == 'Failed' %>
   <span class="label label-danger">failed</span>
 <% elsif p.state == 'RunningOnServer' || p.state == 'RunningOnClient' %>
diff --git a/apps/workbench/app/views/application/_title_and_buttons.html.erb b/apps/workbench/app/views/application/_title_and_buttons.html.erb
index 4d54fd08ba0ce0464d561fcb4ed26d16b5ffaeaf..4a40510dc41956cd665f7f8cdfff7236aeeee939 100644 (file)
@@ -31,7 +31,7 @@
         <i class="fa fa-fw fa-copy"></i> Copy to project...
       <% end %>
     <% end %>
-    <% if @object.owner_uuid == current_user.uuid or (Group.find(@object.owner_uuid).writable_by.include?(current_user.uuid) rescue nil) %>
+    <% if (ArvadosBase.find(@object.owner_uuid).writable_by.include?(current_user.uuid) rescue nil) %>
       <%= link_to(
           choose_projects_path(
            title: "Move this #{object_class} to:",
diff --git a/apps/workbench/app/views/collections/hash_matches.html.erb b/apps/workbench/app/views/collections/hash_matches.html.erb
new file mode 100644 (file)
index 0000000..7c4abb0
--- /dev/null
@@ -0,0 +1,23 @@
+<div class="row">
+  <div class="col-md-10 col-md-offset-1">
+    <div class="panel panel-info">
+      <div class="panel-heading">
+        <h3 class="panel-title"><%= params["uuid"] %></h3>
+      </div>
+      <div class="panel-body">
+        <p><i>The following collections have this content:</i></p>
+        <% @same_pdh.sort { |a,b| b.created_at <=> a.created_at }.each do |c| %>
+          <div class="row">
+            <div class="col-md-8">
+              <% owner = object_for_dataclass(Group, c.owner_uuid) || object_for_dataclass(User, c.owner_uuid) %>
+              <%= link_to_if_arvados_object owner, {:friendly_name => true} %> / <%= link_to_if_arvados_object c, {:friendly_name => true} %><br>
+            </div>
+            <div class="col-md-4">
+              <%= render_localized_date c.created_at %>
+            </div>
+          </div>
+        <% end %>
+      </div>
+    </div>
+  </div>
+</div>
diff --git a/apps/workbench/app/views/collections/show.html.erb b/apps/workbench/app/views/collections/show.html.erb
index e1b08290b04559dae63a1b6091d8d4cdc135ce71..c3e0b7cb2eb1aa64fc210dc90daca92b1443d107 100644 (file)
           <%= render_editable_attribute @object, 'description', nil, { 'data-emptytext' => "(No description provided)", 'data-toggle' => 'manual' } %>
         </div>
         <img src="/favicon.ico" class="pull-right" alt="" style="opacity: 0.3"/>
-        <% if defined? @same_pdh %>
-          <p>Found in collections:<p>
-          <p>
-            <% @same_pdh.each do |c| %>
-              <%= link_to_if_arvados_object get_object(c.owner_uuid), {:friendly_name => true} %> / <%= link_to_if_arvados_object c, {:friendly_name => true} %><br>
-            <% end %>
-          </p>
-        <% else %>
-         <p><i>Content hash:</i><br />
-           <span class="arvados-uuid"><%= link_to @object.portable_data_hash, collection_path(@object.portable_data_hash) %></span></p>
-        <% end %>
+       <p><i>Content hash:</i><br />
+         <span class="arvados-uuid"><%= link_to @object.portable_data_hash, collection_path(@object.portable_data_hash) %></span>
+        </p>
         <%= render partial: "show_source_summary" %>
       </div>
     </div>
diff --git a/apps/workbench/app/views/jobs/_show_recent.html.erb b/apps/workbench/app/views/jobs/_show_recent.html.erb
index c823fc590017d5d8ac304445f3d0c01d3065ec74..d12ebb649bee1ef64deb99795588d1760d258be6 100644 (file)
@@ -15,8 +15,6 @@
       <th>
       </th><th>
        status
-      </th><th>
-       progress
       </th><th>
        uuid
       </th><th>
@@ -36,9 +34,6 @@
       <td>
         <i class="icon-plus-sign expand-collapse-row" data-id="<%= j.uuid %>" style="cursor: pointer"></i>
       </td>
-      <td>
-        <%= render partial: 'job_status_label', locals: {:j => j} %>
-      </td>
       <td>
         <div class="inline-progress-container">
           <%= render partial: 'job_progress', locals: {:j => j} %>
 
   </tbody>
 </table>
-
diff --git a/apps/workbench/app/views/pipeline_instances/_running_component.html.erb b/apps/workbench/app/views/pipeline_instances/_running_component.html.erb
index 038efece35e06fb00dc5431d95e58a287c78e72d..caa8377ad03a4d2e11c2662b4b16aef1bfee7391 100644 (file)
@@ -1,84 +1,97 @@
-  <% current_job = pj[:job] if pj[:job] != {} and pj[:job][:uuid] %>
-  <div class="panel panel-default">
-    <div class="panel-heading">
-      <div class="container-fluid">
-        <div class="row">
+<% current_job = pj[:job] if pj[:job] != {} and pj[:job][:uuid] %>
+<div class="panel panel-default">
+  <div class="panel-heading">
+    <div class="container-fluid">
+      <div class="row-fluid">
+        <%# column offset 0 %>
+        <div class="col-md-3">
+          <h4 class="panel-title">
+            <a data-toggle="collapse" href="#collapse<%= i %>" style="white-space: nowrap;">
+              <%= pj[:name] %> <span class="caret"></span>
+            </a>
+          </h4>
+        </div>
+
+        <%# column offset 3 %>
+        <div class="col-md-2 pipeline-instance-spacing">
+          <%= pj[:progress_bar] %>
+        </div>
+
+        <% if current_job %>
+          <%# column offset 5 %>
+          <% if current_job[:state] != "Queued" %>
           <div class="col-md-3">
-            <h4 class="panel-title">
-              <a data-toggle="collapse" href="#collapse<%= i %>" style="white-space: nowrap;">
-                <%= pj[:name] %> <span class="caret"></span>
-              </a>
-            </h4>
+            <% if current_job[:started_at] %>
+              <% walltime = ((if current_job[:finished_at] then current_job[:finished_at] else Time.now() end) - current_job[:started_at]) %>
+              <% cputime = tasks.map { |task|
+                   if task.started_at and task.job_uuid == current_job[:uuid]
+                     (if task.finished_at then task.finished_at else Time.now() end) - task.started_at
+                   else
+                     0
+                   end
+                 }.reduce(:+) || 0 %>
+              <%= render_runtime(walltime, false, false) %>
+              <% if cputime > 0 %> / <%= render_runtime(cputime, false, false) %> (<%= (cputime/walltime).round(1) %>&Cross;)<% end %>
+            <% end %>
           </div>
+          <% end %>
 
-          <% if current_job %>
-            <div class="col-md-1">
-              <%= render(partial: 'job_status_label', locals: { j: current_job }) %>
+          <% if current_job[:state] == "Queued" %>
+            <%# column offset 5 %>
+            <div class="col-md-6">
+              <% queuetime = Time.now - current_job[:created_at] %>
+              Queued for <%= render_runtime(queuetime, true) %>.
+              <% begin %>
+                <% if current_job[:queue_position] == 0 %>
+                  This job is next in the queue to run.
+                <% elsif current_job[:queue_position] == 1 %>
+                  There is 1 job in the queue ahead of this one.
+                <% else %>
+                  There are <%= current_job[:queue_position] %> jobs in the queue ahead of this one.
+                <% end %>
+              <% rescue %>
+              <% end %>
             </div>
-
+          <% elsif current_job[:state] == "Running" %>
+            <%# column offset 8 %>
             <div class="col-md-3">
-              <% if current_job[:started_at] %>
-                <% walltime = ((if current_job[:finished_at] then current_job[:finished_at] else Time.now() end) - current_job[:started_at]) %>
-                <% cputime = tasks.map { |task|
-                     if task.started_at and task.job_uuid == current_job[:uuid]
-                       (if task.finished_at then task.finished_at else Time.now() end) - task.started_at
-                     else
-                       0
-                     end
-                   }.reduce(:+) || 0 %>
-                <%= render_runtime(walltime, false, false) %>
-                <% if cputime > 0 %> / <%= render_runtime(cputime, false, false) %> (<%= (cputime/walltime).round(1) %>&Cross;)<% end %>
+              <span class="task-summary-status">
+                <%= current_job[:tasks_summary][:done] %>&nbsp;<%= "task".pluralize(current_job[:tasks_summary][:done]) %> done,
+                <%= current_job[:tasks_summary][:failed] %>&nbsp;failed,
+                <%= current_job[:tasks_summary][:running] %>&nbsp;running,
+                <%= current_job[:tasks_summary][:todo] %>&nbsp;pending
+              </span>
+            </div>
+          <% elsif current_job[:state].in? ["Complete", "Failed", "Cancelled"] %>
+            <%# column offset 8 %>
+            <div class="col-md-4 text-overflow-ellipsis">
+              <% if pj[:output_uuid] %>
+                <%= link_to_if_arvados_object pj[:output_uuid], friendly_name: true %>
+              <% elsif current_job[:output] %>
+                <%= link_to_if_arvados_object current_job[:output], link_text: "Output of #{pj[:name]}" %>
+              <% else %>
+                No output.
               <% end %>
             </div>
+          <% end %>
 
-            <% if current_job[:state].in? ["Complete", "Failed", "Cancelled"] %>
-              <div class="col-md-5 text-overflow-ellipsis">
-                <% if pj[:output_uuid] %>
-                  <%= link_to_if_arvados_object pj[:output_uuid], friendly_name: true %>
-                <% elsif current_job[:output] %>
-                  <%= link_to_if_arvados_object current_job[:output], link_text: "Output of #{pj[:name]}" %>
-                <% else %>
-                  No output.
-                <% end %>
-              </div>
-            <% elsif current_job[:state] == "Running" %>
-              <div class="col-md-3 pipeline-instance-spacing">
-                <%= pj[:progress_bar] %>
-              </div>
-              <div class="col-md-1 pipeline-instance-spacing">
-                <%= form_tag "/jobs/#{current_job[:uuid]}/cancel", style: "display:inline; padding-left: 1em" do |f| %>
+          <% if current_job[:state].in? ["Queued", "Running"] %>
+            <%# column offset 11 %>
+            <div class="col-md-1 pipeline-instance-spacing">
+              <%= form_tag "/jobs/#{current_job[:uuid]}/cancel", style: "display:inline; padding-left: 1em" do |f| %>
                 <%= hidden_field_tag :return_to, url_for(@object) %>
                 <%= button_tag "Cancel", {class: 'btn btn-xs btn-danger', id: "cancel-job-button"} %>
-            </div>
-            <% end %>
-          <% elsif current_job[:state] == "Queued" %>
-            <div class="col-md-5">
-              <% queuetime = Time.now - current_job[:created_at] %>
-              Queued for <%= render_runtime(queuetime, true) %>.
-              <% begin %>
-              <% if current_job[:queue_position] == 0 %>
-                This job is next in the queue to run.
-              <% elsif current_job[:queue_position] == 1 %>
-                There is 1 job in the queue ahead of this one.
-              <% else  %>
-                There are <%= current_job[:queue_position] %> jobs in the queue ahead of this one.
               <% end %>
-              <% rescue %>
-          <% end %>
             </div>
           <% end %>
-        <% else %>
-          <div class="col-md-3 col-md-offset-3">
-            <span class="label label-default">Not ready</span>
-          </div>
-<% end %>
-</div>
-</div>
-</div>
+        <% end %>
+      </div>
+    </div>
+  </div>
 
-<div id="collapse<%= i %>" class="panel-collapse collapse <%= if expanded then 'in' end %>">
-  <div class="panel-body">
-    <div class="container">
+  <div id="collapse<%= i %>" class="panel-collapse collapse <%= if expanded then 'in' end %>">
+    <div class="panel-body">
+      <div class="container">
         <% current_component = (if current_job then current_job else pj end) %>
         <div class="row">
           <div class="col-md-6">
             <p>script_parameters:</p>
             <pre><%= JSON.pretty_generate(current_component[:script_parameters]) rescue nil %></pre>
           </div>
-          <% if current_component[:tasks_summary] %>
-          <div class="col-md-3">
-            <table>
-              <% [:done, :running, :failed, :todo].each do |d| %>
-              <tr>
-                <td style="padding-right: 1em"><%= 'tasks:' if d == :done %></td>
-                <td style="padding-right: 1em"><%= d.to_s %></td>
-                <td><%= current_component[:tasks_summary][d] %></td>
-              </tr>
-              <% end %>
-            </table>
-          </div>
-          <% end %>
         </div>
+      </div>
     </div>
   </div>
 </div>
-</div>
diff --git a/apps/workbench/app/views/pipeline_instances/_show_components_running.html.erb b/apps/workbench/app/views/pipeline_instances/_show_components_running.html.erb
index acaf9d34f47642593c05597ba867a15657cd792a..165a694e8b3e513ce470e698e9d91dc41af74d52 100644 (file)
@@ -7,13 +7,13 @@
   </p>
 <% end %>
 
-<% tasks = JobTask.filter([['job_uuid', 'in', render_pipeline_jobs.map { |j| j[:job].andand[:uuid] }]]).results %>
-<% runningtime = determine_wallclock_runtime(render_pipeline_jobs.map {|j| j[:job]}) %>
+<% tasks = JobTask.filter([['job_uuid', 'in', render_pipeline_jobs.map { |j| j[:job].andand[:uuid] }.compact]]).results %>
+<% runningtime = determine_wallclock_runtime(render_pipeline_jobs.map {|j| j[:job]}.compact) %>
 
 <p>
   <% if @object.started_at %>
     This pipeline started at <%= render_localized_date(@object.started_at) %>.
-    It 
+    It
     <% if @object.state == 'Complete' %>
       completed in
     <% elsif @object.state == 'Failed' %>
                   else
                     Time.now - @object.started_at
                   end %>
-    
-    <%= if walltime > runningtime 
-          render_runtime(walltime, true, false) 
-        else 
-          render_runtime(runningtime, true, false) 
+
+    <%= if walltime > runningtime
+          render_runtime(walltime, true, false)
+        else
+          render_runtime(runningtime, true, false)
         end %><% if @object.finished_at %> at <%= render_localized_date(@object.finished_at) %><% end %>.
     <% else %>
       This pipeline is <%= if @object.state.start_with? 'Running' then 'active' else @object.state.downcase end %>.
diff --git a/apps/workbench/app/views/pipeline_instances/_show_recent.html.erb b/apps/workbench/app/views/pipeline_instances/_show_recent.html.erb
index 08b24f13cb03faa1dcb17c1c1830f90aa5706c64..f918e24e9dbab740b8a6300cfdcbe3a94ad0c2fc 100644 (file)
@@ -56,7 +56,7 @@
       <td style="border-top: 0; opacity: 0.5;" colspan="6">
         <% ob.components.each do |cname, c| %>
           <% if c.is_a?(Hash) and c[:job] %>
-            <%= render partial: "job_status_label", locals: {:j => c[:job], :title => cname.to_s } %>
+            <%= render partial: "job_progress", locals: {:j => c[:job], :title => cname.to_s, :show_progress_bar => false } %>
           <% else %>
             <span class="label label-default"><%= cname.to_s %></span>
           <% end %>
diff --git a/apps/workbench/app/views/projects/_show_dashboard.html.erb b/apps/workbench/app/views/projects/_show_dashboard.html.erb
index 9ddd1d59d84c070f5d8ede4ff596954897f49b83..0d71f947c758aca3829326ef4230f280a529fbd3 100644 (file)
 
                 <div class="pull-right" style="width: 40%">
                   <div class="progress" style="margin-bottom: 0px">
-                    <% running = [] %>
-                    <% failed = [] %>
-                    <% completed = [] %>
-                    <% queued = [] %>
                     <% p.components.each do |k, v| %>
                       <% if v.is_a? Hash and v[:job] %>
-                        <% if v[:job][:state] == "Running" %>
-                          <% running << k %>
-                        <% elsif v[:job][:state] == "Failed" or v[:job][:state] == "Cancelled" %>
-                          <% failed << k %>
-                        <% elsif v[:job][:state] == "Complete" %>
-                          <% completed << k %>
-                        <% elsif v[:job][:state] == "Queued" %>
-                          <% queued << k %>
-                        <% end %>
+                        <%= render partial: 'job_progress', locals: {:j => v[:job], :scaleby => (1.0/p.components.size)} %>
                       <% end %>
                     <% end %>
-                    <% completed_pct = (completed.size * 100) / p.components.size %>
-                    <% failed_pct = (failed.size * 100) / p.components.size %>
-                    <% running_pct = (running.size * 100) / p.components.size %>
-                    <% queued_pct = (queued.size * 100) / p.components.size %>
-
-                    <div class="progress-bar progress-bar-success" style="width: <%= completed_pct %>%">
-                      <span class="sr-only"></span>
-                    </div>
-                    <div class="progress-bar progress-bar-danger" style="width: <%= failed_pct %>%">
-                      <span class="sr-only"></span>
-                    </div>
-                    <div class="progress-bar progress-bar-primary" style="width: <%= running_pct %>%">
-                      <span class="sr-only"></span>
-                    </div>
-                    <div class="progress-bar progress-bar-default" style="width: <%= queued_pct %>%">
-                      <span class="sr-only"></span>
-                    </div>
                   </div>
                 </div>
               </div>
 
+              <%
+                running = p.components.select { |k, c| c.is_a? Hash and c[:job].andand[:state] == "Running" }
+                queued = p.components.select { |k, c| c.is_a? Hash and c[:job].andand[:state] == "Queued" }
+                %>
+
               <div class="clearfix">
                 Started at <%= render_localized_date(p[:started_at] || p[:created_at], "noseconds") %>.
                 <% pipeline_time = Time.now - (p[:started_at] || p[:created_at]) %>
                 Active for <%= render_runtime(pipeline_time, false) %>.
 
                 <div class="pull-right">
-                  <% running.each do |k| %>
-                    <span class="label label-primary"><%= k %></span>
+                  <% running.each do |k,v| %>
+                    <%= render partial: 'job_progress', locals: {:j => v[:job], :show_progress_bar => false, :title => k} %>
                   <% end %>
-                  <% queued.each do |k| %>
-                    <span class="label label-default"><%= k %></span>
+                  <% queued.each do |k,v| %>
+                    <%= render partial: 'job_progress', locals: {:j => v[:job], :show_progress_bar => false, :title => k} %>
                   <% end %>
                 </div>
               </div>
diff --git a/apps/workbench/app/views/projects/_show_sharing.html.erb b/apps/workbench/app/views/projects/_show_sharing.html.erb
index cafaf6dae78ad539992cb50516fda73cefae219a..95a7ee100dacf5f495d2ac4c12d86ac795ad99fd 100644 (file)
@@ -61,6 +61,7 @@
 <table id="project_sharing" class="topalign table" style="clear: both; margin-top: 1em;">
   <tr>
     <th>User/Group Name</th>
+    <th>Email Address</th>
     <th colspan="2">Project Access</th>
   </tr>
 
          link_name = shared_with.full_name
        else
          link_name = shared_with.name
-       end %>
+       end
+       if shared_with && shared_with.respond_to?(:email)
+         email = shared_with.email
+       end
+  %>
   <tr data-object-uuid="<%= link.uuid %>">
     <td>
       <i class="fa fa-fw <%= fa_icon_class_for_uuid(link.tail_uuid) %>"></i>
       <%= link_to_if_arvados_object(link.tail_uuid, link_text: link_name) %>
     </td>
+    <td>
+      <%= email %>
+    </td>
     <td><%= link_to perm_name_desc_map[link.name], '#', {
       "data-emptytext" => "Read",
       "data-placement" => "bottom",
diff --git a/apps/workbench/app/views/users/_tables.html.erb b/apps/workbench/app/views/users/_tables.html.erb
index acde5ce8fdd88ee376abfdb294ec1fb225390951..45ca939281431a2aaee8301a80d489d77771e2a0 100644 (file)
@@ -22,7 +22,6 @@
           <th>Log</th>
           <th>Created at</th>
           <th>Status</th>
-          <th>Progress</th>
         </tr>
 
         <%# Preload collections, logs, and pipeline instance objects %>
@@ -96,9 +95,6 @@
   </small>
 </td>
 
-<td>
-  <%= render partial: 'job_status_label', locals: {:j => j} %>
-</td>
 <td>
   <div class="inline-progress-container">
   <%= render partial: 'job_progress', locals: {:j => j} %>
diff --git a/apps/workbench/test/functional/collections_controller_test.rb b/apps/workbench/test/functional/collections_controller_test.rb
index 50577ede8547d32e949a18c97269fc1dda355736..ff53777da42ea35ea243418eb2180f0d05be8617 100644 (file)
@@ -188,6 +188,10 @@ class CollectionsControllerTest < ActionController::TestCase
     fakefiledata.expects(:read).twice.with() do |length|
       # Fail the test if read() is called with length>1MiB:
       length < 2**20
+      ## Force the ActionController::Live thread to lose the race to
+      ## verify that @response.body.length actually waits for the
+      ## response (see below):
+      # sleep 3
     end.returns("foo\n", nil)
     fakefiledata.expects(:close)
     foo_file = api_fixture('collections')['foo_file']
@@ -196,5 +200,10 @@ class CollectionsControllerTest < ActionController::TestCase
       uuid: foo_file['uuid'],
       file: foo_file['manifest_text'].match(/ \d+:\d+:(\S+)/)[1]
     }, session_for(:active)
+    # Wait for the whole response to arrive before deciding whether
+    # mocks' expectations were met. Otherwise, Mocha will fail the
+    # test depending on how slowly the ActionController::Live thread
+    # runs.
+    @response.body.length
   end
 end
diff --git a/apps/workbench/test/functional/projects_controller_test.rb b/apps/workbench/test/functional/projects_controller_test.rb
index d76430cfdf523f246e4a60d03abf3f85e2b9a85c..8eb0cdcf04ec86654bf928c369b3da6d761fc2e9 100644 (file)
@@ -93,6 +93,23 @@ class ProjectsControllerTest < ActionController::TestCase
     refute user_can_manage(:project_viewer, "asubproject")
   end
 
+  test "subproject_admin can_manage asubproject" do
+    assert user_can_manage(:subproject_admin, "asubproject")
+  end
+
+  test "project admin can remove items from the project" do
+    coll_key = "collection_to_remove_from_subproject"
+    coll_uuid = api_fixture("collections")[coll_key]["uuid"]
+    delete(:remove_item,
+           { id: api_fixture("groups")["asubproject"]["uuid"],
+             item_uuid: coll_uuid,
+             format: "js" },
+           session_for(:subproject_admin))
+    assert_response :success
+    assert_match(/\b#{coll_uuid}\b/, @response.body,
+                 "removed object not named in response")
+  end
+
   test 'projects#show tab infinite scroll partial obeys limit' do
     get_contents_rows(limit: 1, filters: [['uuid','is_a',['arvados#job']]])
     assert_response :success
diff --git a/apps/workbench/test/integration/collections_test.rb b/apps/workbench/test/integration/collections_test.rb
index f62f28534bcd2b21f8d2215b81bdf099019fceec..625e4819ecec55b46ad5cc2b113b9754d2f27e90 100644 (file)
@@ -147,4 +147,57 @@ class CollectionsTest < ActionDispatch::IntegrationTest
       headless.stop
     end
   end
+
+  test "combine selected collection files from collection subdirectory" do
+    headless = Headless.new
+    headless.start
+    Capybara.current_driver = :selenium
+
+    visit page_with_token('user1_with_load', "/collections/zzzzz-4zz18-filesinsubdir00")
+
+    # now in collection page
+    input_files = page.all('input[type=checkbox]')
+    (0..input_files.count-1).each do |i|
+      input_files[i].click
+    end
+
+    click_button 'Selection...'
+    within('.selection-action-container') do
+      click_link 'Create new collection with selected files'
+    end
+
+    # now in the newly created collection page
+    assert(page.has_text?('file_in_subdir1'), 'file not found - file_in_subdir1')
+    assert(page.has_text?('file1_in_subdir3.txt'), 'file not found - file1_in_subdir3.txt')
+    assert(page.has_text?('file2_in_subdir3.txt'), 'file not found - file2_in_subdir3.txt')
+    assert(page.has_text?('file1_in_subdir4.txt'), 'file not found - file1_in_subdir4.txt')
+    assert(page.has_text?('file2_in_subdir4.txt'), 'file not found - file2_in_subdir4.txt')
+
+    headless.stop
+  end
+
+  test "Collection portable data hash redirect" do
+    di = api_fixture('collections')['docker_image']
+    visit page_with_token('active', "/collections/#{di['portable_data_hash']}")
+
+    # check redirection
+    assert current_path.end_with?("/collections/#{di['uuid']}")
+    assert page.has_text?("docker_image")
+    assert page.has_text?("Activity")
+    assert page.has_text?("Sharing and permissions")
+  end
+
+  test "Collection portable data hash with multiple matches" do
+    pdh = api_fixture('collections')['baz_file']['portable_data_hash']
+    visit page_with_token('admin', "/collections/#{pdh}")
+
+    matches = api_fixture('collections').select {|k,v| v["portable_data_hash"] == pdh}
+    assert matches.size > 1
+
+    matches.each do |k,v|
+      assert page.has_link?(v["name"]), "Page /collections/#{pdh} should contain link '#{v['name']}'"
+    end
+    assert page.has_no_text?("Activity")
+    assert page.has_no_text?("Sharing and permissions")
+  end
 end
diff --git a/apps/workbench/test/integration/pipeline_instances_test.rb b/apps/workbench/test/integration/pipeline_instances_test.rb
index 3e8663daa8086dccd0122f4bbfb056150c9c0ff7..3d7c34812f21910f8501b129f8c19bab443ac6db 100644 (file)
@@ -83,6 +83,7 @@ class PipelineInstancesTest < ActionDispatch::IntegrationTest
     assert page.has_text? 'Paused'
     page.assert_no_selector 'a.disabled,button.disabled', text: 'Resume'
     page.assert_selector 'a,button', text: 'Re-run with latest'
+    page.assert_selector 'a,button', text: 'Re-run options'
 
     # Since it is test env, no jobs are created to run. So, graph not visible
     assert_not page.has_text? 'Graph'
@@ -113,80 +114,15 @@ class PipelineInstancesTest < ActionDispatch::IntegrationTest
       wait_for_ajax
     end
 
-    # create a pipeline instance
-    find('.btn', text: 'Run a pipeline').click
-    within('.modal-dialog') do
-      find('.selectable', text: 'Two Part Pipeline Template').click
-      find('.btn', text: 'Next: choose inputs').click
-    end
-
-    assert find('p', text: 'Provide a value')
-
-    find('div.form-group', text: 'Foo/bar pair').
-      find('.btn', text: 'Choose').
-      click
-
-    within('.modal-dialog') do
-      assert_selector 'button.dropdown-toggle', text: 'A Project'
-      wait_for_ajax
-      first('span', text: 'foo_tag').click
-      find('button', text: 'OK').click
-    end
-    wait_for_ajax
-
-    # "Run" button present and enabled
-    page.assert_no_selector 'a.disabled,button.disabled', text: 'Run'
-    first('a,button', text: 'Run').click
-
-    # Pipeline is running. We have a "Pause" button instead now.
-    page.assert_no_selector 'a,button', text: 'Run'
-    page.assert_selector 'a,button', text: 'Pause'
-
-    # Since it is test env, no jobs are created to run. So, graph not visible
-    assert_not page.has_text? 'Graph'
+    create_and_run_pipeline_in_aproject true
   end
 
   # Create a pipeline instance from within a project and run
   test 'Run a pipeline from dashboard' do
     visit page_with_token('active_trustedclient')
-
-    # create a pipeline instance
-    find('.btn', text: 'Run a pipeline').click
-    within('.modal-dialog') do
-      find('.selectable', text: 'Two Part Pipeline Template').click
-      find('.btn', text: 'Next: choose inputs').click
-    end
-
-    assert find('p', text: 'Provide a value')
-
-    find('div.form-group', text: 'Foo/bar pair').
-      find('.btn', text: 'Choose').
-      click
-
-    within('.modal-dialog') do
-      assert_selector 'button.dropdown-toggle', text: 'Home'
-      wait_for_ajax
-      click_button "Home"
-      click_link "A Project"
-      wait_for_ajax
-      first('span', text: 'foo_tag').click
-      find('button', text: 'OK').click
-    end
-    wait_for_ajax
-
-    # "Run" button present and enabled
-    page.assert_no_selector 'a.disabled,button.disabled', text: 'Run'
-    first('a,button', text: 'Run').click
-
-    # Pipeline is running. We have a "Pause" button instead now.
-    page.assert_no_selector 'a,button', text: 'Run'
-    page.assert_selector 'a,button', text: 'Pause'
-
-    # Since it is test env, no jobs are created to run. So, graph not visible
-    assert_not page.has_text? 'Graph'
+    create_and_run_pipeline_in_aproject false
   end
 
-
   test 'view pipeline with job and see graph' do
     visit page_with_token('active_trustedclient')
 
@@ -264,4 +200,142 @@ class PipelineInstancesTest < ActionDispatch::IntegrationTest
   test "Workbench preserves search_for parameter after project switch" do
     check_parameter_search("A Project")
   end
+
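+  # Each row below is [user, with_options, choose_options, in_aproject]:
+  # whether to use the "Re-run options" dialog (vs. "Re-run with latest"),
+  # whether to choose "Copy and edit inputs" (vs. "Run now"), and whether
+  # the pipeline is created inside A Project.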
+  [
+    ['active', false, false, false],
+    ['active', false, false, true],
+    ['active', true, false, false],
+    ['active', true, true, false],
+    ['active', true, false, true],
+    ['active', true, true, true],
+    ['project_viewer', false, false, true],
+    ['project_viewer', true, false, true],
+    ['project_viewer', true, true, true],
+  ].each do |user, with_options, choose_options, in_aproject|
+    test "Rerun pipeline instance as #{user} using options #{with_options} #{choose_options} in #{in_aproject}" do
+      visit page_with_token('active')
+
+      if in_aproject
+        find("#projects-menu").click
+        find('.dropdown-menu a,button', text: 'A Project').click
+      end
+
+      create_and_run_pipeline_in_aproject in_aproject
+      instance_path = current_path
+
+      # Pause the pipeline
+      find('a,button', text: 'Pause').click
+      assert page.has_text? 'Paused'
+      page.assert_no_selector 'a.disabled,button.disabled', text: 'Resume'
+      page.assert_selector 'a,button', text: 'Re-run with latest'
+      page.assert_selector 'a,button', text: 'Re-run options'
+
+      # Pipeline can be re-run now. Access it as the specified user, and re-run
+      if user == 'project_viewer'
+        visit page_with_token(user, instance_path)
+        assert page.has_text? 'A Project'
+        page.assert_no_selector 'a.disabled,button.disabled', text: 'Resume'
+        page.assert_selector 'a,button', text: 'Re-run with latest'
+        page.assert_selector 'a,button', text: 'Re-run options'
+      end
+
+      # Now re-run the pipeline
+      if with_options
+        find('a,button', text: 'Re-run options').click
+        within('.modal-dialog') do
+          page.assert_selector 'a,button', text: 'Copy and edit inputs'
+          page.assert_selector 'a,button', text: 'Run now'
+          if choose_options
+            find('button', text: 'Copy and edit inputs').click
+          else
+            find('button', text: 'Run now').click
+          end
+        end
+      else
+        find('a,button', text: 'Re-run with latest').click
+      end
+
+      # Verify that the newly created instance is created in the right project.
+      # The project_viewer user cannot write to the project, so in that case
+      # the new instance should have been created in the user's Home project.
+      rerun_instance_path = current_path
+      assert_not_equal instance_path, rerun_instance_path, 'Rerun instance path expected to be different'
+      assert page.has_text? 'Home'
+      if in_aproject && (user != 'project_viewer')
+        assert page.has_text? 'A Project'
+      else
+        assert page.has_no_text? 'A Project'
+      end
+    end
+  end
+
+  # Create and run a pipeline for 'Two Part Pipeline Template' in 'A Project'
+  def create_and_run_pipeline_in_aproject in_aproject
+    # create a pipeline instance
+    find('.btn', text: 'Run a pipeline').click
+    within('.modal-dialog') do
+      find('.selectable', text: 'Two Part Pipeline Template').click
+      find('.btn', text: 'Next: choose inputs').click
+    end
+
+    assert find('p', text: 'Provide a value')
+
+    find('div.form-group', text: 'Foo/bar pair').
+      find('.btn', text: 'Choose').
+      click
+
+    within('.modal-dialog') do
+      if in_aproject
+        assert_selector 'button.dropdown-toggle', text: 'A Project'
+        wait_for_ajax
+      else
+        assert_selector 'button.dropdown-toggle', text: 'Home'
+        wait_for_ajax
+        click_button "Home"
+        click_link "A Project"
+        wait_for_ajax
+      end
+      first('span', text: 'foo_tag').click
+      find('button', text: 'OK').click
+    end
+    wait_for_ajax
+
+    # "Run" button present and enabled
+    page.assert_no_selector 'a.disabled,button.disabled', text: 'Run'
+    first('a,button', text: 'Run').click
+
+    # Pipeline is running. We have a "Pause" button instead now.
+    page.assert_no_selector 'a,button', text: 'Run'
+    page.assert_no_selector 'a.disabled,button.disabled', text: 'Resume'
+    page.assert_selector 'a,button', text: 'Pause'
+
+    # Since it is test env, no jobs are created to run. So, graph not visible
+    assert_not page.has_text? 'Graph'
+  end
+
+  [
+    [0, 0], # run time 0 minutes
+    [9, 17*60*60 + 51*60], # run time 17 hours and 51 minutes
+  ].each do |index, run_time|
+    test "pipeline start and finish time display #{index}" do
+      visit page_with_token("user1_with_load", "/pipeline_instances/zzzzz-d1hrv-10pipelines0#{index.to_s.rjust(3, '0')}")
+
+      assert page.has_text? 'This pipeline started at'
+      page_text = page.text
+
+      match = /This pipeline started at (.*)\. It failed after (.*) seconds at (.*)\. Check the Log/.match page_text
+      assert_not_nil(match, 'Did not find text - This pipeline started at . . . ')
+
+      start_at = match[1]
+      finished_at = match[3]
+      assert_not_nil(start_at, 'Did not find start_at time')
+      assert_not_nil(finished_at, 'Did not find finished_at time')
+
+      # start and finished time display is of the format '2:20 PM 10/20/2014'
+      start_time = DateTime.strptime(start_at, '%H:%M %p %m/%d/%Y').to_time
+      finished_time = DateTime.strptime(finished_at, '%H:%M %p %m/%d/%Y').to_time
+      assert_equal(run_time, finished_time-start_time,
+        "Time difference did not match for start_at #{start_at}, finished_at #{finished_at}, ran_for #{match[2]}")
+    end
+  end
 end
index 83565f939dbeef6a2ab6afdbea375b286cc77d45..66b62c3daead2bb27664498090e8a8d7c998ac91 100644 (file)
@@ -182,7 +182,7 @@ class ProjectsTest < ActionDispatch::IntegrationTest
     find('#project_sharing').all('tr')
   end
 
-  def add_share_and_check(share_type, name)
+  def add_share_and_check(share_type, name, obj=nil)
     assert(page.has_no_text?(name), "project is already shared with #{name}")
     start_share_count = share_rows.size
     click_on("Share with #{share_type}")
@@ -194,6 +194,9 @@ class ProjectsTest < ActionDispatch::IntegrationTest
       find(".selectable", text: name).click
       assert(has_no_selector?(".modal-dialog-preview-pane"),
              "preview pane available in sharing dialog")
+      if share_type == 'users' and obj and obj['email']
+        assert(page.has_text?(obj['email']), "Did not find user's email")
+      end
       assert_raises(Capybara::ElementNotFound,
                     "Projects pulldown available from sharing dialog") do
         click_on "All projects"
@@ -240,7 +243,7 @@ class ProjectsTest < ActionDispatch::IntegrationTest
 
     show_project_using("active")
     click_on "Sharing"
-    add_share_and_check("users", new_name)
+    add_share_and_check("users", new_name, add_user)
     modify_share_and_check(new_name)
   end
 
@@ -485,4 +488,146 @@ class ProjectsTest < ActionDispatch::IntegrationTest
     end
   end
 
+  [
+    ['project with 10 collections', 10],
+    ['project with 201 collections', 201], # two pages of data
+  ].each do |project_name, amount|
+    test "scroll collections tab for #{project_name} with #{amount} objects" do
+      headless = Headless.new
+      headless.start
+      Capybara.current_driver = :selenium
+
+      visit page_with_token 'user1_with_load'
+
+      find("#projects-menu").click
+      find(".dropdown-menu a", text: project_name).click
+
+      my_collections = []
+      for i in 1..amount
+        my_collections << "Collection_#{i}"
+      end
+
+      # verify Data collections scroll
+      assert(page.has_text?("Data collections (#{amount})"), "Number of collections did not match the input amount")
+
+      click_link 'Data collections'
+      begin
+        wait_for_ajax
+      rescue
+      end
+
+      verify_collections = my_collections.dup
+      unexpected_items = []
+      collections_count = 0
+      within('.arv-project-Data_collections') do
+        page.execute_script "window.scrollBy(0,999000)"
+        begin
+          wait_for_ajax
+        rescue
+        end
+
+        # Visit all rows. If not all expected collections are found, retry
+        found_collections = page.all('tr[data-kind="arvados#collection"]')
+        collections_count = found_collections.count
+
+        (0..collections_count-1).each do |i|
+          # Found row text would be of the format "Show Collection_#{n} "
+          collection_name = found_collections[i].text.split[1]
+          if !my_collections.include? collection_name
+            unexpected_items << collection_name
+          else
+            verify_collections.delete collection_name
+          end
+        end
+
+        assert_equal true, unexpected_items.empty?, "Found unexpected items #{unexpected_items.inspect}"
+        assert_equal amount, collections_count, "Found different number of collections"
+        assert_equal true, verify_collections.empty?, "Did not find all the collections"
+      end
+    end
+  end
+
+  [
+    ['project with 10 pipelines', 10, 0],
+#    ['project with 200 jobs and 10 pipelines', 2, 200],
+    ['project with 25 pipelines', 25, 0],
+  ].each do |project_name, num_pipelines, num_jobs|
+    test "scroll pipeline instances tab for #{project_name} with #{num_pipelines} pipelines and #{num_jobs} jobs" do
+      headless = Headless.new
+      headless.start
+      Capybara.current_driver = :selenium
+
+      visit page_with_token 'user1_with_load'
+
+      find("#projects-menu").click
+      find(".dropdown-menu a", text: project_name).click
+
+      my_pipelines = []
+      (0..num_pipelines-1).each do |i|
+        name = "pipeline_#{i}"
+        my_pipelines << name
+      end
+
+      # verify Jobs and pipelines tab scroll
+      assert(page.has_text?("Jobs and pipelines (#{num_pipelines+num_jobs})"), "Number of objects did not match the input counts")
+      click_link 'Jobs and pipelines'
+      begin
+        wait_for_ajax
+      rescue
+      end
+
+      verify_pipelines = my_pipelines.dup
+      unexpected_items = []
+      object_count = 0
+      within('.arv-project-Jobs_and_pipelines') do
+        page.execute_script "window.scrollBy(0,999000)"
+        begin
+          wait_for_ajax
+        rescue
+        end
+
+        # Visit all rows. Repeat if not all expected my_pipelines are found (infinite scrolling should kick in)
+        pipelines_found = page.all('tr[data-kind="arvados#pipelineInstance"]')
+        found_pipeline_count = pipelines_found.count
+        (0..found_pipeline_count-1).each do |i|
+          name = pipelines_found[i].text.split[1]
+          if !my_pipelines.include? name
+            unexpected_items << name
+          else
+            verify_pipelines.delete name
+          end
+
+          assert_equal true, unexpected_items.empty?, "Found unexpected items #{unexpected_items.inspect}"
+        end
+
+        jobs_found = page.all('tr[data-kind="arvados#job"]')
+        found_job_count = jobs_found.count
+
+        assert_equal num_pipelines, found_pipeline_count, "Found different number of pipelines"
+        assert_equal num_jobs, found_job_count, 'Did not find expected number of jobs'
+        assert_equal true, verify_pipelines.empty?, "Did not find all the pipelines and jobs"
+      end
+    end
+  end
+
+  # Move button accessibility
+  [
+    ['admin', true],
+    ['active', true],  # project owner
+    ['project_viewer', false],
+    ].each do |user, can_move|
+    test "#{user} can move subproject under another user's Home #{can_move}" do
+      project = api_fixture('groups')['aproject']
+      collection = api_fixture('collections')['collection_to_move_around_in_aproject']
+
+      # verify the project move button
+      visit page_with_token user, "/projects/#{project['uuid']}"
+      if can_move
+        assert page.has_link? 'Move project...'
+      else
+        assert page.has_no_link? 'Move project...'
+      end
+    end
+  end
+
 end
index b5bf2415c252eeca5b83db764e1a5d2cda65876d..58432f7d5e13108115802afce78ad2ee884dd04a 100644 (file)
@@ -70,7 +70,7 @@ class UsersTest < ActionDispatch::IntegrationTest
 
     # verify that the new user showed up in the users page and find
     # the new user's UUID
-    new_user_uuid = 
+    new_user_uuid =
       find('tr[data-object-uuid]', text: 'foo@example.com')['data-object-uuid']
     assert new_user_uuid, "Expected new user uuid not found"
 
@@ -106,6 +106,7 @@ class UsersTest < ActionDispatch::IntegrationTest
     find('tr', text: 'zzzzz-tpzed-xurymjxw79nv3jz').
       find('a', text: 'Show').
       click
+    user_url = page.current_url
 
     # Setup user
     click_link 'Admin'
@@ -119,6 +120,7 @@ class UsersTest < ActionDispatch::IntegrationTest
       click_button "Submit"
     end
 
+    visit user_url
     assert page.has_text? 'modified_by_client_uuid'
 
     click_link 'Advanced'
@@ -136,6 +138,7 @@ class UsersTest < ActionDispatch::IntegrationTest
       click_button "Submit"
     end
 
+    visit user_url
     find '#Attributes', text: 'modified_by_client_uuid'
 
     click_link 'Advanced'
@@ -161,6 +164,7 @@ class UsersTest < ActionDispatch::IntegrationTest
     find('tr', text: 'zzzzz-tpzed-xurymjxw79nv3jz').
       find('a', text: 'Show').
       click
+    user_url = page.current_url
 
     # Verify that is_active is set
     find('a,button', text: 'Attributes').click
@@ -202,6 +206,7 @@ class UsersTest < ActionDispatch::IntegrationTest
       click_button "Submit"
     end
 
+    visit user_url
     assert page.has_text? 'modified_by_client_uuid'
 
     click_link 'Advanced'
index 5253676578e18b4eaf0692db37c1358b34c092d3..89341a6323ed5742a02bc457dfd4004c2872e9dd 100644 (file)
@@ -38,7 +38,10 @@ class ActiveSupport::TestCase
 
   teardown do
     Thread.current[:arvados_api_token] = nil
+    Thread.current[:user] = nil
     Thread.current[:reader_tokens] = nil
+    # Diagnostics suite doesn't run a server, so there's no cache to clear.
+    Rails.cache.clear unless (Rails.env == "diagnostics")
     # Restore configuration settings changed during tests
     $application_config.each do |k,v|
       if k.match /^[^.]*$/
@@ -55,18 +58,27 @@ module ApiFixtureLoader
 
   module ClassMethods
     @@api_fixtures = {}
-    def api_fixture(name)
+    def api_fixture(name, *keys)
       # Returns the data structure from the named API server test fixture.
       @@api_fixtures[name] ||= \
       begin
         path = File.join(ApiServerForTests::ARV_API_SERVER_DIR,
                          'test', 'fixtures', "#{name}.yml")
-        YAML.load(IO.read(path))
+        file = IO.read(path)
+        trim_index = file.index('# Test Helper trims the rest of the file')
+        file = file[0, trim_index] if trim_index
+        YAML.load(file)
       end
+      keys.inject(@@api_fixtures[name]) { |hash, key| hash[key] }
     end
   end
-  def api_fixture name
-    self.class.api_fixture name
+  def api_fixture(name, *keys)
+    self.class.api_fixture(name, *keys)
+  end
+
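+  # Instantiate a model object from a named API fixture.  For example,
+  # find_fixture(Collection, "foo_file") looks up the "uuid" of the
+  # "foo_file" entry in the collections fixture file.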
+  def find_fixture(object_class, name)
+    object_class.find(api_fixture(object_class.to_s.pluralize.underscore,
+                                  name, "uuid"))
   end
 end
 
index 512ad47c34dc3c5a12e7b92b40af102534c4355e..e71f9667ec97e75ac9960bed17636da618bbc02d 100644 (file)
@@ -40,4 +40,35 @@ class CollectionTest < ActiveSupport::TestCase
                  get_files_tree('multilevel_collection_2'),
                  "Collection file tree was malformed")
   end
+
+  test "portable_data_hash never editable" do
+    refute(Collection.new.attribute_editable?("portable_data_hash", :ever))
+  end
+
+  test "admin can edit name" do
+    use_token :admin
+    assert(find_fixture(Collection, "foo_file").attribute_editable?("name"),
+           "admin not allowed to edit collection name")
+  end
+
+  test "project owner can edit name" do
+    use_token :active
+    assert(find_fixture(Collection, "foo_collection_in_aproject")
+             .attribute_editable?("name"),
+           "project owner not allowed to edit collection name")
+  end
+
+  test "project admin can edit name" do
+    use_token :subproject_admin
+    assert(find_fixture(Collection, "baz_file_in_asubproject")
+             .attribute_editable?("name"),
+           "project admin not allowed to edit collection name")
+  end
+
+  test "project viewer cannot edit name" do
+    use_token :project_viewer
+    refute(find_fixture(Collection, "foo_collection_in_aproject")
+             .attribute_editable?("name"),
+           "project viewer allowed to edit collection name")
+  end
 end
index 3f5cebc9551823fb88c92f5cf09e17c9a188f2ee..4a4530ca5694114f465f50ab0dec2250916a90ea 100644 (file)
@@ -25,4 +25,16 @@ class GroupTest < ActiveSupport::TestCase
       assert_nil user.owner_uuid
     end
   end
+
+  test "project editable by its admin" do
+    use_token :subproject_admin
+    project = Group.find(api_fixture("groups")["asubproject"]["uuid"])
+    assert(project.editable?, "project not editable by admin")
+  end
+
+  test "project not editable by reader" do
+    use_token :project_viewer
+    project = Group.find(api_fixture("groups")["aproject"]["uuid"])
+    refute(project.editable?, "project editable by reader")
+  end
 end
index 5079316934eac12b586c564b52a8a8194257d85c..add4c0fd55b096a3dc829238ba8cfaefc7573ba7 100644 (file)
@@ -1,7 +1,31 @@
 require 'test_helper'
 
 class JobTest < ActiveSupport::TestCase
-  # test "the truth" do
-  #   assert true
-  # end
+  test "admin can edit description" do
+    use_token :admin
+    assert(find_fixture(Job, "job_in_subproject")
+             .attribute_editable?("description"),
+           "admin not allowed to edit job description")
+  end
+
+  test "project owner can edit description" do
+    use_token :active
+    assert(find_fixture(Job, "job_in_subproject")
+             .attribute_editable?("description"),
+           "project owner not allowed to edit job description")
+  end
+
+  test "project admin can edit description" do
+    use_token :subproject_admin
+    assert(find_fixture(Job, "job_in_subproject")
+             .attribute_editable?("description"),
+           "project admin not allowed to edit job description")
+  end
+
+  test "project viewer cannot edit description" do
+    use_token :project_viewer
+    refute(find_fixture(Job, "job_in_subproject")
+             .attribute_editable?("description"),
+           "project viewer allowed to edit job description")
+  end
 end
index 9b4c7c3787b26aa11545e402b6bc2c847fd86cc4..95ad8fa7cd11bf4abaabd7d04712945071255d28 100644 (file)
@@ -1,7 +1,31 @@
 require 'test_helper'
 
 class PipelineInstanceTest < ActiveSupport::TestCase
-  # test "the truth" do
-  #   assert true
-  # end
+  test "admin can edit name" do
+    use_token :admin
+    assert(find_fixture(PipelineInstance, "new_pipeline_in_subproject")
+             .attribute_editable?("name"),
+           "admin not allowed to edit pipeline instance name")
+  end
+
+  test "project owner can edit name" do
+    use_token :active
+    assert(find_fixture(PipelineInstance, "new_pipeline_in_subproject")
+             .attribute_editable?("name"),
+           "project owner not allowed to edit pipeline instance name")
+  end
+
+  test "project admin can edit name" do
+    use_token :subproject_admin
+    assert(find_fixture(PipelineInstance, "new_pipeline_in_subproject")
+             .attribute_editable?("name"),
+           "project admin not allowed to edit pipeline instance name")
+  end
+
+  test "project viewer cannot edit name" do
+    use_token :project_viewer
+    refute(find_fixture(PipelineInstance, "new_pipeline_in_subproject")
+             .attribute_editable?("name"),
+           "project viewer allowed to edit pipeline instance name")
+  end
 end
index ff3486354f325bd478c637c29227ea012c6c0766..06ef6c1198ffad07d888040794a42dce08e8af16 100644 (file)
@@ -1,5 +1,6 @@
 import os
 import glob
+import stat
 
 class SubstitutionError(Exception):
     pass
@@ -35,13 +36,23 @@ def search(c):
     return None
 
 def sub_file(v):
-    return os.path.join(os.environ['TASK_KEEPMOUNT'], v)
+    path = os.path.join(os.environ['TASK_KEEPMOUNT'], v)
+    try:
+        st = os.stat(path)
+    except OSError:
+        st = None
+    if st and stat.S_ISREG(st.st_mode):
+        return path
+    else:
+        raise SubstitutionError("$(file {}) is not accessible or is not a regular file".format(path))
 
 def sub_dir(v):
     d = os.path.dirname(v)
     if d == '':
         d = v
-    return os.path.join(os.environ['TASK_KEEPMOUNT'], d)
+    path = os.path.join(os.environ['TASK_KEEPMOUNT'], d)
+    try:
+        st = os.stat(path)
+    except OSError:
+        st = None
+    if st and stat.S_ISDIR(st.st_mode):
+        return path
+    else:
+        raise SubstitutionError("$(dir {}) is not accessible or is not a directory".format(path))
 
 def sub_basename(v):
     return os.path.splitext(os.path.basename(v))[0]
@@ -49,7 +60,7 @@ def sub_basename(v):
 def sub_glob(v):
     l = glob.glob(v)
     if len(l) == 0:
-        raise SubstitutionError("$(glob): No match on '%s'" % v)
+        raise SubstitutionError("$(glob {}) no match fonud".format(v))
     else:
         return l[0]
 
index c5fbcdfc024f5fd3c6dac20e2a8dc89ec5168a66..c1e747506091863a11b028656b662e50f484ef5b 100755 (executable)
@@ -25,25 +25,40 @@ import pprint
 import multiprocessing
 import crunchutil.robust_put as robust_put
 import crunchutil.vwd as vwd
+import argparse
+import json
+import tempfile
 
-os.umask(0077)
-
-t = arvados.current_task().tmpdir
+parser = argparse.ArgumentParser()
+parser.add_argument('--dry-run', action='store_true')
+parser.add_argument('--script-parameters', type=str, default="{}")
+args = parser.parse_args()
 
-api = arvados.api('v1')
+os.umask(0077)
 
-os.chdir(arvados.current_task().tmpdir)
-os.mkdir("tmpdir")
-os.mkdir("output")
+if not args.dry_run:
+    api = arvados.api('v1')
+    t = arvados.current_task().tmpdir
+    os.chdir(arvados.current_task().tmpdir)
+    os.mkdir("tmpdir")
+    os.mkdir("output")
 
-os.chdir("output")
+    os.chdir("output")
 
-outdir = os.getcwd()
+    outdir = os.getcwd()
 
-taskp = None
-jobp = arvados.current_job()['script_parameters']
-if len(arvados.current_task()['parameters']) > 0:
-    taskp = arvados.current_task()['parameters']
+    taskp = None
+    jobp = arvados.current_job()['script_parameters']
+    if len(arvados.current_task()['parameters']) > 0:
+        taskp = arvados.current_task()['parameters']
+else:
+    outdir = "/tmp"
+    jobp = json.loads(args.script_parameters)
+    os.environ['JOB_UUID'] = 'zzzzz-8i9sb-1234567890abcde'
+    os.environ['TASK_UUID'] = 'zzzzz-ot0gb-1234567890abcde'
+    os.environ['CRUNCH_SRC'] = '/tmp/crunch-src'
+    if 'TASK_KEEPMOUNT' not in os.environ:
+        os.environ['TASK_KEEPMOUNT'] = '/keep'
 
 links = []
 
@@ -80,6 +95,12 @@ class SigHandler(object):
         sp.send_signal(signum)
         self.sig = signum
 
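+# Collect regex match objects in gr, keyed by their capture-group tuple, so
+# that items whose captured groups are identical end up in the same list.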
+def add_to_group(gr, match):
+    m = match.groups()
+    if m not in gr:
+        gr[m] = []
+    gr[m].append(match.group(0))
+
 def expand_item(p, c):
     if isinstance(c, dict):
         if "foreach" in c and "command" in c:
@@ -91,17 +112,50 @@ def expand_item(p, c):
                 params[var] = i
                 r.extend(expand_list(params, c["command"]))
             return r
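+        # {"list": var, "index": i, "command": ...} expands the command with
+        # the i-th item of the list parameter named by var.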
+        if "list" in c and "index" in c and "command" in c:
+            var = c["list"]
+            items = get_items(p, p[var])
+            params = copy.copy(p)
+            params[var] = items[int(c["index"])]
+            return expand_list(params, c["command"])
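+        # {"regex": pattern} combined with "filter", "group" or "extract"
+        # keeps only matching items, groups them by their capture groups, or
+        # returns the captured substrings, respectively.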
+        if "regex" in c:
+            pattern = re.compile(c["regex"])
+            if "filter" in c:
+                items = get_items(p, p[c["filter"]])
+                return [i for i in items if pattern.match(i)]
+            elif "group" in c:
+                items = get_items(p, p[c["group"]])
+                groups = {}
+                for i in items:
+                    match = pattern.match(i)
+                    if match:
+                        add_to_group(groups, match)
+                return [groups[k] for k in groups]
+            elif "extract" in c:
+                items = get_items(p, p[c["extract"]])
+                r = []
+                for i in items:
+                    match = pattern.match(i)
+                    if match:
+                        r.append(list(match.groups()))
+                return r
     elif isinstance(c, list):
         return expand_list(p, c)
-    elif isinstance(c, str) or isinstance(c, unicode):
+    elif isinstance(c, basestring):
         return [subst.do_substitution(p, c)]
 
     return []
 
 def expand_list(p, l):
-    return [exp for arg in l for exp in expand_item(p, arg)]
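+    # Accept either a bare string (expanded as a single item) or a list.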
+    if isinstance(l, basestring):
+        return expand_item(p, l)
+    else:
+        return [exp for arg in l for exp in expand_item(p, arg)]
 
 def get_items(p, value):
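+    # A dict here is a list item function such as a "regex" directive.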
+    if isinstance(value, dict):
+        return expand_item(p, value)
+
     if isinstance(value, list):
         return expand_list(p, value)
 
@@ -110,10 +164,10 @@ def get_items(p, value):
     prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
     if mode is not None:
         if stat.S_ISDIR(mode):
-            items = ["$(dir %s/%s/)" % (prefix, l) for l in os.listdir(fn)]
+            items = [os.path.join(fn, l) for l in os.listdir(fn)]
         elif stat.S_ISREG(mode):
             with open(fn) as f:
-                items = [line for line in f]
+                items = [line.rstrip("\r\n") for line in f]
         return items
     else:
         return None
@@ -124,53 +178,78 @@ stdinname = None
 stdinfile = None
 rcode = 1
 
-try:
-    if "task.foreach" in jobp:
-        if arvados.current_task()['sequence'] == 0:
-            var = jobp["task.foreach"]
-            items = get_items(jobp, jobp[var])
-            logger.info("parallelizing on %s with items %s" % (var, items))
-            if items is not None:
-                for i in items:
-                    params = copy.copy(jobp)
-                    params[var] = i
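+# Expand one or more task.foreach variables, creating a new job task (or, in
+# dry-run mode, logging the expanded command line) for each combination of
+# items.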
+def recursive_foreach(params, fvars):
+    var = fvars[0]
+    fvars = fvars[1:]
+    items = get_items(params, params[var])
+    logger.info("parallelizing on %s with items %s" % (var, items))
+    if items is not None:
+        for i in items:
+            params = copy.copy(params)
+            params[var] = i
+            if len(fvars) > 0:
+                recursive_foreach(params, fvars)
+            else:
+                if not args.dry_run:
                     arvados.api().job_tasks().create(body={
                         'job_uuid': arvados.current_job()['uuid'],
                         'created_by_job_task_uuid': arvados.current_task()['uuid'],
                         'sequence': 1,
                         'parameters': params
-                        }
-                    ).execute()
+                    }).execute()
+                else:
+                    logger.info(expand_list(params, params["command"]))
+    else:
+        logger.error("parameter %s with value %s in task.foreach yielded no items" % (var, params[var]))
+        sys.exit(1)
+
+try:
+    if "task.foreach" in jobp:
+        if args.dry_run or arvados.current_task()['sequence'] == 0:
+            # This is the first task; it queues the other tasks, then exits
+            fvars = jobp["task.foreach"]
+            if isinstance(fvars, basestring):
+                fvars = [fvars]
+            if not isinstance(fvars, list) or len(fvars) == 0:
+                logger.error("value of task.foreach must be a string or non-empty list")
+                sys.exit(1)
+            recursive_foreach(jobp, jobp["task.foreach"])
+            if not args.dry_run:
                 if "task.vwd" in jobp:
-                    # Base vwd collection will be merged with output fragments from
-                    # the other tasks by crunch.
+                    # Set output of the first task to the base vwd collection so it
+                    # will be merged with output fragments from the other tasks by
+                    # crunch.
                     arvados.current_task().set_output(subst.do_substitution(jobp, jobp["task.vwd"]))
                 else:
                     arvados.current_task().set_output(None)
-                sys.exit(0)
-            else:
-                sys.exit(1)
+            sys.exit(0)
     else:
+        # This is the only task so taskp/jobp are the same
         taskp = jobp
 
-    if "task.vwd" in taskp:
-        # Populate output directory with symlinks to files in collection
-        vwd.checkout(subst.do_substitution(taskp, taskp["task.vwd"]), outdir)
+    if not args.dry_run:
+        if "task.vwd" in taskp:
+            # Populate output directory with symlinks to files in collection
+            vwd.checkout(subst.do_substitution(taskp, taskp["task.vwd"]), outdir)
 
-    if "task.cwd" in taskp:
-        os.chdir(subst.do_substitution(taskp, taskp["task.cwd"]))
+        if "task.cwd" in taskp:
+            os.chdir(subst.do_substitution(taskp, taskp["task.cwd"]))
 
     cmd = expand_list(taskp, taskp["command"])
 
-    if "task.stdin" in taskp:
-        stdinname = subst.do_substitution(taskp, taskp["task.stdin"])
-        stdinfile = open(stdinname, "rb")
+    if not args.dry_run:
+        if "task.stdin" in taskp:
+            stdinname = subst.do_substitution(taskp, taskp["task.stdin"])
+            stdinfile = open(stdinname, "rb")
 
-    if "task.stdout" in taskp:
-        stdoutname = subst.do_substitution(taskp, taskp["task.stdout"])
-        stdoutfile = open(stdoutname, "wb")
+        if "task.stdout" in taskp:
+            stdoutname = subst.do_substitution(taskp, taskp["task.stdout"])
+            stdoutfile = open(stdoutname, "wb")
 
     logger.info("{}{}{}".format(' '.join(cmd), (" < " + stdinname) if stdinname is not None else "", (" > " + stdoutname) if stdoutname is not None else ""))
+
+    if args.dry_run:
+        sys.exit(0)
 except subst.SubstitutionError as e:
     logger.error(str(e))
     logger.error("task parameters were:")
index 3301207250710e9d706d4177c2f12caa3ec257f9..3b31cb054396abd5f878e599eee26d0eb9b8a3b9 100644 (file)
@@ -39,6 +39,10 @@ navbar:
       - user/tutorials/tutorial-firstscript.html.textile.liquid
       - user/tutorials/tutorial-submit-job.html.textile.liquid
       - user/topics/tutorial-parallel.html.textile.liquid
+      - user/topics/arv-docker.html.textile.liquid
+    - Reference:
+      - user/topics/run-command.html.textile.liquid
+      - user/reference/job-pipeline-ref.html.textile.liquid
       - user/examples/crunch-examples.html.textile.liquid
     - Query the metadata database:
       - user/topics/tutorial-trait-search.html.textile.liquid
diff --git a/doc/_includes/_example_docker.liquid b/doc/_includes/_example_docker.liquid
new file mode 100644 (file)
index 0000000..9486ad7
--- /dev/null
@@ -0,0 +1,28 @@
+{
+    "name": "Example using R in a custom Docker image",
+    "components": {
+        "Rscript": {
+            "script": "run-command",
+            "script_version": "master",
+            "repository": "arvados",
+            "script_parameters": {
+                "command": [
+                    "Rscript",
+                    "$(glob $(file $(myscript))/*.r)",
+                    "$(glob $(dir $(mydata))/*.csv)"
+                ],
+                "myscript": {
+                    "required": true,
+                    "dataclass": "Collection"
+                },
+                "mydata": {
+                    "required": true,
+                    "dataclass": "Collection"
+                }
+            },
+            "runtime_constraints": {
+                "docker_image": "arvados/jobs-with-r"
+            }
+        }
+    }
+}
diff --git a/doc/_includes/_run_command_foreach_example.liquid b/doc/_includes/_run_command_foreach_example.liquid
new file mode 100644 (file)
index 0000000..3fb754f
--- /dev/null
@@ -0,0 +1,40 @@
+{
+    "name":"run-command example pipeline",
+    "components":{
+        "bwa-mem": {
+            "script": "run-command",
+            "script_version": "master",
+            "repository": "arvados",
+            "script_parameters": {
+                "command": [
+                    "bwa",
+                    "mem",
+                    "-t",
+                    "$(node.cores)",
+                    "$(glob $(dir $(reference_collection))/*.fasta)",
+                    {
+                        "foreach": "read_pair",
+                        "command": "$(read_pair)"
+                    }
+                ],
+                "task.stdout": "$(basename $(glob $(dir $(sample))/*_1.fastq)).sam",
+                "task.foreach": ["sample_subdir", "read_pair"],
+                "reference_collection": {
+                    "required": true,
+                    "dataclass": "Collection"
+                },
+                "sample": {
+                    "required": true,
+                    "dataclass": "Collection"
+                },
+                "sample_subdir": "$(dir $(samples))",
+                "read_pair": {
+                    "value": {
+                        "group": "sample_subdir",
+                        "regex": "(.*)_[12]\\.fastq(\\.gz)?$"
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/doc/_includes/_run_command_simple_example.liquid b/doc/_includes/_run_command_simple_example.liquid
new file mode 100644 (file)
index 0000000..abd0071
--- /dev/null
@@ -0,0 +1,30 @@
+{
+    "name":"run-command example pipeline",
+    "components":{
+        "bwa-mem": {
+            "script": "run-command",
+            "script_version": "master",
+            "repository": "arvados",
+            "script_parameters": {
+                "command": [
+                    "bwa",
+                    "mem",
+                    "-t",
+                    "$(node.cores)",
+                    "$(glob $(dir $(reference_collection))/*.fasta)",
+                    "$(glob $(dir $(sample))/*_1.fastq)",
+                    "$(glob $(dir $(sample))/*_2.fastq)"
+                ],
+                "task.stdout": "$(basename $(glob $(dir $(sample))/*_1.fastq)).sam",
+                "reference_collection": {
+                    "required": true,
+                    "dataclass": "Collection"
+                },
+                "sample": {
+                    "required": true,
+                    "dataclass": "Collection"
+                }
+            }
+        }
+    }
+}
diff --git a/doc/_includes/_skip_sso_server_install.liquid b/doc/_includes/_skip_sso_server_install.liquid
new file mode 100644 (file)
index 0000000..a5c1511
--- /dev/null
@@ -0,0 +1,6 @@
+<div class="alert alert-block alert-info">
+  <button type="button" class="close" data-dismiss="alert">&times;</button>
+  <h4>Note!</h4>
+  <p>The SSO server codebase currently uses OpenID 2.0 to talk to Google's authentication service. Google <a href="https://developers.google.com/accounts/docs/OpenID2">has deprecated that protocol</a>. This means that new clients will no longer be allowed to talk to Google's authentication services over OpenID 2.0, and Google will phase out OpenID 2.0 completely in the coming months. We are working on upgrading the SSO server codebase to a newer protocol. That work should be complete by the end of November 2014. In the meantime, anyone is free to use the existing Curoverse SSO server for any local Arvados installation. Instructions to do so are provided on the "API server":install-api-server.html page.</p>
+  <p><strong>Recommendation: skip this step</strong></p>
+</div>
diff --git a/doc/_includes/_tutorial_submit_job.liquid b/doc/_includes/_tutorial_submit_job.liquid
new file mode 100644 (file)
index 0000000..57063b3
--- /dev/null
@@ -0,0 +1,19 @@
+{
+  "name":"My md5 pipeline",
+  "components":{
+    "do_hash":{
+      "repository":"$USER",
+      "script":"hash.py",
+      "script_version":"master",
+      "runtime_constraints":{
+        "docker_image":"arvados/jobs-java-bwa-samtools"
+      },
+      "script_parameters":{
+        "input":{
+          "required": true,
+          "dataclass": "Collection"
+        }
+      }
+    }
+  }
+}
index b2c7924382bdf9aea3079522f9fa144ca6b97464..7aa5896ea5314e3535182fbe5ba194902d86bc43 100644 (file)
@@ -21,7 +21,7 @@ Arguments:
 
 table(table table-bordered table-condensed).
 |_. Argument |_. Type |_. Description |_. Location |_. Example |
-|node|object||query||
+{background:#ccffcc}.|node|object||query||
 
 h2. delete
 
index 8a542650d2ea482ff3198a8ff3c13824c54dd332..2bf67eb6ffc4d0edff8606613f5748b0d1450ae1 100644 (file)
@@ -22,3 +22,4 @@ table(table table-bordered table-condensed).
 |group_class|string|Type of group. This does not affect behavior, but determines how the group is presented in the user interface. For example, @project@ indicates that the group should be displayed by Workbench and arv-mount as a project for organizing and naming objects.|@"project"@
 null|
 |description|text|||
+|writable_by|array|List of UUID strings identifying Users and other Groups that have write permission for this Group.  Only users who are allowed to administer the Group will receive a full list.  Other users will receive a partial list that includes the Group's owner_uuid and (if applicable) their own user UUID.||
index c95a2439a24204a9db7bfbddb80cb84ab99f9c7e..9a1b0566d4b0861e340e3d198de81ccb60e67e38 100644 (file)
@@ -26,3 +26,4 @@ table(table table-bordered table-condensed).
 |prefs|hash|||
 |default_owner_uuid|string|||
 |is_active|boolean|||
+|writable_by|array|List of UUID strings identifying Groups and other Users that can modify this User object.  This will include the user's owner_uuid and, for administrators and users requesting their own User object, the requesting user's UUID.||
index d8dec0e69417ea2534a8b5ad0c96aa863119affb..76df32c9e2b27aefdb96e6119b4e1bb216d18174 100644 (file)
Binary files a/doc/images/workbench-dashboard.png and b/doc/images/workbench-dashboard.png differ
index 3b398356d3b4aab61ff27e02fb4aad981f5882a4..e1de8c3e602141c265781274e3bff20797b4b640 100644 (file)
@@ -78,6 +78,8 @@ Generate a new secret token for signing cookies:
 zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
 </code></pre></notextile>
 
+If you want access control on your Keep server(s), you should set @blob_signing_key@ to the same value as the permission key you provided to your "Keep server(s)":install-keep.html.
+
 Put it in @config/application.yml@ in the production or common section:
 
 <notextile>
@@ -124,7 +126,20 @@ Set up omniauth:
 <pre><code>~/arvados/services/api$ <span class="userinput">cp -i config/initializers/omniauth.rb.example config/initializers/omniauth.rb
 </code></pre></notextile>
 
-Edit @config/initializers/omniauth.rb@. Set @APP_SECRET@ to the value of @app_secret@ from "installing the single sign on server":install-sso.html .
+Edit @config/initializers/omniauth.rb@, and tell your api server to use the Curoverse SSO server for authentication:
+
+<notextile>
+<pre><code>APP_ID = 'local_docker_installation'
+APP_SECRET = 'yohbai4eecohshoo1Yoot7tea9zoca9Eiz3Tajahweo9eePaeshaegh9meiye2ph'
+CUSTOM_PROVIDER_URL = 'https://auth.curoverse.com'
+</code></pre></notextile>
+
+<div class="alert alert-block alert-info">
+  <button type="button" class="close" data-dismiss="alert">&times;</button>
+  <h4>Note!</h4>
+  <p>You can also run your own SSO server. However, the SSO server codebase currently uses OpenID 2.0 to talk to Google's authentication service. Google <a href="https://developers.google.com/accounts/docs/OpenID2">has deprecated that protocol</a>. This means that new clients will no longer be allowed to talk to Google's authentication services over OpenID 2.0, and Google will phase out OpenID 2.0 completely in the coming months. We are working on upgrading the SSO server codebase to a newer protocol. That work should be complete by the end of November 2014. In the meantime, anyone is free to use the existing Curoverse SSO server for any local Arvados installation.</p>
+</div>
 
 You can now run the development server:
 
index dbb411b46540a67f2d204cf85c2f692fcde4fdfe..20670f3740601f34b92bc1ae0ab2354079ee8a0a 100644 (file)
@@ -13,20 +13,20 @@ First add the Arvados apt repository, and then install the Keep package.
 ~$ <span class="userinput">echo "deb http://apt.arvados.org/ wheezy main" >> /etc/apt/sources.list.d/apt.arvados.org.list</span>
 ~$ <span class="userinput">/usr/bin/apt-key adv --keyserver pgp.mit.edu --recv 1078ECD7</span>
 ~$ <span class="userinput">/usr/bin/apt-get update</span>
-~$ <span class="userinput">/usr/bin/apt-get install keep</span>
+~$ <span class="userinput">/usr/bin/apt-get install keepstore</span>
 </code></pre>
 </notextile>
 
 Verify that Keep is functional:
 
 <notextile>
-<pre><code>~$ <span class="userinput">keep -h</span>
-keep -h
+<pre><code>~$ <span class="userinput">keepstore -h</span>
 2014/07/24 15:38:27 Keep started: pid 13606
-Usage of keep:
+Usage of keepstore:
   -data-manager-token-file="": File with the API token used by the Data Manager. All DELETE requests or GET /index requests must carry this token.
   -enforce-permissions=false: Enforce permission signatures on requests.
   -listen=":25107": Interface on which to listen for requests, in the format ipaddr:port. e.g. -listen=10.0.1.24:8000. Use -listen=:port to listen on all network interfaces.
+  -never-delete=false: If set, nothing will be deleted. HTTP 405 will be returned for valid DELETE requests.
   -permission-key-file="": File containing the secret key for generating and verifying permission signatures.
   -permission-ttl=1209600: Expiration time (in seconds) for newly generated permission signatures.
   -pid="": Path to write pid file
@@ -35,10 +35,12 @@ Usage of keep:
 </code></pre>
 </notextile>
 
+If you want access control on your Keep server(s), you should provide a permission key. The @-permission-key-file@ argument should contain the path to a file that contains a single line with a long random alphanumeric string. It should be the same as the @blob_signing_key@ that can be set in the "API server":install-api-server.html config/application.yml file.
+
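+For example, a minimal sketch of starting keepstore with permissions enforced (the key file path here is hypothetical):
+
+<notextile>
+<pre><code>~$ <span class="userinput">keepstore -enforce-permissions=true -permission-key-file=/etc/keepstore/blob_signing_key</span>
+</code></pre>
+</notextile>
+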
 Prepare one or more volumes for Keep to use. Simply create a /keep directory on all the partitions you would like Keep to use, and then start Keep. For example, using 2 tmpfs volumes:
 
 <notextile>
-<pre><code>~$ <span class="userinput">keep</span>
+<pre><code>~$ <span class="userinput">keepstore</span>
 2014/07/24 11:41:37 Keep started: pid 20736
 2014/07/24 11:41:37 adding Keep volume: /tmp/tmp.vwSCtUCyeH/keep
 2014/07/24 11:41:37 adding Keep volume: /tmp/tmp.Lsn4w8N3Xv/keep
index 2f2ba5151b33a1c4103833c8d02187121917e60d..178673a62246923e15cae3bee743ac9052e0a5b1 100644 (file)
@@ -4,6 +4,21 @@ navsection: installguide
 title: Install Single Sign On (SSO) server
 ...
 
+{% include 'skip_sso_server_install' %}
+
+h2(#dependencies). Install dependencies
+
+You need to have ruby 2.1 or higher and the bundler gem installed.
+
+One way to install those dependencies is:
+
+<notextile>
+<pre><code>~$ <span class="userinput">\curl -sSL https://get.rvm.io | bash -s stable --ruby=2.1</span>
+~$ <span class="userinput">gem install bundler
+</span></code></pre></notextile>
+
+h2(#install). Install SSO server
+
 <notextile>
 <pre><code>~$ <span class="userinput">cd $HOME</span> # (or wherever you want to install)
 ~$ <span class="userinput">git clone https://github.com/curoverse/sso-devise-omniauth-provider.git</span>
index 055ef478923c5618484387e37824c230de548b63..ea9e73cfbc6fff962fceaec8e218b666643ab140 100644 (file)
@@ -73,7 +73,14 @@ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
 Copy @config/application.yml.example@ to @config/application.yml@ and edit it appropriately for your environment.
 
 * Set @secret_token@ to the string you generated with @rake secret@.
-* Point @arvados_login_base@ and @arvados_v1_base@ at your "API server":install-api-server.html
+* Point @arvados_login_base@ and @arvados_v1_base@ at your "API server":install-api-server.html, like this:
+
+<notextile>
+<pre><code>arvados_login_base: https://your.host:3030/login
+arvados_v1_base: https://your.host:3030/arvados/v1
+</code></pre>
+</notextile>
+
 * @site_name@ can be any string to identify this Workbench.
 * If the SSL certificate you use for development isn't signed by a CA, make sure @arvados_insecure_https@ is @true@.
 
diff --git a/doc/user/reference/job-pipeline-ref.html.textile.liquid b/doc/user/reference/job-pipeline-ref.html.textile.liquid
new file mode 100644 (file)
index 0000000..f8f749c
--- /dev/null
@@ -0,0 +1,7 @@
+---
+layout: default
+navsection: userguide
+title: "Pipeline template reference"
+...
+
+Pipeline template options are described on the "pipeline template schema page.":{{site.baseurl}}/api/schema/PipelineTemplate.html
diff --git a/doc/user/topics/arv-docker.html.textile.liquid b/doc/user/topics/arv-docker.html.textile.liquid
new file mode 100644 (file)
index 0000000..0a0693f
--- /dev/null
@@ -0,0 +1,202 @@
+---
+layout: default
+navsection: userguide
+title: "Customizing Crunch environment using Docker"
+...
+
+This page describes how to use "Docker":https://www.docker.com/ to customize the runtime environment (e.g. the programs, libraries, and other dependencies needed to run a job) in which a crunch script runs.  Docker is a tool for building and running containers that isolate applications from other applications running on the same node.  For detailed information about Docker, see the "Docker User Guide.":https://docs.docker.com/userguide/
+
+This page will demonstrate how to:
+
+# Fetch the arvados/jobs Docker image
+# Manually install additional software into the container
+# Create a new custom image
+# Upload that image to Arvados for use by Crunch jobs
+# Share your image with others
+
+{% include 'tutorial_expectations' %}
+
+You also need to ensure that "Docker is installed,":https://docs.docker.com/installation/ the Docker daemon is running, and you have permission to access Docker.  You can test this by running @docker version@.  If you receive a permission denied error, your user account may need to be added to the @docker@ group.  If you have root access, you can add yourself to the @docker@ group using @$ sudo addgroup $USER docker@, then log out and log back in; otherwise, consult your local sysadmin.
+
+h2. Fetch a starting image
+
+The easiest way to begin is to start from the "arvados/jobs" image which already has the Arvados SDK installed along with other configuration required for use with Crunch.
+
+Download the latest "arvados/jobs" image from the Docker registry:
+
+<notextile>
+<pre><code>$ <span class="userinput">docker pull arvados/jobs</span>
+Pulling repository arvados/jobs
+3132168f2acb: Download complete
+a42b7f2c59b6: Download complete
+e5afdf26a7ae: Download complete
+5cae48636278: Download complete
+7a4f91b70558: Download complete
+a04a275c1fd6: Download complete
+c433ff206a22: Download complete
+b2e539b45f96: Download complete
+073b2581c6be: Download complete
+593915af19dc: Download complete
+32260b35005e: Download complete
+6e5b860c1cde: Download complete
+95f0bfb43d4d: Download complete
+c7fd77eedb96: Download complete
+0d7685aafd00: Download complete
+</code></pre>
+</notextile>
+
+h2. Install new packages
+
+Next, enter the container using @docker run@, providing the arvados/jobs image and the program you want to run (in this case the bash shell).
+
+<notextile>
+<pre><code>$ <span class="userinput">docker run --interactive --tty --user root arvados/jobs /bin/bash</span>
+root@a0e8299b59aa:/#
+</code></pre>
+</notextile>
+
+Once inside the container, update the package list using @apt-get update@.
+
+<notextile>
+<pre><code>root@a0e8299b59aa:/# <span class="userinput">apt-get update</span>
+Get:1 http://apt.arvados.org wheezy Release.gpg [490 B]
+Get:2 http://apt.arvados.org wheezy Release [1568 B]
+Get:3 http://apt.arvados.org wheezy/main amd64 Packages [34.6 kB]
+Get:4 http://ftp.us.debian.org wheezy Release.gpg [1655 B]
+Get:5 http://ftp.us.debian.org wheezy-updates Release.gpg [836 B]
+Get:6 http://ftp.us.debian.org wheezy Release [168 kB]
+Ign http://apt.arvados.org wheezy/main Translation-en
+Get:7 http://security.debian.org wheezy/updates Release.gpg [836 B]
+Get:8 http://security.debian.org wheezy/updates Release [102 kB]
+Get:9 http://ftp.us.debian.org wheezy-updates Release [124 kB]
+Get:10 http://ftp.us.debian.org wheezy/main amd64 Packages [5841 kB]
+Get:11 http://security.debian.org wheezy/updates/main amd64 Packages [218 kB]
+Get:12 http://security.debian.org wheezy/updates/main Translation-en [123 kB]
+Hit http://ftp.us.debian.org wheezy/main Translation-en
+Hit http://ftp.us.debian.org wheezy-updates/main amd64 Packages/DiffIndex
+Hit http://ftp.us.debian.org wheezy-updates/main Translation-en/DiffIndex
+Fetched 6617 kB in 5s (1209 kB/s)
+Reading package lists... Done
+</code></pre>
+</notextile>
+
+In this example, we will install the "R" statistical language Debian package "r-base-core".  Use @apt-get install@:
+
+<notextile>
+<pre><code>root@a0e8299b59aa:/# <span class="userinput">apt-get install r-base-core</span>
+Reading package lists... Done
+Building dependency tree
+Reading state information... Done
+The following extra packages will be installed:
+  [...]
+libxv1 libxxf86dga1 libxxf86vm1 r-base-core r-base-dev r-base-html r-cran-boot r-cran-class r-cran-cluster r-cran-codetools
+  [...]
+Suggested packages:
+  [...]
+The following NEW packages will be installed:
+  [...]
+  libxv1 libxxf86dga1 libxxf86vm1 r-base r-base-core r-base-dev r-base-html r-cran-boot r-cran-class r-cran-cluster
+  [...]
+0 upgraded, 107 newly installed, 0 to remove and 9 not upgraded.
+Need to get 88.2 MB of archives.
+After this operation, 219 MB of additional disk space will be used.
+Do you want to continue [Y/n]? y
+[...]
+Get:85 http://ftp.us.debian.org/debian/ wheezy/main r-base-core amd64 2.15.1-4 [20.6 MB]
+Get:86 http://ftp.us.debian.org/debian/ wheezy/main r-base-dev all 2.15.1-4 [3882 B]
+Get:87 http://ftp.us.debian.org/debian/ wheezy/main r-cran-boot all 1.3-5-1 [472 kB]
+[...]
+Fetched 88.2 MB in 2min 17s (642 kB/s)
+Extracting templates from packages: 100%
+Preconfiguring packages ...
+[...]
+Unpacking r-base-core (from .../r-base-core_2.15.1-4_amd64.deb) ...
+Selecting previously unselected package r-base-dev.
+Unpacking r-base-dev (from .../r-base-dev_2.15.1-4_all.deb) ...
+Selecting previously unselected package r-cran-boot.
+Unpacking r-cran-boot (from .../r-cran-boot_1.3-5-1_all.deb) ...
+[...]
+Setting up r-base-core (2.15.1-4) ...
+Setting R_PAPERSIZE_USER default to 'a4'
+
+Creating config file /etc/R/Renviron with new version
+Setting up r-base-dev (2.15.1-4) ...
+Setting up r-cran-boot (1.3-5-1) ...
+[...]
+</code></pre>
+</notextile>
+
+Now we can verify that "R" is installed:
+
+<notextile>
+<pre><code>root@a0e8299b59aa:/# <span class="userinput">R</span>
+
+R version 2.15.1 (2012-06-22) -- "Roasted Marshmallows"
+Copyright (C) 2012 The R Foundation for Statistical Computing
+ISBN 3-900051-07-0
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+>
+</code></pre>
+</notextile>
+
+Note that you are not limited to installing Debian packages.  You may compile programs or libraries from source and install them, edit systemwide configuration files, use other package managers such as @pip@ or @gem@, and perform any other customization necessary to run your program.
+
+h2. Create a new image
+
+We're now ready to create a new Docker image.  First, quit the container, then use @docker commit@ to create a new image from the stopped container.  The container id can be found in the default hostname of the container displayed in the prompt, in this case @a0e8299b59aa@:
+
+<notextile>
+<pre><code>root@a0e8299b59aa:/# <span class="userinput">exit</span>
+$ <span class="userinput">docker commit a0e8299b59aa arvados/jobs-with-r</span>
+33ea6b87792364cb9989a149c36a31e5a9c8cf96694ba05f66545ad7b842522e
+$ <span class="userinput">docker images</span>
+REPOSITORY            TAG                 IMAGE ID            CREATED              VIRTUAL SIZE
+arvados/jobs-with-r   latest              33ea6b877923        43 seconds ago       1.607 GB
+arvados/jobs          latest              3132168f2acb        22 hours ago         1.314 GB
+</code></pre>
+</notextile>
+
+h2. Upload your image
+
+Finally, we are ready to upload the new Docker image to Arvados.  Use @arv keep docker@ with the image repository name to upload the image.  Without arguments, @arv keep docker@ will print out the list of Docker images in Arvados that are available to you.
+
+<notextile>
+<pre><code>$ <span class="userinput">arv keep docker arvados/jobs-with-r</span>
+1591M / 1591M 100.0%
+Collection saved as 'Docker image arvados/jobs-with-r:latest 33ea6b877923'
+qr1hi-4zz18-3fk2px2ji25nst2
+$ <span class="userinput">arv keep docker</span>
+REPOSITORY                      TAG         IMAGE ID      COLLECTION                     CREATED
+arvados/jobs-with-r             latest      33ea6b877923  qr1hi-4zz18-3fk2px2ji25nst2    Thu Oct 16 13:58:53 2014
+</code></pre>
+</notextile>
+
+You are now able to specify the runtime environment for your program using the @docker_image@ field of the @runtime_constraints@ section of your pipeline components:
+
+<notextile>
+{% code 'example_docker' as javascript %}
+</notextile>
+
+* The @docker_image@ field can be one of: the Docker repository name (as shown above), the Docker image hash, the Arvados collection UUID, or the Arvados collection portable data hash (see the sketch below).
+
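+For instance, a minimal sketch of the relevant fragment of a component, assuming the image uploaded above:
+
+<pre>
+{
+  "runtime_constraints": {
+    "docker_image": "arvados/jobs-with-r"
+  }
+}
+</pre>
+
+Using the collection UUID shown above instead, @"docker_image": "qr1hi-4zz18-3fk2px2ji25nst2"@ selects the same image.
+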
+h2. Share Docker images
+
+Docker images are subject to normal Arvados permissions.  If you wish to share your Docker image with others (or wish to share a pipeline template that uses your Docker image), you will need to use @arv keep docker@ with the @--project-uuid@ option to upload the image to a shared project.
+
+<notextile>
+<pre><code>$ <span class="userinput">arv keep docker --project-uuid zzzzz-j7d0g-u7zg1qdaowykd8d arvados/jobs-with-r</span>
+</code></pre>
+</notextile>
diff --git a/doc/user/topics/run-command.html.textile.liquid b/doc/user/topics/run-command.html.textile.liquid
new file mode 100644 (file)
index 0000000..6d3e87b
--- /dev/null
@@ -0,0 +1,212 @@
+---
+layout: default
+navsection: userguide
+title: "run-command reference"
+...
+
+The @run-command@ crunch script enables you to run command line programs.
+
+h1. Using run-command
+
+The basic @run-command@ process evaluates its inputs, builds a command line, executes the command, and saves the contents of the output directory back to Keep.  For large datasets, @run-command@ can schedule concurrent tasks to execute the wrapped program over a range of inputs (see @task.foreach@ below).
+
+@run-command@ is controlled through the @script_parameters@ section of a pipeline component (a sketch of a complete component follows this list).  @script_parameters@ is a JSON object consisting of key-value pairs.  There are three categories of keys that are meaningful to @run-command@:
+* The @command@ section, which defines the template used to build the command line for each task
+* Special processing directives such as @task.foreach@, @task.cwd@, @task.vwd@, @task.stdin@, and @task.stdout@
+* User-defined parameters (everything else)
+
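+As a minimal sketch of where these parameters live, a pipeline component wrapping @run-command@ might look like the following (the component name and repository are illustrative; complete examples appear under "Examples" below):
+
+<pre>
+{
+  "my_component": {
+    "script": "run-command",
+    "script_version": "master",
+    "repository": "arvados",
+    "script_parameters": {
+      "command": ["echo", "hello world"]
+    }
+  }
+}
+</pre>
+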
+In the following examples, you can use "dry run mode" to determine the command line that @run-command@ will use without actually running the command.  For example:
+
+<notextile>
+<pre><code>~$ <span class="userinput">./run-command --dry-run --script-parameters '{
+  "command": ["echo", "hello world"]
+}'</span>
+run-command: echo hello world
+</code></pre>
+</notextile>
+
+h2. Command template
+
+The value of the "command" key is a list.  The first parameter of the list is the actual program to invoke, followed by the command arguments.  The simplest @run-command@ invocation simply runs a program with static parameters.  In this example, run "echo" with the first argument "hello world":
+
+<pre>
+{
+  "command": ["echo", "hello world"]
+}
+</pre>
+
+Running this job will print "hello world" to the job log.
+
+By default, the command will start with the current working directory set to the output directory.  Anything written to the output directory will be saved to Keep when the command is finished.  You can change the default working directory using @task.cwd@ and get the path to the output directory using @$(task.outdir)@ as explained below.
+
+Items in the "command" list may include lists and objects in addition to strings.  Lists are flattened to produce the final command line.  JSON objects are evaluated as list item functions (see below).  For example, the following evaluates to @["echo", "hello", "world"]@:
+
+<pre>
+{
+  "command": ["echo", ["hello", "world"]]
+}
+</pre>
+
+h2. Parameter substitution
+
+The "command" list can include parameter substitutions.  Substitutions are enclosed in "$(...)" and may contain the name of a user-defined parameter.  In the following example, the value of "a" is "hello world"; so when "command" is evaluated, it will substitute "hello world" for "$(a)":
+
+<pre>
+{
+  "command": ["echo", "$(file $(a))"],
+  "a": "c1bad4b39ca5a924e481008009d94e32+210/var-GS000016015-ASM.tsv.bz2"
+}
+</pre>
+
+table(table table-bordered table-condensed).
+|_. Function|_. Action|
+|$(file ...)       | Takes a reference to a file within an Arvados collection and evaluates to a file path on the local file system where that file can be accessed by your command.  Will raise an error if the file is not accessible.|
+|$(dir ...)        | Takes a reference to an Arvados collection or directory within an Arvados collection and evaluates to a directory path on the local file system where that directory can be accessed by your command.  The path may include a file name, in which case it will evaluate to the parent directory of the file.  Uses Python's os.path.dirname(), so "/foo/bar" will evaluate to "/foo" but "/foo/bar/" will evaluate to "/foo/bar".  Will raise an error if the directory is not accessible. |
+|$(basename&nbsp;...)   | Strip leading directory and trailing file extension from the path provided.  For example, $(basename /foo/bar.baz.txt) will evaluate to "bar.baz".|
+|$(glob ...)       | Take a Unix shell path pattern (supports @*@ @?@ and @[]@) and search the local filesystem, returning the first match found.  Use together with $(dir ...) to get a local filesystem path for Arvados collections.  For example: $(glob $(dir $(mycollection)/*.bam)) will find the first .bam file in the collection specified by the user parameter "mycollection".  If there is more than one match, which one is returned is undefined.  Will raise an error if no matches are found.|
+
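+Substitution functions can be nested.  As a minimal sketch using the same collection as above, the following evaluates "command" to @["echo", "var-GS000016015-ASM.tsv"]@ (the leading directory and trailing @.bz2@ extension are stripped):
+
+<pre>
+{
+  "command": ["echo", "$(basename $(a))"],
+  "a": "c1bad4b39ca5a924e481008009d94e32+210/var-GS000016015-ASM.tsv.bz2"
+}
+</pre>
+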
+h2. List context
+
+When a parameter is evaluated in a list context, that means its value should evaluate to a list instead of a string.  Parameter values can be a static list (as demonstrated above), a path to a file, a path to a directory, or a JSON object describing a list context function.
+
+If the value is a static list, each list item is evaluated for parameter substitution and list functions.
+
+If the value is a string, it is interpreted as a path.  If the path specifies a regular file, that file will be opened as a text file, producing a list with one item per line (end-of-line characters are stripped).  If the path specifies a directory, the result is a list containing all of the entries in the directory.  Note that parameter expansion is not performed on lists produced this way.
+
+If the value is a JSON object, it is evaluated as a list function described below.
+
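+As a hypothetical sketch (@names.txt@ is an illustrative file containing one name per line; the @foreach@ function is described in the next section), the following produces one @echo@ argument per line of the file:
+
+<pre>
+{
+  "command": ["echo", {"foreach": "a", "command": ["$(a)"]}],
+  "a": "c1bad4b39ca5a924e481008009d94e32+210/names.txt"
+}
+</pre>
+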
+h2. List functions
+
+When @run-command@ is evaluating a list (such as "command"), in addition to string parameter substitution, you can use list item functions.  Note: in the following functions, you specify the name of a user parameter to act on; you cannot provide the list value directly inline.
+
+h3. foreach
+
+The @foreach@ list item function (not to be confused with the @task.foreach@ directive) expands a command template for each item in the specified user parameter (the value of the user parameter is evaluated in a list context, as described above).  The following example will evaluate "command" to @["echo", "--something", "alice", "--something", "bob"]@:
+
+<pre>
+{
+  "command": ["echo", {"foreach": "a", "command": ["--something", "$(a)"]}],
+  "a": ["alice", "bob"]
+}
+</pre>
+
+h3. index
+
+This function extracts a single item from a list.  The value of @index@ is zero-based (i.e. the first item is at index 0, the second item at index 1, and so on).  The following example will evaluate "command" to @["echo", "--something", "bob"]@:
+
+<pre>
+{
+  "command": ["echo", {"list": "a", "index": 1, "command": ["--something", "$(a)"]}],
+  "a": ["alice", "bob"]
+}
+</pre>
+
+h3. filter
+
+Filter the list so that it only includes items that match a regular expression.  The following example will evaluate to @["echo", "bob"]@:
+
+<pre>
+{
+  "command": ["echo", {"filter": "a", "regex": "b.*"}],
+  "a": ["alice", "bob"]
+}
+</pre>
+
+h3. group
+
+Generate a list of lists, where items are grouped on common subexpression match.  Items which don't match the regular expression are excluded.  In the following example, names are grouped by the capture group @(a?)@: "alice", "carol", and "dave" all capture "a", while "bob" captures the empty string, so the example evaluates to @["echo", "--group", "alice", "carol", "dave", "--group", "bob"]@:
+
+<pre>
+{
+  "command": ["echo", {"foreach": "b", "command":["--group", {"foreach": "b", "command":"$(b)"}]}],
+  "a": ["alice", "bob", "carol", "dave"],
+  "b": {"group": "a", "regex": "[^a]*(a?).*"}
+}
+</pre>
+
+h3. extract
+
+Generate a list of lists, where each item is split into its subexpression matches.  Items which don't match the regular expression are excluded (here "alice" and "bob", since @(.+)@ requires at least one character before the "a").  The following example evaluates to @["echo", "c", "a", "rol", "d", "a", "ve"]@:
+
+<pre>
+{
+  "command": ["echo", {"foreach": "b", "command":[{"foreach": "b", "command":"$(b)"}]}],
+  "a": ["alice", "bob", "carol", "dave"],
+  "b": {"extract": "a", "regex": "(.+)(a)(.*)"}
+}
+</pre>
+
+h2. Directives
+
+Directives alter the behavior of @run-command@.  All directives are optional.
+
+h3. task.cwd
+
+This directive sets the initial current working directory that your command will run in.  If @task.cwd@ is not specified, the default current working directory is @task.outdir@.
+
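+A minimal sketch, assuming a hypothetical user parameter @inputdir@ that references a collection: the command lists that directory's contents, and @task.stdout@ (described below) captures the listing in the output:
+
+<pre>
+{
+  "command": ["ls", "-l"],
+  "task.cwd": "$(dir $(inputdir))",
+  "task.stdout": "listing.txt"
+}
+</pre>
+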
+h3. task.stdin and task.stdout
+
+Provide standard input and standard output redirection.
+
+@task.stdin@ must evaluate to a path to a file to be bound to the command's standard input stream.
+
+@task.stdout@ specifies the name of a file in the output directory where the content of standard output will be saved.
+
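+A minimal sketch, assuming a hypothetical user parameter @myfile@ that references a file in a collection: the file is piped through @grep@ and the matching lines are saved to @nocomments.txt@ in the output directory:
+
+<pre>
+{
+  "command": ["grep", "-v", "^#"],
+  "task.stdin": "$(file $(myfile))",
+  "task.stdout": "nocomments.txt"
+}
+</pre>
+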
+h3. task.vwd
+
+Background: Keep collections are read-only, which does not play well with tools that expect to write their outputs alongside their inputs (such as tools that generate indexes closely associated with the original file).  @run-command@'s solution to this is the "virtual working directory".
+
+@task.vwd@ specifies a Keep collection with the starting contents of the directory.  @run-command@ will then populate @task.outdir@ with directories and symlinks to mirror the contents of the @task.vwd@ collection.  Your command will then be able to both access its input files and write its output files in @task.outdir@.  When the command completes, the output collection will merge the output of your command with the contents of the starting collection.  Note that files in the starting collection remain read-only and cannot be altered or deleted.
+
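+A minimal sketch, assuming a hypothetical user parameter @mycollection@ whose collection contains @mydata.bam@: @samtools index@ writes @mydata.bam.bai@ next to the symlinked (read-only) input, and both files appear in the output collection:
+
+<pre>
+{
+  "command": ["samtools", "index", "mydata.bam"],
+  "task.vwd": "$(mycollection)"
+}
+</pre>
+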
+h3. task.foreach
+
+Using @task.foreach@, you can run your command concurrently over large datasets.
+
+@task.foreach@ takes the names of one or more user-defined parameters.  The values of these parameters are evaluated in a list context.  @run-command@ then generates tasks based on the Cartesian product (i.e. all combinations) of the input lists.  The outputs of all tasks are merged to create the final output collection.  Note that if two tasks output a file in the same directory with the same name, that file will be concatenated in the final output.  In the following example, three tasks will be created for the "echo" command, one for each item in the user parameter "a":
+
+<pre>
+{
+  "command": ["echo", "$(a)"],
+  "task.foreach": "a",
+  "a": ["alice", "bob", "carol"]
+}
+</pre>
+
+This evaluates to the commands:
+<notextile>
+<pre>
+["echo", "alice"]
+["echo", "bob"]
+["echo", "carol"]
+</pre>
+</notextile>
+
+You can also specify multiple parameters:
+
+<pre>
+{
+  "command": ["echo", "$(a)", "$(b)"],
+  "task.foreach": ["a", "b"],
+  "a": ["alice", "bob"],
+  "b": ["carol", "dave"]
+}
+</pre>
+
+This evaluates to the commands:
+
+<pre>
+["echo", "alice", "carol"]
+["echo", "alice", "dave"]
+["echo", "bob", "carol"]
+["echo", "bob", "dave"]
+</pre>
+
+h1. Examples
+
+The following is a single-task pipeline that uses @run-command@ to run the bwa alignment tool on a single paired-end read fastq sample.  The inputs to this pipeline are the reference genome and a collection containing the two fastq files for the read pair.
+
+<notextile>{% code 'run_command_simple_example' as javascript %}</notextile>
+
+The following is a concurrent-task pipeline that uses @run-command@ to run the bwa alignment tool over multiple samples.  The inputs to this pipeline are the reference genome and a collection consisting of one subdirectory per sample, with each subdirectory containing the pairs of fastq files for that sample's reads.
+
+<notextile>{% code 'run_command_foreach_example' as javascript %}</notextile>
index f13d1c8319a52d0c548c023f796887c2f12504d4..18f5f7d35f75eabeb2c45be8744846572bc7d548 100644 (file)
@@ -8,15 +8,13 @@ This tutorial demonstrates how to construct a two stage pipeline template that u
 
 {% include 'tutorial_expectations' %}
 
-Use the following command to create a new empty template using @arv pipeline_template create@:
+Use the following command to create an empty template using @arv create pipeline_template@:
 
 <notextile>
-<pre><code>~$ <span class="userinput">arv edit $(arv --format=uuid pipeline_template create --pipeline-template '{}') name components </span></code></pre>
+<pre><code>~$ <span class="userinput">arv create pipeline_template</span></code></pre>
 </notextile>
 
-* @--format=uuid@ option prints out just the unique identifier for the new template, instead of the entire template record (default)
-
-This will open the template record in an interactive text editor (as specified by $EDITOR or $VISUAL, otherwise defaults to @nano@) using @arv edit@.  Now add the following content:
+This will open the template record in an interactive text editor (as specified by $EDITOR or $VISUAL, otherwise defaulting to @nano@).  Replace the contents of the editor with the following:
 
 <notextile>{% code 'tutorial_bwa_sortsam_pipeline' as javascript %}</notextile>
 
@@ -40,6 +38,8 @@ This will open the template record in an interactive text editor (as specified b
 
 When using @run-command@, the tool should write its output to the current working directory.  The output will be automatically uploaded to Keep when the job completes.
 
+See the "run-command reference":{{site.baseurl}}/user/topics/run-command.html for more information about using @run-command@.
+
 h2. Running your pipeline
 
 Your new pipeline template should appear at the top of the Workbench "pipeline&nbsp;templates":https://{{ site.arvados_workbench_host }}/pipeline_templates page.  You can run your pipeline "using Workbench":tutorial-pipeline-workbench.html or the "command line.":{{site.baseurl}}/user/topics/running-pipeline-command-line.html
index e07be5d907d43decf9f01f226a7f58fe70e641c0..a41fede744e92dfaab1b43782a81121d57fff94b 100644 (file)
@@ -16,17 +16,21 @@ h2. Arv-mount
 * It is easy for existing tools to access files in Keep.
 * Data is downloaded on demand.  It is not necessary to download an entire file or collection to start processing.
 
-The default mode permits browsing any collection in Arvados as a subdirectory under the mount directory.  To avoid having to fetch a potentially large list of all collections, collection directories only come into existence when explicitly accessed by their keep locator.
+The default mode permits browsing any collection in Arvados as a subdirectory under the mount directory.  To avoid having to fetch a potentially large list of all collections, collection directories only come into existence when explicitly accessed by their Keep locator. For instance, a collection may be found by its content hash in the @keep/by_id@ directory.
 
 <notextile>
 <pre><code>~$ <span class="userinput">mkdir -p keep</span>
 ~$ <span class="userinput">arv-mount keep</span>
-~$ <span class="userinput">cd keep/c1bad4b39ca5a924e481008009d94e32+210</span>
-~/keep/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">ls</span>
+~$ <span class="userinput">cd keep/by_id/c1bad4b39ca5a924e481008009d94e32+210</span>
+~/keep/by_id/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">ls</span>
 var-GS000016015-ASM.tsv.bz2
-~/keep/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">md5sum var-GS000016015-ASM.tsv.bz2</span>
+~/keep/by_id/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">md5sum var-GS000016015-ASM.tsv.bz2</span>
 44b8ae3fde7a8a88d2f7ebd237625b4f  var-GS000016015-ASM.tsv.bz2
-~/keep/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">cd ../..</span>
+~/keep/by_id/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">cd ../..</span>
 ~$ <span class="userinput">fusermount -u keep</span>
 </code></pre>
 </notextile>
+
+The last line unmounts Keep.  Subdirectories will no longer be accessible.
+
+Within each collection directory in the Keep mount, there is a @.arvados#collection@ file that does not show up in @ls@.  It contains the collection record, including the @portable_data_hash@, which is the same as the Keep locator.
diff --git a/doc/user/tutorials/tutorial-new-pipeline.html.textile.liquid b/doc/user/tutorials/tutorial-new-pipeline.html.textile.liquid
deleted file mode 100644 (file)
index 83303ad..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
----
-layout: default
-navsection: userguide
-title: "Writing a multi-step pipeline"
-...
-
-A pipeline in Arvados is a collection of crunch scripts, in which the output from one script may be used as the input to another script.
-
-{% include 'tutorial_expectations' %}
-
-
-First, use @arv pipeline_template create@ to create a new empty template.  The @--format=uuid@ option will print out the unique identifier for the new template:
-
-<notextile>
-<pre><code>~$ <span class="userinput">arv --format=uuid pipeline_template create --pipeline-template '{}'</span>
-qr1hi-p5p6p-wt1vdhkezgx7g2k
-</span></code></pre>
-</notextile>
-
-Next, use @arv edit@ to edit the template.  This will open the template record in an interactive text editor (as specified by $EDITOR or $VISUAL, otherwise defaults to @nano@).  Replace the empty fields with the following content:
-
-<notextile>{% code 'tutorial_bwa_pipeline' as javascript %}</notextile>
-
-
-Your new pipeline template will appear on the Workbench "Pipeline&nbsp;templates":https://{{ site.arvados_workbench_host }}/pipeline_templates page.
-
-For more information and examples for writing pipelines, see the "pipeline template reference":{{site.baseurl}}/api/schema/PipelineTemplate.html
index 5df82a789e316b5327d4ddedca4e00e993302a79..36408801f4952d9fc7134761334edbbd74c59af1 100644 (file)
@@ -4,11 +4,10 @@ navsection: userguide
 title: "Running a pipeline using Workbench"
 ...
 
-A "pipeline" (sometimes called a "workflow" in other systems) is a sequence of steps that apply various programs or tools to transform input data to output data.  Pipelines are the principal means of performing computation with Arvados.  This tutorial demonstrates how to run a single-stage pipeline to take a small data set of paired-end reads from a sample "exome":https://en.wikipedia.org/wiki/Exome in "FASTQ":https://en.wikipedia.org/wiki/FASTQ_format format and align them to "Chromosome 19":https://en.wikipedia.org/wiki/Chromosome_19_%28human%29 using the "bwa mem":http://bio-bwa.sourceforge.net/ tool, producing a "Sequence Alignment/Map (SAM)":https://samtools.github.io/ file.  This will introduce the following Arvados features:
+A "pipeline" (sometimes called a "workflow" in other systems) is a sequence of steps that apply various programs or tools to transform input data to output data.  Pipelines are the principal means of performing computation with Arvados.  This tutorial demonstrates how to run a single-stage pipeline to take a small data set of paired-end reads from a sample "exome":https://en.wikipedia.org/wiki/Exome in "FASTQ":https://en.wikipedia.org/wiki/FASTQ_format format and align them to "Chromosome 19":https://en.wikipedia.org/wiki/Chromosome_19_%28human%29 using the "bwa mem":http://bio-bwa.sourceforge.net/ tool, producing a "Sequence Alignment/Map (SAM)":https://samtools.github.io/ file.  This tutorial will introduce the following Arvados features:
 
 <div class="inside-list">
-* How to create a project.
-* How to browse available pipeline templates and create a new pipeline from an existing template.
+* How to create a new pipeline from an existing template.
 * How to browse and select input data for the pipeline and submit the pipeline to run on the Arvados cluster.
 * How to access your pipeline results.
 </div>
@@ -16,8 +15,6 @@ A "pipeline" (sometimes called a "workflow" in other systems) is a sequence of s
 notextile. <div class="spaced-out">
 
 # Start from the *Workbench Dashboard*.  You can access the Dashboard by clicking on *<i class="fa fa-lg fa-fw fa-dashboard"></i> Dashboard* in the upper left corner of any Workbench page.
-# In the *My projects* panel, click the <span class="btn btn-sm btn-primary" > <i class="fa fa-fw fa-plus"></i> Add new project</span> button.  The new project will be created immediately, and your browser opens the new project's page for you to customize it.
-# On the new project page, click on the pencil icon <i class="fa fa-fw fa-pencil"></i> next to *New project* to pop up a text box and change the project title to *Tutorial output*.  Click the <span class="btn btn-xs btn-primary" ><i class="glyphicon glyphicon-ok"></i></span> checkbox button to save the new name.
 # Click on the <span class="btn btn-sm btn-primary"><i class="fa fa-fw fa-gear"></i> Run a pipeline...</span> button.  This will open a dialog box titled *Choose a pipeline to run*.
 # Click to open the *All projects <span class="caret"></span>* menu.  Under the *Projects shared with me* header, select *<i class="fa fa-fw fa-share-alt"></i> Arvados Tutorial*.
 # Select *<i class="fa fa-fw fa-gear"></i> Tutorial align using bwa mem* and click the <span class="btn btn-sm btn-primary" >Next: choose inputs <i class="fa fa-fw fa-arrow-circle-right"></i></span> button.  This will load a new page where you will supply the inputs for the pipeline.
@@ -25,8 +22,8 @@ notextile. <div class="spaced-out">
 # Once again, open the *All projects <span class="caret"></span>* menu and select *<i class="fa fa-fw fa-share-alt"></i> Arvados Tutorial*.  Select *<i class="fa fa-fw fa-archive"></i> Tutorial chromosome 19 reference* and click the <span class="btn btn-sm btn-primary" >OK</span> button.
 # Repeat the previous two steps to set the *Input genome (fastq)* parameter to *<i class="fa fa-fw fa-archive"></i> Tutorial sample exome*.
 # Click on the <span class="btn btn-sm btn-primary" >Run <i class="fa fa-fw fa-play"></i></span> button.  The page updates to show you that the pipeline has been submitted to run on the Arvados cluster.
-# After the pipeline starts running, you can track the progress by watching log messages from jobs.  This page refreshes automatically.  You will see a <span class="label label-success">success</span> label under the *job* the column when the pipeline completes successfully.
-# Click on *<i class="fa fa-fw fa-archive"></i> Show output files* to see the results of the job.  This will load a new page listing the output files from this pipeline.  You'll see the output SAM file from the alignment tool under the *Files* tab.
+# After the pipeline starts running, you can track the progress by watching log messages from jobs.  This page refreshes automatically.  You will see a <span class="label label-success">complete</span> label under the *job* column when the pipeline completes successfully.
+# Click on the *Output* link to see the results of the job.  This will load a new page listing the output files from this pipeline.  You'll see the output SAM file from the alignment tool under the *Files* tab.
 # Click on the <span class="btn btn-sm btn-info"><i class="fa fa-download"></i></span> download button to the right of the SAM file to download your results.
 
 notextile. </div>
index 10abdf0b1c15723baca4ad0384d33e49a3c44887..fc77e5cdc02162f4d42a997cbb644d7323e4adcc 100644 (file)
@@ -90,44 +90,20 @@ To git@git.qr1hi.arvadosapi.com:$USER.git
 
 h2. Create a pipeline template
 
-Next, create a file that contains the pipeline definition:
+Next, create a new template using @arv create pipeline_template@:
 
 <notextile>
-<pre><code>~/$USER/crunch_scripts$ <span class="userinput">cd ~</span>
-~$ <span class="userinput">cat &gt;the_pipeline &lt;&lt;EOF
-{
-  "name":"My md5 pipeline",
-  "components":{
-    "do_hash":{
-      "script":"hash.py",
-      "script_parameters":{
-        "input":{
-          "required": true,
-          "dataclass": "Collection"
-        }
-      },
-      "repository":"$USER",
-      "script_version":"master",
-      "runtime_constraints":{
-        "docker_image":"arvados/jobs-java-bwa-samtools"
-      }
-    }
-  }
-}
-EOF
-</span></code></pre>
+<pre><code>~$ <span class="userinput">arv create pipeline_template</span></code></pre>
 </notextile>
 
+In the editor, enter the following template:
+
+<notextile> {% code 'tutorial_submit_job' as javascript %} </notextile>
+
 * @"repository"@ is the name of a git repository to search for the script version.  You can access a list of available git repositories on the Arvados Workbench under "Code repositories":https://{{site.arvados_workbench_host}}/repositories.
 * @"script_version"@ specifies the version of the script that you wish to run.  This can be in the form of an explicit Git revision hash, a tag, or a branch (in which case it will use the HEAD of the specified branch).  Arvados logs the script version that was used in the run, enabling you to go back and re-run any past job with the guarantee that the exact same code will be used as was used in the previous run.
 * @"script"@ specifies the filename of the script to run.  Crunch expects to find this in the @crunch_scripts/@ subdirectory of the Git repository.
-
-Now, use @arv pipeline_template create@ to register your pipeline template in Arvados:
-
-<notextile>
-<pre><code>~$ <span class="userinput">arv pipeline_template create --pipeline-template "$(cat the_pipeline)"</span>
-</code></pre>
-</notextile>
+* @"runtime_constraints"@ describes the runtime environment required to run the job.  These are described in the "job record schema":{{site.baseurl}}/api/schema/Job.html
 
 h2. Running your pipeline
 
index 9b486d2d798a14b14491b6ff69aba75c6aaf34c1..481cd9ee6014593bb3185e043fda99ce3ab557c2 100755 (executable)
@@ -23,6 +23,7 @@ begin
   require 'oj'
   require 'active_support/inflector'
   require 'yaml'
+  require 'tempfile'
 rescue LoadError
   abort <<-EOS
 
@@ -112,10 +113,14 @@ def init_config
   end
 end
 
-subcommands = %w(keep pipeline tag ws edit)
+subcommands = %w(create edit keep pipeline tag ws)
 
 def check_subcommands client, arvados, subcommand, global_opts, remaining_opts
   case subcommand
+  when 'create'
+    arv_create client, arvados, global_opts, remaining_opts
+  when 'edit'
+    arv_edit client, arvados, global_opts, remaining_opts
   when 'keep'
     @sub = remaining_opts.shift
     if ['get', 'put', 'ls', 'normalize'].index @sub then
@@ -146,8 +151,6 @@ def check_subcommands client, arvados, subcommand, global_opts, remaining_opts
     exec `which arv-tag`.strip, *remaining_opts
   when 'ws'
     exec `which arv-ws`.strip, *remaining_opts
-  when 'edit'
-    arv_edit client, arvados, global_opts, remaining_opts
   end
 end
 
@@ -156,6 +159,67 @@ def arv_edit_save_tmp tmp
   puts "Saved contents to " + tmp.path + ".saved"
 end
 
+def command_exists?(command)
+  # Return true if `command` names an executable file in some $PATH directory.
+  ENV['PATH'].split(':').any? {|folder| File.executable?(File.join(folder, command))}
+end
+
+def run_editor tmp_file, global_opts
+  need_edit = true
+  while need_edit
+    pid = Process::fork
+    if pid.nil?
+      editor = nil
+      [ENV["VISUAL"], ENV["EDITOR"], "nano", "vi"].each do |e|
+        editor ||= e if e and command_exists? e
+      end
+      if editor.nil?
+        puts "Could not find any editor to use, please set $VISUAL or $EDITOR to your desired editor."
+        exit 1
+      end
+      exec editor, tmp_file.path
+    else
+      Process.wait pid
+    end
+
+    if $?.exitstatus == 0
+      tmp_file.open
+      newcontent = tmp_file.read()
+
+      newobj = {}
+      begin
+        case global_opts[:format]
+        when 'json'
+          newobj = Oj.load(newcontent)
+        when 'yaml'
+          newobj = YAML.load(newcontent)
+        end
+        need_edit = false
+      rescue Exception => e
+        n = 1
+        newcontent.each_line do |line|
+          puts "#{n.to_s.rjust 4}  #{line}"
+          n += 1
+        end
+        puts "Parse error! " + e.to_s
+        puts "\nTry again (y/n)? "
+        yn = "X"
+        while not ["y", "Y", "n", "N"].include?(yn)
+          yn = $stdin.read 1
+        end
+        if yn == 'n' or yn == 'N'
+          arv_edit_save_tmp tmp_file
+          abort
+        end
+      end
+    else
+      puts "Editor exited with status #{$?.exitstatus}"
+      exit $?.exitstatus
+    end
+  end
+
+  newobj
+end
+
 def arv_edit client, arvados, global_opts, remaining_opts
   uuid = remaining_opts.shift
   if uuid.nil? or uuid == "-h" or uuid == "--help"
@@ -224,60 +288,11 @@ def arv_edit client, arvados, global_opts, remaining_opts
     content = results.to_yaml
   end
 
-  require 'tempfile'
-
-  tmp = Tempfile.new([uuid, "." + global_opts[:format]])
-  tmp.write(content)
-  tmp.close
-
-  need_edit = true
-
-  while need_edit
-    pid = Process::fork
-    if pid.nil?
-      editor ||= ENV["VISUAL"]
-      editor ||= ENV["EDITOR"]
-      editor ||= "nano"
-      exec editor, tmp.path
-    else
-      Process.wait pid
-    end
-
-    if $?.exitstatus == 0
-      tmp.open
-      newcontent = tmp.read()
+  tmp_file = Tempfile.new([uuid, "." + global_opts[:format]])
+  tmp_file.write(content)
+  tmp_file.close
 
-      newobj = {}
-      begin
-        case global_opts[:format]
-        when 'json'
-          newobj = Oj.load(newcontent)
-        when 'yaml'
-          newobj = YAML.load(newcontent)
-        end
-        need_edit = false
-      rescue Exception => e
-        puts "Parse error! " + e.to_s
-        n = 1
-        newcontent.each_line do |line|
-          puts "#{n.to_s.rjust 4}  #{line}"
-          n += 1
-        end
-        puts "\nTry again (y/n)? "
-        yn = "X"
-        while not ["y", "Y", "n", "N"].include?(yn)
-          yn = $stdin.read 1
-        end
-        if yn == 'n' or yn == 'N'
-          arv_edit_save_tmp tmp
-          abort
-        end
-      end
-    else
-      puts "Editor exited with status #{$?.exitstatus}"
-      exit $?.exitstatus
-    end
-  end
+  newobj = run_editor tmp_file, global_opts
 
   begin
     if newobj != results
@@ -296,13 +311,14 @@ def arv_edit client, arvados, global_opts, remaining_opts
         puts "Error communicating with server, error was #{e}"
         puts "Update body was:"
         puts dumped
-        arv_edit_save_tmp tmp
+        arv_edit_save_tmp tmp_file
         abort
       end
 
       begin
         results = JSON.parse result.body
       rescue JSON::ParserError => e
+        arv_edit_save_tmp tmp_file
         abort "Failed to parse server response:\n" + e.to_s
       end
 
@@ -310,14 +326,114 @@ def arv_edit client, arvados, global_opts, remaining_opts
         puts "Update failed.  Server responded #{result.response.status}: #{results['errors']} "
         puts "Update body was:"
         puts dumped
-        arv_edit_save_tmp tmp
+        arv_edit_save_tmp tmp_file
         abort
       end
     else
       puts "Object is unchanged, did not update."
     end
   ensure
-    tmp.close(true)
+    tmp_file.close(true)
+  end
+
+  exit 0
+end
+
+def arv_create client, arvados, global_opts, remaining_opts
+  types = resource_types(arvados.discovery_document)
+  create_opts = Trollop::options do
+    opt :project_uuid, "Project uuid in which to create the object", :type => :string
+    stop_on resource_types(arvados.discovery_document)
+  end
+
+  object_type = remaining_opts.shift
+  if object_type.nil?
+    abort "Missing resource type, must be one of #{types.join ', '}"
+  end
+
+  rsc = arvados.discovery_document["resources"].keys.select { |k| object_type == k.singularize }
+  if rsc.empty?
+    abort "Could not determine resource type #{object_type}"
+  end
+  rsc = rsc.first
+
+  discovered_params = arvados.discovery_document["resources"][rsc]["methods"]["create"]["parameters"]
+  method_opts = Trollop::options do
+    banner head_banner
+    banner "Usage: arv create [--project-uuid] #{object_type} [create parameters]"
+    banner ""
+    banner "This method supports the following parameters:"
+    banner ""
+    discovered_params.each do |k,v|
+      opts = Hash.new()
+      opts[:type] = v["type"].to_sym if v.include?("type")
+      if [:datetime, :text, :object, :array].index opts[:type]
+        opts[:type] = :string                       # else trollop bork
+      end
+      opts[:default] = v["default"] if v.include?("default")
+      opts[:default] = v["default"].to_i if opts[:type] == :integer
+      opts[:default] = to_boolean(v["default"]) if opts[:type] == :boolean
+      opts[:required] = true if v.include?("required") and v["required"]
+      description = ''
+      description = '  ' + v["description"] if v.include?("description")
+      opt k.to_sym, description, opts
+    end
+  end
+
+
+  newobj = {}
+  if create_opts[:project_uuid]
+    newobj["owner_uuid"] = create_opts[:project_uuid]
+  end
+
+  case global_opts[:format]
+  when 'json'
+    content = Oj.dump(newobj, :indent => 1)
+  when 'yaml'
+    content = newobj.to_yaml
+  end
+
+  tmp_file = Tempfile.new(["", ".#{global_opts[:format]}"])
+  tmp_file.write(content)
+  tmp_file.close
+
+  newobj = run_editor tmp_file, global_opts
+
+  begin
+    api_method = 'arvados.' + rsc + '.create'
+    dumped = Oj.dump(newobj)
+
+    result = client.execute(:api_method => eval(api_method),
+                            :parameters => method_opts,
+                            :body_object => {object_type => newobj},
+                            :authenticated => false,
+                            :headers => {
+                              authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN']
+                            })
+
+    begin
+      results = JSON.parse result.body
+    rescue JSON::ParserError => e
+      arv_edit_save_tmp tmp_file
+      abort "Failed to parse server response:\n" + e.to_s
+    end
+
+    if result.response.status != 200
+      puts "Create failed.  Server responded #{result.response.status}: #{results['errors']} "
+      puts "Create body was:"
+      puts dumped
+      arv_edit_save_tmp tmp_file
+      abort
+    end
+
+    begin
+      puts "Created object #{results['uuid']}"
+    rescue
+      arv_edit_save_tmp tmp_file
+      abort "Unexpected response:\n#{results}"
+    end
+  ensure
+    tmp_file.close(true)
   end
 
   exit 0
@@ -361,13 +477,16 @@ def help_resources(option_parser, discovery_document, resource)
   exit 255
 end
 
-def parse_arguments(discovery_document, subcommands)
+def resource_types discovery_document
   resource_types = Array.new()
   discovery_document["resources"].each do |k,v|
     resource_types << k.singularize
   end
+  resource_types
+end
 
-  resource_types += subcommands
+def parse_arguments(discovery_document, subcommands)
+  resources_and_subcommands = resource_types(discovery_document) + subcommands
 
   option_parser = Trollop::Parser.new do
     version __FILE__
@@ -396,7 +515,7 @@ def parse_arguments(discovery_document, subcommands)
     banner "Additional options:"
 
     conflicts :short, :format
-    stop_on resource_types
+    stop_on resources_and_subcommands
   end
 
   global_opts = Trollop::with_standard_exception_handling option_parser do
@@ -416,7 +535,7 @@ def parse_arguments(discovery_document, subcommands)
   resource = ARGV.shift
 
   if not subcommands.include? resource
-    if not resource_types.include?(resource)
+    if not resources_and_subcommands.include?(resource)
       puts "Resource or subcommand '#{resource}' is not recognized.\n\n" if !resource.nil?
       help_resources(option_parser, discovery_document, resource)
     end
index c6ccf842a5c617422412f52dc549e132053e8939..63313fc8082a18b7c637a222d92cabafc76e4ab3 100755 (executable)
@@ -445,12 +445,17 @@ class WhRunPipelineInstance
         if value.nil? and
             ![false,'false',0,'0'].index parameter[:required]
           if parameter[:output_of]
+            if not @components[parameter[:output_of].intern]
+              errors << [componentname, parametername, "output_of refers to nonexistent component '#{parameter[:output_of]}'"]
+            end
             next
           end
           errors << [componentname, parametername, "required parameter is missing"]
         end
         debuglog "parameter #{componentname}::#{parametername} == #{value}"
-        component[:script_parameters][parametername] = value
+
+        component[:script_parameters][parametername] =
+          parameter.dup.merge(value: value)
       end
     end
     if !errors.empty?
@@ -518,7 +523,9 @@ class WhRunPipelineInstance
           my_submit_id = "instance #{@instance[:uuid]} rand #{rand(2**64).to_s(36)}"
           job = JobCache.create(@instance, cname, {
             :script => c[:script],
-            :script_parameters => c[:script_parameters],
+            :script_parameters => Hash[c[:script_parameters].map do |key, spec|
+                                         [key, spec[:value]]
+                                       end],
             :script_version => c[:script_version],
             :repository => c[:repository],
             :nondeterministic => c[:nondeterministic],
@@ -594,7 +601,7 @@ class WhRunPipelineInstance
               c2[:script_parameters].each do |pname, p|
                 if p.is_a? Hash and p[:output_of] == cname.to_s
                   debuglog "parameter #{c2name}::#{pname} == #{c[:job][:output]}"
-                  c2[:script_parameters][pname] = c[:job][:output]
+                  c2[:script_parameters][pname] = {value: c[:job][:output]}
                   moretodo = true
                 end
               end
index cdf416fbcb2bc1744819a6b74e99487b49ff6167..369bc3e1ae6f021fbd809ecc7fa82c4da2a77ccc 100755 (executable)
@@ -10,12 +10,14 @@ crunch-job: Execute job steps, save snapshots as requested, collate output.
 Obtain job details from Arvados, run tasks on compute nodes (typically
 invoked by scheduler on controller):
 
- crunch-job --job x-y-z
+ crunch-job --job x-y-z --git-dir /path/to/repo/.git
 
 Obtain job details from command line, run tasks on local machine
 (typically invoked by application or developer on VM):
 
- crunch-job --job '{"script_version":"/path/to/tree","script":"scriptname",...}'
+ crunch-job --job '{"script_version":"/path/to/working/tree","script":"scriptname",...}'
+
+ crunch-job --job '{"repository":"https://github.com/curoverse/arvados.git","script_version":"master","script":"scriptname",...}'
 
 =head1 OPTIONS
 
@@ -27,7 +29,9 @@ If the job is already locked, steal the lock and run it anyway.
 
 =item --git-dir
 
-Path to .git directory where the specified commit is found.
+Path to a .git directory (or a git URL) where the commit given in the
+job's C<script_version> attribute is to be found. If this is I<not>
+given, the job's C<repository> attribute will be used.
 
 =item --job-api-token
 
@@ -39,6 +43,11 @@ Do not clear per-job/task temporary directories during initial job
 setup. This can speed up development and debugging when running jobs
 locally.
 
+=item --job
+
+UUID of the job to run, or a JSON-encoded job resource without a
+UUID. If the latter is given, a new job object will be created.
+
 =back
 
 =head1 RUNNING JOBS LOCALLY
@@ -83,7 +92,7 @@ use IPC::Open2;
 use IO::Select;
 use File::Temp;
 use Fcntl ':flock';
-use File::Path qw( make_path );
+use File::Path qw( make_path remove_tree );
 
 use constant EX_TEMPFAIL => 75;
 
@@ -125,8 +134,7 @@ if (defined $job_api_token) {
 }
 
 my $have_slurm = exists $ENV{SLURM_JOBID} && exists $ENV{SLURM_NODELIST};
-my $job_has_uuid = $jobspec =~ /^[-a-z\d]+$/;
-my $local_job = !$job_has_uuid;
+my $local_job = 0;
 
 
 $SIG{'USR1'} = sub
@@ -142,21 +150,26 @@ $SIG{'USR2'} = sub
 
 my $arv = Arvados->new('apiVersion' => 'v1');
 
-my $User = $arv->{'users'}->{'current'}->execute;
-
-my $Job = {};
+my $Job;
 my $job_id;
 my $dbh;
 my $sth;
-if ($job_has_uuid)
+my @jobstep;
+
+my $User = retry_op(sub { $arv->{'users'}->{'current'}->execute; });
+
+if ($jobspec =~ /^[-a-z\d]+$/)
 {
-  $Job = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
+  # $jobspec is an Arvados UUID, not a JSON job specification
+  $Job = retry_op(sub {
+    $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
+  });
   if (!$force_unlock) {
     # Claim this job, and make sure nobody else does
-    eval {
+    eval { retry_op(sub {
       # lock() sets is_locked_by_uuid and changes state to Running.
       $arv->{'jobs'}->{'lock'}->execute('uuid' => $Job->{'uuid'})
-    };
+    }); };
     if ($@) {
       Log(undef, "Error while locking job, exiting ".EX_TEMPFAIL);
       exit EX_TEMPFAIL;
@@ -175,10 +188,9 @@ else
 
   $Job->{'is_locked_by_uuid'} = $User->{'uuid'};
   $Job->{'started_at'} = gmtime;
+  $Job->{'state'} = 'Running';
 
-  $Job = $arv->{'jobs'}->{'create'}->execute('job' => $Job);
-
-  $job_has_uuid = 1;
+  $Job = retry_op(sub { $arv->{'jobs'}->{'create'}->execute('job' => $Job); });
 }
 $job_id = $Job->{'uuid'};
 
@@ -291,7 +303,6 @@ $ENV{"CRUNCH_JOB_UUID"} = $job_id;
 $ENV{"JOB_UUID"} = $job_id;
 
 
-my @jobstep;
 my @jobstep_todo = ();
 my @jobstep_done = ();
 my @jobstep_tomerge = ();
@@ -309,12 +320,14 @@ if (defined $Job->{thawedfromkey})
 }
 else
 {
-  my $first_task = $arv->{'job_tasks'}->{'create'}->execute('job_task' => {
-    'job_uuid' => $Job->{'uuid'},
-    'sequence' => 0,
-    'qsequence' => 0,
-    'parameters' => {},
-                                                          });
+  my $first_task = retry_op(sub {
+    $arv->{'job_tasks'}->{'create'}->execute('job_task' => {
+      'job_uuid' => $Job->{'uuid'},
+      'sequence' => 0,
+      'qsequence' => 0,
+      'parameters' => {},
+    });
+  });
   push @jobstep, { 'level' => 0,
                   'failures' => 0,
                    'arvados_task' => $first_task,
@@ -330,137 +343,209 @@ if (!$have_slurm)
 
 
 my $build_script;
+do {
+  local $/ = undef;
+  $build_script = <DATA>;
+};
+my $nodelist = join(",", @node);
 
+if (!defined $no_clear_tmp) {
+  # Clean out crunch_tmp/work, crunch_tmp/opt, crunch_tmp/src*
+  Log (undef, "Clean work dirs");
 
-$ENV{"CRUNCH_SRC_COMMIT"} = $Job->{script_version};
-
-my $skip_install = ($local_job && $Job->{script_version} =~ m{^/});
-if ($skip_install)
-{
-  if (!defined $no_clear_tmp) {
-    my $clear_tmp_cmd = 'rm -rf $JOB_WORK $CRUNCH_TMP/opt $CRUNCH_TMP/src*';
-    system($clear_tmp_cmd) == 0
-       or croak ("`$clear_tmp_cmd` failed: ".($?>>8));
-  }
-  $ENV{"CRUNCH_SRC"} = $Job->{script_version};
-  for my $src_path ("$ENV{CRUNCH_SRC}/arvados/sdk/python") {
-    if (-d $src_path) {
-      system("virtualenv", "$ENV{CRUNCH_TMP}/opt") == 0
-          or croak ("virtualenv $ENV{CRUNCH_TMP}/opt failed: exit ".($?>>8));
-      system ("cd $src_path && ./build.sh && \$CRUNCH_TMP/opt/bin/python setup.py install")
-          == 0
-          or croak ("setup.py in $src_path failed: exit ".($?>>8));
-    }
+  my $cleanpid = fork();
+  if ($cleanpid == 0)
+  {
+    srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
+          ['bash', '-c', 'if mount | grep -q $JOB_WORK/; then for i in $JOB_WORK/*keep; do /bin/fusermount -z -u $i; done; fi; sleep 1; rm -rf $JOB_WORK $CRUNCH_TMP/opt $CRUNCH_TMP/src*']);
+    exit (1);
   }
+  while (1)
+  {
+    last if $cleanpid == waitpid (-1, WNOHANG);
+    freeze_if_want_freeze ($cleanpid);
+    select (undef, undef, undef, 0.1);
+  }
+  Log (undef, "Cleanup command exited ".exit_status_s($?));
 }
-else
-{
-  do {
-    local $/ = undef;
-    $build_script = <DATA>;
-  };
-  Log (undef, "Install revision ".$Job->{script_version});
-  my $nodelist = join(",", @node);
-
-  if (!defined $no_clear_tmp) {
-    # Clean out crunch_tmp/work, crunch_tmp/opt, crunch_tmp/src*
 
-    my $cleanpid = fork();
-    if ($cleanpid == 0)
-    {
-      srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
-           ['bash', '-c', 'if mount | grep -q $JOB_WORK/; then for i in $JOB_WORK/*keep; do /bin/fusermount -z -u $i; done; fi; sleep 1; rm -rf $JOB_WORK $CRUNCH_TMP/opt $CRUNCH_TMP/src*']);
-      exit (1);
-    }
-    while (1)
-    {
-      last if $cleanpid == waitpid (-1, WNOHANG);
-      freeze_if_want_freeze ($cleanpid);
-      select (undef, undef, undef, 0.1);
-    }
-    Log (undef, "Clean-work-dir exited $?");
-  }
 
-  # Install requested code version
+my $git_archive;
+if (!defined $git_dir && $Job->{'script_version'} =~ m{^/}) {
+  # If script_version looks like an absolute path, *and* the --git-dir
+  # argument was not given -- which implies we were not invoked by
+  # crunch-dispatch -- we will use the given path as a working
+  # directory instead of resolving script_version to a git commit (or
+  # doing anything else with git).
+  $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{'script_version'};
+  $ENV{"CRUNCH_SRC"} = $Job->{'script_version'};
+}
+else {
+  # Resolve the given script_version to a git commit sha1. Also, if
+  # the repository is remote, clone it into our local filesystem: this
+  # ensures "git archive" will work, and is necessary to reliably
+  # resolve a symbolic script_version like "master^".
+  $ENV{"CRUNCH_SRC"} = "$ENV{CRUNCH_TMP}/src";
 
-  my @execargs;
-  my @srunargs = ("srun",
-                 "--nodelist=$nodelist",
-                 "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
+  Log (undef, "Looking for version ".$Job->{script_version}." from repository ".$Job->{repository});
 
   $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{script_version};
-  $ENV{"CRUNCH_SRC"} = "$ENV{CRUNCH_TMP}/src";
-
-  my $commit;
-  my $git_archive;
-  my $treeish = $Job->{'script_version'};
 
-  # If we're running under crunch-dispatch, it will have pulled the
-  # appropriate source tree into its own repository, and given us that
-  # repo's path as $git_dir. If we're running a "local" job, and a
-  # script_version was specified, it's up to the user to provide the
-  # full path to a local repository in Job->{repository}.
+  # If we're running under crunch-dispatch, it will have already
+  # pulled the appropriate source tree into its own repository, and
+  # given us that repo's path as $git_dir.
   #
-  # TODO: Accept URLs too, not just local paths. Use git-ls-remote and
-  # git-archive --remote where appropriate.
+  # If we're running a "local" job, we might have to fetch content
+  # from a remote repository.
   #
-  # TODO: Accept a locally-hosted Arvados repository by name or
-  # UUID. Use arvados.v1.repositories.list or .get to figure out the
-  # appropriate fetch-url.
-  my $repo = $git_dir || $ENV{'CRUNCH_DEFAULT_GIT_DIR'} || $Job->{'repository'};
-
+  # (Currently crunch-dispatch gives a local path with --git-dir, but
+  # we might as well accept URLs there too in case it changes its
+  # mind.)
+  my $repo = $git_dir || $Job->{'repository'};
+
+  # Repository can be remote or local. If remote, we'll need to fetch it
+  # to a local dir before doing `git log` et al.
+  my $repo_location;
+
+  if ($repo =~ m{://|^[^/]*:}) {
+    # $repo is a git url we can clone, like git:// or https:// or
+    # file:/// or [user@]host:repo.git. Note "user/name@host:foo" is
+    # not recognized here because distinguishing that from a local
+    # path is too fragile. If you really need something strange here,
+    # use the ssh:// form.
+    $repo_location = 'remote';
+  } elsif ($repo =~ m{^\.*/}) {
+    # $repo is a local path to a git index. We'll also resolve ../foo
+    # to ../foo/.git if the latter is a directory. To help
+    # disambiguate local paths from named hosted repositories, this
+    # form must be given as ./ or ../ if it's a relative path.
+    if (-d "$repo/.git") {
+      $repo = "$repo/.git";
+    }
+    $repo_location = 'local';
+  } else {
+    # $repo is none of the above. It must be the name of a hosted
+    # repository.
+    my $arv_repo_list = retry_op(sub {
+      $arv->{'repositories'}->{'list'}->execute(
+        'filters' => [['name','=',$repo]]);
+    });
+    my @repos_found = @{$arv_repo_list->{'items'}};
+    my $n_found = $arv_repo_list->{'serverResponse'}->{'items_available'};
+    if ($n_found > 0) {
+      Log(undef, "Repository '$repo' -> "
+          . join(", ", map { $_->{'uuid'} } @repos_found));
+    }
+    if ($n_found != 1) {
+      croak("Error: Found $n_found repositories with name '$repo'.");
+    }
+    $repo = $repos_found[0]->{'fetch_url'};
+    $repo_location = 'remote';
+  }
+  Log(undef, "Using $repo_location repository '$repo'");
   $ENV{"CRUNCH_SRC_URL"} = $repo;
 
-  if (-d "$repo/.git") {
-    # We were given a working directory, but we are only interested in
-    # the index.
-    $repo = "$repo/.git";
-  }
+  # Resolve given script_version (we'll call that $treeish here) to a
+  # commit sha1 ($commit).
+  my $treeish = $Job->{'script_version'};
+  my $commit;
+  if ($repo_location eq 'remote') {
+    # We minimize excess object-fetching by re-using the same bare
+    # repository in CRUNCH_TMP/.git for multiple crunch-jobs -- we
+    # just keep adding remotes to it as needed.
+    my $local_repo = $ENV{'CRUNCH_TMP'}."/.git";
+    my $gitcmd = "git --git-dir=\Q$local_repo\E";
+
+    # Set up our local repo for caching remote objects, making
+    # archives, etc.
+    if (!-d $local_repo) {
+      make_path($local_repo) or croak("Error: could not create $local_repo");
+    }
+    # This works (exits 0 and doesn't delete fetched objects) even
+    # if $local_repo is already initialized:
+    `$gitcmd init --bare`;
+    if ($?) {
+      croak("Error: $gitcmd init --bare exited ".exit_status_s($?));
+    }
+
+    # If $treeish looks like a hash (or abbrev hash) we look it up in
+    # our local cache first, since that's cheaper. (We don't want to
+    # do that with tags/branches though -- those change over time, so
+    # they should always be resolved by the remote repo.)
+    if ($treeish =~ /^[0-9a-f]{7,40}$/s) {
+      # Hide stderr because it's normal for this to fail:
+      my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E 2>/dev/null`;
+      if ($? == 0 &&
+          # Careful not to resolve a branch named abcdeff to commit 1234567:
+          $sha1 =~ /^$treeish/ &&
+          $sha1 =~ /^([0-9a-f]{40})$/s) {
+        $commit = $1;
+        Log(undef, "Commit $commit already present in $local_repo");
+      }
+    }
+
+    if (!defined $commit) {
+      # If $treeish isn't just a hash or abbrev hash, or isn't here
+      # yet, we need to fetch the remote to resolve it correctly.
 
-  # If this looks like a subversion r#, look for it in git-svn commit messages
+      # First, remove all local heads. This prevents a name that does
+      # not exist on the remote from resolving to (or colliding with)
+      # a previously fetched branch or tag (possibly from a different
+      # remote).
+      remove_tree("$local_repo/refs/heads", {keep_root => 1});
 
-  if ($treeish =~ m{^\d{1,4}$}) {
-    my $gitlog = `git --git-dir=\Q$repo\E log --pretty="format:%H" --grep="git-svn-id:.*\@"\Q$treeish\E" " master`;
-    chomp $gitlog;
-    Log(undef, "git Subversion search exited $?");
-    if (($? == 0) && ($gitlog =~ /^[a-f0-9]{40}$/)) {
-      $commit = $gitlog;
-      Log(undef, "Using commit $commit for Subversion revision $treeish");
+      Log(undef, "Fetching objects from $repo to $local_repo");
+      `$gitcmd fetch --no-progress --tags ''\Q$repo\E \Q+refs/heads/*:refs/heads/*\E`;
+      if ($?) {
+        croak("Error: `$gitcmd fetch` exited ".exit_status_s($?));
+      }
     }
+
+    # Now that the data is all here, we will use our local repo for
+    # the rest of our git activities.
+    $repo = $local_repo;
   }
 
-  # If that didn't work, try asking git to look it up as a tree-ish.
-
-  if (!defined $commit) {
-    my $found = `git --git-dir=\Q$repo\E rev-list -1 ''\Q$treeish\E`;
-    chomp $found;
-    Log(undef, "git rev-list exited $? with result '$found'");
-    if (($? == 0) && ($found =~ /^[0-9a-f]{40}$/s)) {
-      $commit = $found;
-      Log(undef, "Using commit $commit for tree-ish $treeish");
-      if ($commit ne $treeish) {
-       # Make sure we record the real commit id in the database,
-       # frozentokey, logs, etc. -- instead of an abbreviation or a
-       # branch name which can become ambiguous or point to a
-       # different commit in the future.
-        $Job->{'script_version'} = $commit;
-        !$job_has_uuid or
-            $Job->update_attributes('script_version' => $commit) or
-            croak("Error while updating job");
-      }
+  my $gitcmd = "git --git-dir=\Q$repo\E";
+  my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E`;
+  unless ($? == 0 && $sha1 =~ /^([0-9a-f]{40})$/) {
+    croak("`$gitcmd rev-list` exited "
+          .exit_status_s($?)
+          .", '$treeish' not found. Giving up.");
+  }
+  $commit = $1;
+  Log(undef, "Version $treeish is commit $commit");
+
+  if ($commit ne $Job->{'script_version'}) {
+    # Record the real commit id in the database, frozentokey, logs,
+    # etc. -- instead of an abbreviation or a branch name which can
+    # become ambiguous or point to a different commit in the future.
+    if (!$Job->update_attributes('script_version' => $commit)) {
+      croak("Error: failed to update job's script_version attribute");
     }
   }
 
-  if (defined $commit) {
-    $ENV{"CRUNCH_SRC_COMMIT"} = $commit;
-    @execargs = ("sh", "-c",
-                "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
-    $git_archive = `git --git-dir=\Q$repo\E archive ''\Q$commit\E`;
-    croak("git archive failed: exit " . ($? >> 8)) if ($? != 0);
+  $ENV{"CRUNCH_SRC_COMMIT"} = $commit;
+  $git_archive = `$gitcmd archive ''\Q$commit\E`;
+  if ($?) {
+    croak("Error: $gitcmd archive exited ".exit_status_s($?));
   }
-  else {
-    croak ("could not figure out commit id for $treeish");
+}
+
+if (!defined $git_archive) {
+  Log(undef, "Skip install phase (no git archive)");
+  if ($have_slurm) {
+    Log(undef, "Warning: This probably means workers have no source tree!");
   }
+}
+else {
+  Log(undef, "Run install script on all workers");
+
+  my @srunargs = ("srun",
+                  "--nodelist=$nodelist",
+                  "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
+  my @execargs = ("sh", "-c",
+                  "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
 
   # Note: this section is almost certainly unnecessary if we're
   # running tasks in docker containers.
@@ -476,7 +561,7 @@ else
     freeze_if_want_freeze ($installpid);
     select (undef, undef, undef, 0.1);
   }
-  Log (undef, "Install exited $?");
+  Log (undef, "Install script exited ".exit_status_s($?));
 }
 
 if (!$have_slurm)
@@ -515,7 +600,8 @@ fi
   }
   if ($? != 0)
   {
-    croak("Installing Docker image from $docker_locator returned exit code $?");
+    croak("Installing Docker image from $docker_locator exited "
+          .exit_status_s($?));
   }
 }
 
@@ -833,12 +919,13 @@ else {
     while (my $manifest_line = <$orig_manifest>) {
       $orig_manifest_text .= $manifest_line;
     }
-    my $output = $arv->{'collections'}->{'create'}->execute('collection' => {
-      'manifest_text' => $orig_manifest_text,
+    my $output = retry_op(sub {
+      $arv->{'collections'}->{'create'}->execute(
+        'collection' => {'manifest_text' => $orig_manifest_text});
     });
     Log(undef, "output uuid " . $output->{uuid});
     Log(undef, "output hash " . $output->{portable_data_hash});
-    $Job->update_attributes('output' => $output->{portable_data_hash}) if $job_has_uuid;
+    $Job->update_attributes('output' => $output->{portable_data_hash});
   };
   if ($@) {
     Log (undef, "Failed to register output manifest: $@");
@@ -849,15 +936,15 @@ Log (undef, "finish");
 
 save_meta();
 
-if ($job_has_uuid) {
-  if ($collated_output && $main::success) {
-    $Job->update_attributes('state' => 'Complete')
-  } else {
-    $Job->update_attributes('state' => 'Failed')
-  }
+my $final_state;
+if ($collated_output && $main::success) {
+  $final_state = 'Complete';
+} else {
+  $final_state = 'Failed';
 }
+$Job->update_attributes('state' => $final_state);
 
-exit ($Job->{'state'} != 'Complete' ? 1 : 0);
+exit (($final_state eq 'Complete') ? 0 : 1);
 
 
 
@@ -872,9 +959,7 @@ sub update_progress_stats
   $Job->{'tasks_summary'}->{'todo'} = $todo;
   $Job->{'tasks_summary'}->{'done'} = $done;
   $Job->{'tasks_summary'}->{'running'} = $running;
-  if ($job_has_uuid) {
-    $Job->update_attributes('tasks_summary' => $Job->{'tasks_summary'});
-  }
+  $Job->update_attributes('tasks_summary' => $Job->{'tasks_summary'});
   Log (undef, "status: $done done, $running running, $todo todo");
   $progress_is_dirty = 0;
 }
@@ -895,10 +980,7 @@ sub reapchildren
 
   my $childstatus = $?;
   my $exitvalue = $childstatus >> 8;
-  my $exitinfo = sprintf("exit %d signal %d%s",
-                         $exitvalue,
-                         $childstatus & 127,
-                         ($childstatus & 128 ? ' core dump' : ''));
+  my $exitinfo = "exit ".exit_status_s($childstatus);
   $Jobstep->{'arvados_task'}->reload;
   my $task_success = $Jobstep->{'arvados_task'}->{success};
 
@@ -944,10 +1026,8 @@ sub reapchildren
       $main::success = 0;
       $main::please_freeze = 1;
     }
-    else {
-      # Put this task back on the todo queue
-      push @jobstep_todo, $jobstepid;
-    }
+    # Put this task back on the todo queue
+    push @jobstep_todo, $jobstepid;
     $Job->{'tasks_summary'}->{'failed'}++;
   }
   else
@@ -976,13 +1056,15 @@ sub reapchildren
     my $newtask_list = [];
     my $newtask_results;
     do {
-      $newtask_results = $arv->{'job_tasks'}->{'list'}->execute(
-        'where' => {
-          'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
-        },
-        'order' => 'qsequence',
-        'offset' => scalar(@$newtask_list),
-      );
+      $newtask_results = retry_op(sub {
+        $arv->{'job_tasks'}->{'list'}->execute(
+          'where' => {
+            'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
+          },
+          'order' => 'qsequence',
+          'offset' => scalar(@$newtask_list),
+        );
+      });
       push(@$newtask_list, @{$newtask_results->{items}});
     } while (@{$newtask_results->{items}});
     foreach my $arvados_task (@$newtask_list) {
@@ -1005,23 +1087,23 @@ sub check_refresh_wanted
   my @stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
   if (@stat && $stat[9] > $latest_refresh) {
     $latest_refresh = scalar time;
-    if ($job_has_uuid) {
-      my $Job2 = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
-      for my $attr ('cancelled_at',
-                    'cancelled_by_user_uuid',
-                    'cancelled_by_client_uuid',
-                    'state') {
-        $Job->{$attr} = $Job2->{$attr};
-      }
-      if ($Job->{'state'} ne "Running") {
-        if ($Job->{'state'} eq "Cancelled") {
-          Log (undef, "Job cancelled at " . $Job->{'cancelled_at'} . " by user " . $Job->{'cancelled_by_user_uuid'});
-        } else {
-          Log (undef, "Job state unexpectedly changed to " . $Job->{'state'});
-        }
-        $main::success = 0;
-        $main::please_freeze = 1;
+    my $Job2 = retry_op(sub {
+      $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
+    });
+    for my $attr ('cancelled_at',
+                  'cancelled_by_user_uuid',
+                  'cancelled_by_client_uuid',
+                  'state') {
+      $Job->{$attr} = $Job2->{$attr};
+    }
+    if ($Job->{'state'} ne "Running") {
+      if ($Job->{'state'} eq "Cancelled") {
+        Log (undef, "Job cancelled at " . $Job->{'cancelled_at'} . " by user " . $Job->{'cancelled_by_user_uuid'});
+      } else {
+        Log (undef, "Job state unexpectedly changed to " . $Job->{'state'});
       }
+      $main::success = 0;
+      $main::please_freeze = 1;
     }
   }
 }
@@ -1188,7 +1270,7 @@ sub collate_output
 
   my ($child_out, $child_in);
   my $pid = open2($child_out, $child_in, 'arv-put', '--raw',
-                  '--retries', put_retry_count());
+                  '--retries', retry_count());
   my $joboutput;
   for (@jobstep)
   {
@@ -1223,10 +1305,13 @@ sub collate_output
     if ($s->can_read(120)) {
       sysread($child_out, $joboutput, 64 * 1024 * 1024);
       chomp($joboutput);
+      # TODO: Ensure exit status == 0.
     } else {
       Log (undef, "timed out reading from 'arv-put'");
     }
   }
+  # TODO: kill $pid instead of waiting, now that we've decided to
+  # ignore further output.
   waitpid($pid, 0);
 
   return $joboutput;
@@ -1331,7 +1416,7 @@ sub log_writer_finish()
   waitpid($log_pipe_pid, 0);
   $log_pipe_pid = $log_pipe_in = $log_pipe_out = undef;
   if ($?) {
-    Log("log_writer_finish: arv-put returned error $?")
+    Log("log_writer_finish: arv-put exited ".exit_status_s($?))
   }
 
   return $arv_put_output;
@@ -1375,14 +1460,14 @@ sub croak
   freeze() if @jobstep_todo;
   collate_output() if @jobstep_todo;
   cleanup();
-  save_meta() if log_writer_is_active();
+  save_meta();
   die;
 }
 
 
 sub cleanup
 {
-  return if !$job_has_uuid;
+  return unless $Job;
   if ($Job->{'state'} eq 'Cancelled') {
     $Job->update_attributes('finished_at' => scalar gmtime);
   } else {
@@ -1395,11 +1480,12 @@ sub save_meta
 {
   my $justcheckpoint = shift; # false if this will be the last meta saved
   return if $justcheckpoint;  # checkpointing is not relevant post-Warehouse.pm
+  return unless log_writer_is_active();
 
   my $loglocator = log_writer_finish();
   Log (undef, "log manifest is $loglocator");
   $Job->{'log'} = $loglocator;
-  $Job->update_attributes('log', $loglocator) if $job_has_uuid;
+  $Job->update_attributes('log', $loglocator);
 }
 
 
@@ -1427,7 +1513,7 @@ sub freeze_if_want_freeze
     collate_output();
     cleanup();
     save_meta();
-    exit 0;
+    exit 1;
   }
 }
 
@@ -1516,7 +1602,10 @@ sub find_docker_image {
   # If not, return undef for both values.
   my $locator = shift;
   my ($streamname, $filename);
-  if (my $image = $arv->{collections}->{get}->execute(uuid => $locator)) {
+  my $image = retry_op(sub {
+    $arv->{collections}->{get}->execute(uuid => $locator);
+  });
+  if ($image) {
     foreach my $line (split(/\n/, $image->{manifest_text})) {
       my @tokens = split(/\s+/, $line);
       next if (!@tokens);
@@ -1537,20 +1626,66 @@ sub find_docker_image {
   }
 }
 
-sub put_retry_count {
-  # Calculate a --retries argument for arv-put that will have it try
-  # approximately as long as this Job has been running.
-  my $stoptime = shift || time;
-  my $starttime = $jobstep[0]->{starttime};
-  my $timediff = defined($starttime) ? ($stoptime - $starttime) : 1;
-  my $retries = 0;
-  while ($timediff >= 2) {
-    $retries++;
-    $timediff /= 2;
+sub retry_count {
+  # Calculate the number of times an operation should be retried,
+  # assuming exponential backoff, and that we're willing to retry as
+  # long as tasks have been running.  Enforce a minimum of 3 retries.
+  my ($starttime, $endtime, $timediff, $retries);
+  if (@jobstep) {
+    $starttime = $jobstep[0]->{starttime};
+    $endtime = $jobstep[-1]->{finishtime};
+  }
+  if (!defined($starttime)) {
+    $timediff = 0;
+  } elsif (!defined($endtime)) {
+    $timediff = time - $starttime;
+  } else {
+    $timediff = ($endtime - $starttime) - (time - $endtime);
+  }
+  if ($timediff > 0) {
+    $retries = int(log($timediff) / log(2));
+  } else {
+    $retries = 1;  # Use the minimum.
   }
   return ($retries > 3) ? $retries : 3;
 }
 
+sub retry_op {
+  # Given a function reference, call it with the remaining arguments.  If
+  # it dies, retry it with exponential backoff until it succeeds, or until
+  # the current retry_count is exhausted.
+  my $operation = shift;
+  my $retries = retry_count();
+  foreach my $try_count (0..$retries) {
+    my $next_try = time + (2 ** $try_count);
+    my $result = eval { $operation->(@_); };
+    if (!$@) {
+      return $result;
+    } elsif ($try_count < $retries) {
+      my $sleep_time = $next_try - time;
+      sleep($sleep_time) if ($sleep_time > 0);
+    }
+  }
+  # Ensure the error message ends in a newline, so Perl doesn't add
+  # retry_op's line number to it.
+  chomp($@);
+  die($@ . "\n");
+}
+
+sub exit_status_s {
+  # Given a $?, return a human-readable exit code string like "0" or
+  # "1" or "0 with signal 1" or "1 with signal 11".
+  my $exitcode = shift;
+  my $s = $exitcode >> 8;
+  if ($exitcode & 0x7f) {
+    $s .= " with signal " . ($exitcode & 0x7f);
+  }
+  if ($exitcode & 0x80) {
+    $s .= " with core dump";
+  }
+  return $s;
+}
+
 __DATA__
 #!/usr/bin/perl
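The retry plumbing added to crunch-job above reduces to two small ideas: retry_count sizes the budget as roughly log2 of how long tasks have been running (minimum 3), and retry_op retries a failing call with exponential backoff. A minimal Python rendering of both, for illustration only (the names and the fixed budget here are not part of the commit):

    import time

    def retry_op(operation, retries=3):
        # Try operation(); on failure, back off 1s, 2s, 4s, ... until
        # the budget is spent, then re-raise -- as in the Perl above.
        for try_count in range(retries + 1):
            next_try = time.time() + 2 ** try_count
            try:
                return operation()
            except Exception:
                if try_count == retries:
                    raise
                delay = next_try - time.time()
                if delay > 0:
                    time.sleep(delay)

    def exit_status_s(wait_status):
        # Decode a wait()-style status into "N", "N with signal S",
        # optionally "... with core dump", matching exit_status_s.
        s = str(wait_status >> 8)
        if wait_status & 0x7f:
            s += " with signal %d" % (wait_status & 0x7f)
        if wait_status & 0x80:
            s += " with core dump"
        return s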
 
index cb716f1709fe39b2c22519f6d9ebc304f4c42f4e..c618fc3c6623ef18e8663609340b55bcce219f78 100644 (file)
@@ -6,8 +6,8 @@ import re
 import types
 
 import apiclient
-import apiclient.discovery
-import apiclient.errors
+from apiclient import discovery as apiclient_discovery
+from apiclient import errors as apiclient_errors
 import config
 import errors
 import util
@@ -47,7 +47,7 @@ class CredentialsFromToken(object):
 
 # Monkey patch discovery._cast() so objects and arrays get serialized
 # with json.dumps() instead of str().
-_cast_orig = apiclient.discovery._cast
+_cast_orig = apiclient_discovery._cast
 def _cast_objects_too(value, schema_type):
     global _cast_orig
     if (type(value) != type('') and
@@ -55,16 +55,16 @@ def _cast_objects_too(value, schema_type):
         return json.dumps(value)
     else:
         return _cast_orig(value, schema_type)
-apiclient.discovery._cast = _cast_objects_too
+apiclient_discovery._cast = _cast_objects_too
 
 # Convert apiclient's HttpErrors into our own API error subclass for better
 # error reporting.
-# Reassigning apiclient.errors.HttpError is not sufficient because most of the
+# Reassigning apiclient_errors.HttpError is not sufficient because most of the
 # apiclient submodules import the class into their own namespace.
 def _new_http_error(cls, *args, **kwargs):
-    return super(apiclient.errors.HttpError, cls).__new__(
+    return super(apiclient_errors.HttpError, cls).__new__(
         errors.ApiError, *args, **kwargs)
-apiclient.errors.HttpError.__new__ = staticmethod(_new_http_error)
+apiclient_errors.HttpError.__new__ = staticmethod(_new_http_error)
 
 def http_cache(data_type):
     path = os.environ['HOME'] + '/.cache/arvados/' + data_type
@@ -90,7 +90,7 @@ def api(version=None, cache=True, host=None, token=None, insecure=False, **kwarg
     * insecure: If True, ignore SSL certificate validation errors.
 
     Additional keyword arguments will be passed directly to
-    `apiclient.discovery.build` if a new Resource object is created.
+    `apiclient_discovery.build` if a new Resource object is created.
     If the `discoveryServiceUrl` or `http` keyword arguments are
     missing, this function will set default values for them, based on
     the current Arvados configuration settings.
@@ -153,7 +153,7 @@ def api(version=None, cache=True, host=None, token=None, insecure=False, **kwarg
     credentials = CredentialsFromToken(api_token=token)
     kwargs['http'] = credentials.authorize(kwargs['http'])
 
-    svc = apiclient.discovery.build('arvados', version, **kwargs)
+    svc = apiclient_discovery.build('arvados', version, **kwargs)
     svc.api_token = token
     kwargs['http'].cache = None
     if cache:
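The error-conversion hunk above works because reassigning HttpError.__new__ takes effect for every module that already imported the class: instance construction looks the attribute up at call time, not import time. A self-contained sketch of the same trick (class names invented for illustration):

    class Base(Exception):
        pass

    class Friendly(Base):
        def reason(self):
            return "friendlier: %s" % (self.args,)

    def _new_base(cls, *args, **kwargs):
        # Every Base(...) construction now yields a Friendly, no matter
        # which alias of Base the calling module imported.
        return super(Base, cls).__new__(Friendly, *args, **kwargs)

    Base.__new__ = staticmethod(_new_base)

    assert isinstance(Base("boom"), Friendly)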
index c0913747108d24ea679a75451b73dcf78d642604..c36da3b36010d9c1a5de4848a23f5713f04feee9 100644 (file)
@@ -9,6 +9,7 @@ import subprocess
 import sys
 import tarfile
 import tempfile
+import _strptime
 
 from collections import namedtuple
 from stat import *
@@ -116,7 +117,8 @@ def stat_cache_name(image_file):
     return getattr(image_file, 'name', image_file) + '.stat'
 
 def pull_image(image_name, image_tag):
-    check_docker(popen_docker(['pull', '-t', image_tag, image_name]), "pull")
+    check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
+                 "pull")
 
 def save_image(image_hash, image_file):
     # Save the specified Docker image to image_file, then try to save its
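The pull now uses the repository:tag form instead of the withdrawn "docker pull -t" syntax. A standalone equivalent, assuming only that the docker CLI is on PATH:

    import subprocess

    def pull_image(image_name, image_tag):
        # "docker pull name:tag" is accepted by old and new Docker
        # clients alike; "-t" was dropped from the pull subcommand.
        subprocess.check_call(['docker', 'pull',
                               '{}:{}'.format(image_name, image_tag)])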
index 00994872f3d414253c1a91bb801ff9c1c23903c6..4a926c701c9f45648dc6337fc294391986e03379 100644 (file)
@@ -3,7 +3,6 @@
 # TODO:
 # --md5sum - display md5 of each file as read from disk
 
-import apiclient.errors
 import argparse
 import arvados
 import base64
@@ -18,6 +17,7 @@ import signal
 import socket
 import sys
 import tempfile
+from apiclient import errors as apiclient_errors
 
 import arvados.commands._util as arv_cmd
 
@@ -364,11 +364,11 @@ def desired_project_uuid(api_client, project_uuid, num_retries):
 
 def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     global api_client
-    if api_client is None:
-        api_client = arvados.api('v1')
-    status = 0
 
     args = parse_arguments(arguments)
+    status = 0
+    if api_client is None:
+        api_client = arvados.api('v1')
 
     # Determine the name to use
     if args.name:
@@ -390,7 +390,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     try:
         project_uuid = desired_project_uuid(api_client, args.project_uuid,
                                             args.retries)
-    except (apiclient.errors.Error, ValueError) as error:
+    except (apiclient_errors.Error, ValueError) as error:
         print >>stderr, error
         sys.exit(1)
 
@@ -468,7 +468,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
             else:
                 output = collection['uuid']
 
-        except apiclient.errors.Error as error:
+        except apiclient_errors.Error as error:
             print >>stderr, (
                 "arv-put: Error creating Collection on project: {}.".format(
                     error))
diff --git a/sdk/python/arvados/commands/ws.py b/sdk/python/arvados/commands/ws.py
new file mode 100644 (file)
index 0000000..674daad
--- /dev/null
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+
+import sys
+import logging
+import argparse
+import arvados
+import json
+from arvados.events import subscribe
+import signal
+
+def main(arguments=None):
+    logger = logging.getLogger('arvados.arv-ws')
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-u', '--uuid', type=str, default="", help="Filter events on object_uuid")
+    parser.add_argument('-f', '--filters', type=str, default="", help="Arvados query filter to apply to log events (JSON encoded)")
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--poll-interval', default=15, type=int, help="If websockets are not available, poll the log table at this interval; default is every 15 seconds")
+    group.add_argument('--no-poll', action='store_false', dest='poll_interval', help="Do not poll if websockets are not available, just fail")
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('-p', '--pipeline', type=str, default="", help="Supply a pipeline uuid; print log output from the pipeline and its jobs")
+    group.add_argument('-j', '--job', type=str, default="", help="Supply a job uuid; print log output from that job")
+
+    args = parser.parse_args(arguments)
+
+    global filters
+    global known_component_jobs
+    global ws
+
+    filters = []
+    known_component_jobs = set()
+    ws = None
+
+    def update_subscribed_components(components):
+        global known_component_jobs
+        global filters
+        pipeline_jobs = set()
+        for c in components:
+            if "job" in components[c]:
+                pipeline_jobs.add(components[c]["job"]["uuid"])
+        if known_component_jobs != pipeline_jobs:
+            ws.unsubscribe(filters)
+            filters = [['object_uuid', 'in', [args.pipeline] + list(pipeline_jobs)]]
+            ws.subscribe([['object_uuid', 'in', [args.pipeline] + list(pipeline_jobs)]])
+            known_component_jobs = pipeline_jobs
+
+    api = arvados.api('v1', cache=False)
+
+    if args.uuid:
+        filters += [ ['object_uuid', '=', args.uuid] ]
+
+    if args.filters:
+        filters += json.loads(args.filters)
+
+    if args.job:
+        filters += [ ['object_uuid', '=', args.job] ]
+
+    if args.pipeline:
+        filters += [ ['object_uuid', '=', args.pipeline] ]
+
+    def on_message(ev):
+        global filters
+        global ws
+
+        logger.debug(ev)
+        if 'event_type' in ev and (args.pipeline or args.job):
+            if ev['event_type'] in ('stderr', 'stdout'):
+                sys.stdout.write(ev["properties"]["text"])
+            elif ev["event_type"] in ("create", "update"):
+                if ev["object_kind"] == "arvados#pipelineInstance":
+                    update_subscribed_components(ev["properties"]["new_attributes"]["components"])
+        elif 'status' in ev and ev['status'] == 200:
+            pass
+        else:
+            print json.dumps(ev)
+
+    try:
+        ws = subscribe(arvados.api('v1', cache=False), filters, on_message, poll_fallback=args.poll_interval)
+        if ws:
+            if args.pipeline:
+                c = api.pipeline_instances().get(uuid=args.pipeline).execute()
+                update_subscribed_components(c["components"])
+
+            while True:
+                signal.pause()
+    except KeyboardInterrupt:
+        pass
+    except Exception as e:
+        logger.error(e)
+    finally:
+        if ws:
+            ws.close()
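Besides backing the arv-ws script, the new module can be driven directly; a hedged sketch (the pipeline UUID is a placeholder, and main() blocks until interrupted):

    from arvados.commands.ws import main

    # Equivalent to: arv-ws --pipeline <uuid> --poll-interval 5
    main(['--pipeline', 'zzzzz-d1hrv-0123456789abcde',
          '--poll-interval', '5'])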
index 2b8374afb5c493cbf0eaeecc4d95da5325433865..ea45a48813395851acd878f815562d7ff7408842 100644 (file)
@@ -7,7 +7,10 @@ import os
 import re
 
 _settings = None
-default_config_file = os.environ['HOME'] + '/.config/arvados/settings.conf'
+if os.environ.get('HOME') is not None:
+    default_config_file = os.environ['HOME'] + '/.config/arvados/settings.conf'
+else:
+    default_config_file = ''
 
 EMPTY_BLOCK_LOCATOR = 'd41d8cd98f00b204e9800998ecf8427e+0'
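The same defensive pattern, shown standalone (path as above); HOME is legitimately unset for system daemons and some cron jobs, so the fallback avoids a KeyError at import time:

    import os

    home = os.environ.get('HOME')
    default_config_file = (
        home + '/.config/arvados/settings.conf' if home else '')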
 
index 89910aa60f389f7e61ecfc9d4be6d492f09ad25f..4740a2d91962f31f6aeea1638c8d9582a2b35f50 100644 (file)
@@ -1,9 +1,9 @@
 # errors.py - Arvados-specific exceptions.
 
-import apiclient.errors
 import json
+from apiclient import errors as apiclient_errors
 
-class ApiError(apiclient.errors.HttpError):
+class ApiError(apiclient_errors.HttpError):
     def _get_reason(self):
         try:
             return '; '.join(json.loads(self.content)['errors'])
index b7d610d66e729a9b9cb3a27b29a9abc51395493b..e6038fcd7d7c7b38845dec0a8653b19f259eb3f5 100644 (file)
@@ -1,5 +1,5 @@
 from ws4py.client.threadedclient import WebSocketClient
-import thread
+import threading
 import json
 import os
 import time
@@ -7,6 +7,7 @@ import ssl
 import re
 import config
 import logging
+import arvados
 
 _logger = logging.getLogger('arvados.events')
 
@@ -18,13 +19,12 @@ class EventClient(WebSocketClient):
             ssl_options={'cert_reqs': ssl.CERT_NONE}
         else:
             ssl_options={'cert_reqs': ssl.CERT_REQUIRED}
-
-        super(EventClient, self).__init__(url, ssl_options)
+        super(EventClient, self).__init__(url, ssl_options=ssl_options)
         self.filters = filters
         self.on_event = on_event
 
     def opened(self):
-        self.send(json.dumps({"method": "subscribe", "filters": self.filters}))
+        self.subscribe(self.filters)
 
     def received_message(self, m):
         self.on_event(json.loads(str(m)))
@@ -36,14 +36,83 @@ class EventClient(WebSocketClient):
         except:
             pass
 
-def subscribe(api, filters, on_event):
+    def subscribe(self, filters, last_log_id=None):
+        m = {"method": "subscribe", "filters": filters}
+        if last_log_id is not None:
+            m["last_log_id"] = last_log_id
+        self.send(json.dumps(m))
+
+    def unsubscribe(self, filters):
+        self.send(json.dumps({"method": "unsubscribe", "filters": filters}))
+
+class PollClient(threading.Thread):
+    def __init__(self, api, filters, on_event, poll_time):
+        super(PollClient, self).__init__()
+        self.api = api
+        if filters:
+            self.filters = [filters]
+        else:
+            self.filters = [[]]
+        self.on_event = on_event
+        self.poll_time = poll_time
+        self.stop = threading.Event()
+
+    def run(self):
+        self.id = 0
+        for f in self.filters:
+            items = self.api.logs().list(limit=1, order="id desc", filters=f).execute()['items']
+            if items:
+                if items[0]['id'] > self.id:
+                    self.id = items[0]['id']
+
+        self.on_event({'status': 200})
+
+        while not self.stop.isSet():
+            max_id = self.id
+            for f in self.filters:
+                items = self.api.logs().list(order="id asc", filters=f+[["id", ">", str(self.id)]]).execute()['items']
+                for i in items:
+                    if i['id'] > max_id:
+                        max_id = i['id']
+                    self.on_event(i)
+            self.id = max_id
+            self.stop.wait(self.poll_time)
+
+    def close(self):
+        self.stop.set()
+        self.join()
+
+    def subscribe(self, filters):
+        self.on_event({'status': 200})
+        self.filters.append(filters)
+
+    def unsubscribe(self, filters):
+        self.filters.remove(filters)
+
+
+def subscribe(api, filters, on_event, poll_fallback=15):
+    '''
+    api: Must be a freshly created arvados.api(cache=False) object, not shared with the caller, as it may be used by a background thread.
+    filters: Initial subscription filters.
+    on_event: Callback invoked for each message received.
+    poll_fallback: If websockets are not available, fall back to polling every N seconds.  If poll_fallback=False, this will return None if websockets are not available.
+    '''
     ws = None
-    try:
-        url = "{}?api_token={}".format(api._rootDesc['websocketUrl'], config.get('ARVADOS_API_TOKEN'))
-        ws = EventClient(url, filters, on_event)
-        ws.connect()
-        return ws
-    except Exception:
-        if (ws):
-          ws.close_connection()
-        raise
+    if 'websocketUrl' in api._rootDesc:
+        try:
+            url = "{}?api_token={}".format(api._rootDesc['websocketUrl'], api.api_token)
+            ws = EventClient(url, filters, on_event)
+            ws.connect()
+            return ws
+        except Exception as e:
+            _logger.warn("Got exception %s trying to connect to websockets at %s" % (e, api._rootDesc['websocketUrl']))
+            if ws:
+                ws.close_connection()
+    if poll_fallback:
+        _logger.warn("Websockets not available, falling back to log table polling")
+        p = PollClient(api, filters, on_event, poll_fallback)
+        p.start()
+        return p
+    else:
+        _logger.error("Websockets not available")
+        return None
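A usage sketch for the new subscribe() signature, assuming a reachable API server and valid credentials (the filter and handler are illustrative):

    import arvados
    import arvados.events

    def on_message(ev):
        # ev is the {'status': 200} handshake or a logs-table row.
        if 'event_type' in ev:
            print ev['event_type'], ev.get('object_uuid')

    # Per the docstring above, pass a fresh client that the caller
    # does not share, since a background thread may use it.
    client = arvados.events.subscribe(
        arvados.api('v1', cache=False),
        [['object_uuid', 'is_a', 'arvados#collection']],
        on_message,
        poll_fallback=15)
    if client:
        try:
            raw_input("Press Enter to stop following events...")
        finally:
            client.close()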
index ce7f066ec7682ff8b4fe3cdeaa59e0656db68716..4e663cef1bc2bd705ed7651c3185aa743637b4b7 100755 (executable)
@@ -1,30 +1,4 @@
 #!/usr/bin/env python
 
-import sys
-import logging
-import argparse
-import arvados
-from arvados.events import subscribe
-
-logger = logging.getLogger('arvados.arv-ws')
-
-parser = argparse.ArgumentParser()
-parser.add_argument('-u', '--uuid', type=str, default="")
-args = parser.parse_args()
-
-filters = []
-if len(args.uuid)>0: filters = [ ['object_uuid', '=', args.uuid] ]
-
-api = arvados.api('v1', cache=False)
-
-def on_message(ev):
-  print "\n", ev
-
-ws = None
-try:
-  ws = subscribe(api, filters, lambda ev: on_message(ev))
-  ws.run_forever()
-except Exception:
-  logger.exception('')
-  if (ws):
-    ws.close_connection()
+from arvados.commands.ws import main
+main()
index 0ad6725831deb78801cd497c4d4fd149024da706..bd3419c89dae47d857c22cd199c49f1722c3f8e0 100644 (file)
@@ -231,7 +231,13 @@ def fixture(fix):
     '''load a fixture yaml file'''
     with open(os.path.join(SERVICES_SRC_DIR, 'api', "test", "fixtures",
                            fix + ".yml")) as f:
-        return yaml.load(f.read())
+        yaml_file = f.read()
+        try:
+          trim_index = yaml_file.index("# Test Helper trims the rest of the file")
+          yaml_file = yaml_file[0:trim_index]
+        except ValueError:
+          pass
+        return yaml.load(yaml_file)
 
 def authorize_with(token):
     '''token is the symbolic name of the token from the api_client_authorizations fixture'''
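A condensed sketch of the trimming convention the fixture files rely on (the marker string matches the fixtures; the function name is invented):

    import yaml

    TRIM_MARK = "# Test Helper trims the rest of the file"

    def load_trimmed_fixture(path):
        # Generated bulk fixtures live below the marker and are
        # ignored here, exactly as in fixture() above.
        with open(path) as f:
            text = f.read()
        return yaml.load(text.split(TRIM_MARK, 1)[0])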
index e9cb838cb37022ecad51ec669c91b643bd7fafef..0d81fdf738caf3d3120725479c8b110d27204c66 100644 (file)
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 
-import apiclient.errors
 import arvados
 import httplib2
 import json
@@ -8,8 +7,9 @@ import mimetypes
 import os
 import run_test_server
 import unittest
+from apiclient import errors as apiclient_errors
+from apiclient import http as apiclient_http
 
-from apiclient.http import RequestMockBuilder
 from arvados_testutil import fake_httplib2_response
 
 if not mimetypes.inited:
@@ -41,7 +41,7 @@ class ArvadosApiClientTest(unittest.TestCase):
             'arvados.humans.list': (None, json.dumps(
                     {'items_available': 0, 'items': []})),
             }
-        req_builder = RequestMockBuilder(mock_responses)
+        req_builder = apiclient_http.RequestMockBuilder(mock_responses)
         cls.api = arvados.api('v1', cache=False,
                               host=os.environ['ARVADOS_API_HOST'],
                               token='discovery-doc-only-no-token-needed',
@@ -58,14 +58,14 @@ class ArvadosApiClientTest(unittest.TestCase):
         self.assertEqual(answer['items_available'], len(answer['items']))
 
     def test_exceptions_include_errors(self):
-        with self.assertRaises(apiclient.errors.HttpError) as err_ctx:
+        with self.assertRaises(apiclient_errors.HttpError) as err_ctx:
             self.api.humans().get(uuid='xyz-xyz-abcdef').execute()
         err_s = str(err_ctx.exception)
         for msg in ["Bad UUID format", "Bad output format"]:
             self.assertIn(msg, err_s)
 
     def test_exceptions_without_errors_have_basic_info(self):
-        with self.assertRaises(apiclient.errors.HttpError) as err_ctx:
+        with self.assertRaises(apiclient_errors.HttpError) as err_ctx:
             self.api.humans().delete(uuid='xyz-xyz-abcdef').execute()
         self.assertIn("500", str(err_ctx.exception))
 
index 1dae978c843b052a19469c1e8dfc0ee876a0ede9..032ac51f0d445a5b03e751cf569f5c835307c367 100644 (file)
@@ -2,27 +2,51 @@ import run_test_server
 import unittest
 import arvados
 import arvados.events
-import time
-
-class WebsocketTest(run_test_server.TestCaseWithServers):
-    MAIN_SERVER = {'websockets': True}
+import threading
 
+class EventTestBase(object):
     def on_event(self, ev):
         if self.state == 1:
             self.assertEqual(200, ev['status'])
             self.state = 2
+            self.subscribed.set()
         elif self.state == 2:
             self.assertEqual(self.h[u'uuid'], ev[u'object_uuid'])
             self.state = 3
+            self.done.set()
         elif self.state == 3:
             self.fail()
 
     def runTest(self):
+        self.ws = None
         self.state = 1
+        self.subscribed = threading.Event()
+        self.done = threading.Event()
 
         run_test_server.authorize_with("admin")
         api = arvados.api('v1', cache=False)
-        arvados.events.subscribe(api, [['object_uuid', 'is_a', 'arvados#human']], lambda ev: self.on_event(ev))
-        time.sleep(1)
+        self.ws = arvados.events.subscribe(arvados.api('v1', cache=False), [['object_uuid', 'is_a', 'arvados#human']], self.on_event, poll_fallback=2)
+        self.assertIsInstance(self.ws, self.WS_TYPE)
+        self.subscribed.wait(10)
         self.h = api.humans().create(body={}).execute()
-        time.sleep(1)
+        self.done.wait(10)
+        self.assertEqual(3, self.state)
+
+class WebsocketTest(run_test_server.TestCaseWithServers, EventTestBase):
+    MAIN_SERVER = {'websockets': True}
+    WS_TYPE = arvados.events.EventClient
+
+    def tearDown(self):
+        if self.ws:
+            self.ws.close()
+        super(WebsocketTest, self).tearDown()
+
+
+class PollClientTest(run_test_server.TestCaseWithServers, EventTestBase):
+    MAIN_SERVER = {}
+    WS_TYPE = arvados.events.PollClient
+
+    def tearDown(self):
+        if self.ws:
+            self.ws.close()
+        super(PollClientTest, self).tearDown()
index 67cf109ff3fa2d343ac2a2bbda8dc9d6b8fa2053..996d3fc7da560cca5c163a6c3e9dee558d2359b9 100644 (file)
@@ -72,7 +72,7 @@ gem 'database_cleaner'
 gem 'themes_for_rails'
 
 gem 'arvados', '>= 0.1.20140919104705'
-gem 'arvados-cli', '>= 0.1.20140919104705'
+gem 'arvados-cli', '>= 0.1.20141014201516'
 
 # pg_power lets us use partial indexes in schema.rb in Rails 3
 gem 'pg_power'
index bb327d05576d7374d51647ffb066785000087276..ce79f854dea56e3006fe049ee52988c690839ded 100644 (file)
@@ -41,12 +41,12 @@ GEM
       google-api-client (~> 0.6.3)
       json (>= 1.7.7)
       jwt (>= 0.1.5, < 1.0.0)
-    arvados-cli (0.1.20140919104705)
+    arvados-cli (0.1.20141014201516)
       activesupport (~> 3.2, >= 3.2.13)
       andand (~> 1.3, >= 1.3.3)
-      arvados (~> 0.1.0)
+      arvados (~> 0.1, >= 0.1.0)
       curb (~> 0.8)
-      google-api-client (~> 0.6.3)
+      google-api-client (~> 0.6, >= 0.6.3)
       json (~> 1.7, >= 1.7.7)
       jwt (>= 0.1.5, < 1.0.0)
       oj (~> 2.0, >= 2.0.3)
@@ -224,7 +224,7 @@ DEPENDENCIES
   acts_as_api
   andand
   arvados (>= 0.1.20140919104705)
-  arvados-cli (>= 0.1.20140919104705)
+  arvados-cli (>= 0.1.20141014201516)
   coffee-rails (~> 3.2.0)
   database_cleaner
   factory_girl_rails
index 901b7423e4965eedef8e20e8e1b5fd868b55c27a..e2bef88548fbc8ddab4848e3937d089ac3c62bd5 100644 (file)
@@ -96,7 +96,7 @@ class Arvados::V1::JobsController < ApplicationController
 
   def cancel
     reload_object_before_update
-    @object.update_attributes! cancelled_at: Time.now
+    @object.update_attributes! state: Job::Cancelled
     show
   end
 
index c9ac096fb5a7dff6ae79dfadb45ada2893c164d3..f2a04b3b29e79efdcd417d68a60e7ee40f921f06 100644 (file)
@@ -3,13 +3,6 @@ class Arvados::V1::NodesController < ApplicationController
   skip_before_filter :find_object_by_uuid, :only => :ping
   skip_before_filter :render_404_if_no_object, :only => :ping
 
-  def create
-    @object = Node.new
-    @object.save!
-    @object.start!(lambda { |h| ping_arvados_v1_node_url(h) })
-    show
-  end
-
   def update
     if resource_attrs[:job_uuid]
       @object.job_readable = readable_job_uuids(resource_attrs[:job_uuid]).any?
index 2e17747839cb9819d22ad6e49487bb921172cef4..13ccd7033560318c2db2687928a08a4e5de0d44e 100644 (file)
@@ -110,7 +110,8 @@ class ArvadosModel < ActiveRecord::Base
     unless (owner_uuid == current_user.uuid or
             current_user.is_admin or
             (current_user.groups_i_can(:manage) & [uuid, owner_uuid]).any?)
-      if current_user.groups_i_can(:write).index(uuid)
+      if ((current_user.groups_i_can(:write) + [current_user.uuid]) &
+          [uuid, owner_uuid]).any?
         return [owner_uuid, current_user.uuid]
       else
         return [owner_uuid]
@@ -526,7 +527,6 @@ class ArvadosModel < ActiveRecord::Base
     log = Log.new(event_type: event_type).fill_object(self)
     yield log
     log.save!
-    connection.execute "NOTIFY logs, '#{log.id}'"
     log_start_state
   end
 
index 2482256e1415821dd306d50cb579e878f4ce9fd2..864e60832f4e8312307fd628ba333ba3e64a2dec 100644 (file)
@@ -9,12 +9,12 @@ class Job < ArvadosModel
   before_create :ensure_unique_submit_id
   after_commit :trigger_crunch_dispatch_if_cancelled, :on => :update
   before_validation :set_priority
-  before_validation :update_timestamps_when_state_changes
   before_validation :update_state_from_old_state_attrs
   validate :ensure_script_version_is_commit
   validate :find_docker_image_locator
   validate :validate_status
   validate :validate_state_change
+  before_save :update_timestamps_when_state_changes
 
   has_many :commit_ancestors, :foreign_key => :descendant, :primary_key => :script_version
   has_many(:nodes, foreign_key: :job_uuid, primary_key: :uuid)
@@ -261,6 +261,7 @@ class Job < ArvadosModel
 
   def update_timestamps_when_state_changes
     return if not (state_changed? or new_record?)
+
     case state
     when Running
       self.started_at ||= Time.now
index 34e6dfa354ca93a742ca4913b89d40d7accab147..39f789e69f9ebf89cf9ca00887f047a8f74b9578 100644 (file)
@@ -5,6 +5,7 @@ class Log < ArvadosModel
   serialize :properties, Hash
   before_validation :set_default_event_at
   attr_accessor :object, :object_kind
+  after_save :send_notify
 
   api_accessible :user, extend: :common do |t|
     t.add :id
@@ -80,4 +81,8 @@ class Log < ArvadosModel
     # logs can have references to deleted objects
   end
 
+  def send_notify
+    connection.execute "NOTIFY logs, '#{self.id}'"
+  end
+
 end
index c791f8e25595109f124216a1ad9648f763a40704..db39ab658d9b1ad7c3c9a2263349ea3e0cfa6869 100644 (file)
@@ -92,11 +92,6 @@ class Node < ArvadosModel
     if o[:ec2_instance_id]
       if !self.info['ec2_instance_id']
         self.info['ec2_instance_id'] = o[:ec2_instance_id]
-        if (Rails.configuration.compute_node_ec2_tag_enable rescue true)
-          tag_cmd = ("ec2-create-tags #{o[:ec2_instance_id]} " +
-                     "--tag 'Name=#{self.uuid}'")
-          `#{tag_cmd}`
-        end
       elsif self.info['ec2_instance_id'] != o[:ec2_instance_id]
         logger.debug "Multiple nodes have credentials for #{self.uuid}"
         raise "#{self.uuid} is already running at #{self.info['ec2_instance_id']} so rejecting ping from #{o[:ec2_instance_id]}"
@@ -117,11 +112,6 @@ class Node < ArvadosModel
         raise "No available node slots" if try_slot == MAX_SLOTS
       end while true
       self.hostname = self.class.hostname_for_slot(self.slot_number)
-      if info['ec2_instance_id']
-        if (Rails.configuration.compute_node_ec2_tag_enable rescue true)
-          `ec2-create-tags #{self.info['ec2_instance_id']} --tag 'hostname=#{self.hostname}'`
-        end
-      end
     end
 
     # Record other basic stats
@@ -136,50 +126,6 @@ class Node < ArvadosModel
     save!
   end
 
-  def start!(ping_url_method)
-    ensure_permission_to_save
-    ping_url = ping_url_method.call({ id: self.uuid, ping_secret: self.info['ping_secret'] })
-    if (Rails.configuration.compute_node_ec2run_args and
-        Rails.configuration.compute_node_ami)
-      ec2_args = ["--user-data '#{ping_url}'",
-                  "-t c1.xlarge -n 1",
-                  Rails.configuration.compute_node_ec2run_args,
-                  Rails.configuration.compute_node_ami
-                 ]
-      ec2run_cmd = ["ec2-run-instances",
-                    "--client-token", self.uuid,
-                    ec2_args].flatten.join(' ')
-      ec2spot_cmd = ["ec2-request-spot-instances",
-                     "-p #{Rails.configuration.compute_node_spot_bid} --type one-time",
-                     ec2_args].flatten.join(' ')
-    else
-      ec2run_cmd = ''
-      ec2spot_cmd = ''
-    end
-    self.info['ec2_run_command'] = ec2run_cmd
-    self.info['ec2_spot_command'] = ec2spot_cmd
-    self.info['ec2_start_command'] = ec2spot_cmd
-    logger.info "#{self.uuid} ec2_start_command= #{ec2spot_cmd.inspect}"
-    result = `#{ec2spot_cmd} 2>&1`
-    self.info['ec2_start_result'] = result
-    logger.info "#{self.uuid} ec2_start_result= #{result.inspect}"
-    result.match(/INSTANCE\s*(i-[0-9a-f]+)/) do |m|
-      instance_id = m[1]
-      self.info['ec2_instance_id'] = instance_id
-      if (Rails.configuration.compute_node_ec2_tag_enable rescue true)
-        `ec2-create-tags #{instance_id} --tag 'Name=#{self.uuid}'`
-      end
-    end
-    result.match(/SPOTINSTANCEREQUEST\s*(sir-[0-9a-f]+)/) do |m|
-      sir_id = m[1]
-      self.info['ec2_sir_id'] = sir_id
-      if (Rails.configuration.compute_node_ec2_tag_enable rescue true)
-        `ec2-create-tags #{sir_id} --tag 'Name=#{self.uuid}'`
-      end
-    end
-    self.save!
-  end
-
   protected
 
   def ensure_ping_secret
index 6e7facd5d550ee45bd948a255e28e5b18ddec6cb..ecd50ccdc4f3b0c601631f96bc89eaa16eebf0a7 100644 (file)
@@ -29,6 +29,7 @@ class User < ArvadosModel
     t.add :is_admin
     t.add :is_invited
     t.add :prefs
+    t.add :writable_by
   end
 
   ALL_PERMISSIONS = {read: true, write: true, manage: true}
index b85df8cb4b21935ca080a447e89e6d3401ecd9e7..6c77f26c9a20e9a254bf9f27f83800f7c594ae62 100644 (file)
@@ -77,9 +77,6 @@ common:
   # crunch-job must be able to stat() it.
   crunch_refresh_trigger: /tmp/crunch_refresh_trigger
 
-  # Maximum number of log events that may be generated by a single job.
-  crunch_limit_log_events_per_job: 65536
-
   # These two settings control how frequently log events are flushed to the
   # database.  Log lines are buffered until either crunch_log_bytes_per_event
   # has been reached or crunch_log_seconds_between_events has elapsed since
@@ -105,15 +102,9 @@ common:
   # Path to /etc/dnsmasq.d, or false = do not update dnsmasq data.
   dnsmasq_conf_dir: false
 
-  # Set to AMI id (ami-123456) to auto-start nodes. See app/models/node.rb
-  compute_node_ami: false
-  compute_node_ec2run_args: -g arvados-compute
-  compute_node_spot_bid: 0.11
-
   compute_node_domain: false
   compute_node_nameservers:
     - 192.168.1.1
-  compute_node_ec2_tag_enable: false
 
   # The version below is suitable for AWS.
   # To use it, copy it to your application.yml, uncomment, and change <%# to <%=
index 4a6141ccf3c23d82c215ee5994090d8b3532e82b..ea1c2103853d31f04aaf3a94826a7ed8a0156930 100644 (file)
@@ -12,5 +12,8 @@ Server::Application.configure do
       :mount => "/websocket",
       :websocket_only => (ENV['ARVADOS_WEBSOCKETS'] == "ws-only")
     }
+    Rails.logger.info "Websockets #{ENV['ARVADOS_WEBSOCKETS']}, running at /websocket"
+  else
+    Rails.logger.info "Websockets disabled"
   end
 end
index 50400ee86bf1b6827cbc64dddc024bf8d47e5364..bccbeea4bb62ed951cdcc3f6a6ad2d2ac9098d7d 100644 (file)
@@ -57,29 +57,40 @@ class EventBus
           # Start with log rows readable by user, sorted in ascending order
           logs = Log.readable_by(ws.user).order("id asc")
 
+          cond_id = nil
+          cond_out = []
+          param_out = []
+
           if ws.last_log_id
             # Client is only interested in log rows that are newer than the
             # last log row seen by the client.
-            logs = logs.where("logs.id > ?", ws.last_log_id)
+            cond_id = "logs.id > ?"
+            param_out << ws.last_log_id
           elsif id
             # No last log id, so only look at the most recently changed row
-            logs = logs.where("logs.id = ?", id.to_i)
+            cond_id = "logs.id = ?"
+            param_out << id.to_i
           else
             return
           end
 
           # Now process filters provided by client
-          cond_out = []
-          param_out = []
           ws.filters.each do |filter|
             ft = record_filters filter.filters, Log
-            cond_out += ft[:cond_out]
-            param_out += ft[:param_out]
+            if ft[:cond_out].any?
+              # Join the clauses within a single subscription filter with AND
+              # so it is consistent with regular queries
+              cond_out << "(#{ft[:cond_out].join ') AND ('})"
+              param_out += ft[:param_out]
+            end
           end
 
           # Add filters to query
           if cond_out.any?
-            logs = logs.where('(' + cond_out.join(') OR (') + ')', *param_out)
+            # Join subscriptions with OR
+            logs = logs.where(cond_id + " AND ((#{cond_out.join ') OR ('}))", *param_out)
+          else
+            logs = logs.where(cond_id, *param_out)
           end
 
           # Finally execute query and actually send the matching log rows
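The joining rule above -- AND between the clauses of one subscription filter, OR between subscriptions, all guarded by the id condition -- in a standalone sketch (function and argument names are invented):

    def build_log_query(cond_id, id_param, subscriptions):
        # subscriptions: list of (clauses, params) pairs, one per
        # subscription filter, as record_filters would produce.
        cond_out, param_out = [], [id_param]
        for clauses, params in subscriptions:
            if clauses:
                # AND the clauses within a single filter.
                cond_out.append("(" + ") AND (".join(clauses) + ")")
                param_out.extend(params)
        if cond_out:
            # OR the filters against each other.
            return (cond_id + " AND ((" + ") OR (".join(cond_out) + "))",
                    param_out)
        return cond_id, param_out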
@@ -92,8 +103,8 @@ class EventBus
           ws.last_log_id = id.to_i
         end
       rescue Exception => e
-        puts "Error publishing event: #{$!}"
-        puts "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        Rails.logger.warn "Error publishing event: #{$!}"
+        Rails.logger.warn "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
         ws.send ({status: 500, message: 'error'}.to_json)
         ws.close
       end
@@ -118,7 +129,7 @@ class EventBus
           # Add a filter.  This gets the :filters field which is the same
           # format as used for regular index queries.
           ws.filters << Filter.new(p)
-          ws.send ({status: 200, message: 'subscribe ok'}.to_json)
+          ws.send ({status: 200, message: 'subscribe ok', filter: p}.to_json)
 
           # Send any pending events
           push_events ws
@@ -143,8 +154,8 @@ class EventBus
     rescue Oj::Error => e
       ws.send ({status: 400, message: "malformed request"}.to_json)
     rescue Exception => e
-      puts "Error handling message: #{$!}"
-      puts "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+      Rails.logger.warn "Error handling message: #{$!}"
+      Rails.logger.warn "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
       ws.send ({status: 500, message: 'error'}.to_json)
       ws.close
     end
index bb20aef6601683b9ff78e98f636f5d552ff08584..d3147225d63d4c805320302c8a62eae1f2a2c2fc 100755 (executable)
@@ -342,7 +342,8 @@ class Dispatcher
         stderr_flushed_at: Time.new(0),
         bytes_logged: 0,
         events_logged: 0,
-        log_throttle_timestamp: Time.new(0),
+        log_throttle_is_open: true,
+        log_throttle_reset_time: Time.now + Rails.configuration.crunch_log_throttle_period,
         log_throttle_bytes_so_far: 0,
         log_throttle_lines_so_far: 0,
         log_throttle_bytes_skipped: 0,
@@ -356,46 +357,46 @@ class Dispatcher
   # the log line should go to output or not.  Modifies "line" in place to
   # replace it with an error if a logging limit is tripped.
   def rate_limit running_job, line
-    if running_job[:bytes_logged] > Rails.configuration.crunch_limit_log_bytes_per_job
-      # Don't log anything if the hard cap has already been exceeded
-      return false
-    end
-
-    now = Time.now
-    throttle_period = Rails.configuration.crunch_log_throttle_period
-
-    if running_job[:log_throttle_bytes_skipped] > 0
-      # We've skipped some log in the current time period already, so continue to
-      # skip the log
-      running_job[:log_throttle_bytes_skipped] += line.size
-      return false
-    end
-
-    # Count lines and bytes logged in this period, and total bytes logged for the job
-    running_job[:log_throttle_lines_so_far] += 1
-    running_job[:log_throttle_bytes_so_far] += line.size
-    running_job[:bytes_logged] += line.size
-
-    if running_job[:log_throttle_bytes_so_far] > Rails.configuration.crunch_log_throttle_bytes or
-        running_job[:log_throttle_lines_so_far] > Rails.configuration.crunch_log_throttle_lines
-      # We've exceeded the per-period throttle, so start skipping
-      running_job[:log_throttle_bytes_skipped] += line.size
-
-      # Replace log line with a message about skipping the log
-      remaining_time = throttle_period - (now - running_job[:log_throttle_timestamp])
-      if running_job[:log_throttle_bytes_so_far] > Rails.configuration.crunch_log_throttle_bytes
-        line.replace "Exceeded rate #{Rails.configuration.crunch_log_throttle_bytes} bytes per #{throttle_period} seconds (crunch_log_throttle_bytes), logging will be silenced for the next #{remaining_time.round} seconds\n"
-      else
-        line.replace "Exceeded rate #{Rails.configuration.crunch_log_throttle_lines} lines per #{throttle_period} seconds (crunch_log_throttle_lines), logging will be silenced for the next #{remaining_time.round} seconds\n"
+    message = false
+    linesize = line.size
+    if running_job[:log_throttle_is_open]
+      running_job[:log_throttle_lines_so_far] += 1
+      running_job[:log_throttle_bytes_so_far] += linesize
+      running_job[:bytes_logged] += linesize
+
+      if (running_job[:bytes_logged] >
+          Rails.configuration.crunch_limit_log_bytes_per_job)
+        message = "Exceeded log limit #{Rails.configuration.crunch_limit_log_bytes_per_job} bytes (crunch_limit_log_bytes_per_job). Log will be truncated."
+        running_job[:log_throttle_reset_time] = Time.now + 100.years
+        running_job[:log_throttle_is_open] = false
+
+      elsif (running_job[:log_throttle_bytes_so_far] >
+             Rails.configuration.crunch_log_throttle_bytes)
+        remaining_time = running_job[:log_throttle_reset_time] - Time.now
+        message = "Exceeded rate #{Rails.configuration.crunch_log_throttle_bytes} bytes per #{Rails.configuration.crunch_log_throttle_period} seconds (crunch_log_throttle_bytes). Logging will be silenced for the next #{remaining_time.round} seconds.\n"
+        running_job[:log_throttle_is_open] = false
+
+      elsif (running_job[:log_throttle_lines_so_far] >
+             Rails.configuration.crunch_log_throttle_lines)
+        remaining_time = running_job[:log_throttle_reset_time] - Time.now
+        message = "Exceeded rate #{Rails.configuration.crunch_log_throttle_lines} lines per #{Rails.configuration.crunch_log_throttle_period} seconds (crunch_log_throttle_lines), logging will be silenced for the next #{remaining_time.round} seconds.\n"
+        running_job[:log_throttle_is_open] = false
       end
     end
 
-    if running_job[:bytes_logged] > Rails.configuration.crunch_limit_log_bytes_per_job
-      # Replace log line with a message about truncating the log
-      line.replace "Exceeded log limit #{Rails.configuration.crunch_limit_log_bytes_per_job} bytes (crunch_limit_log_bytes_per_job).  Log will be truncated."
+    if not running_job[:log_throttle_is_open]
+      # Don't log anything if any limit has been exceeded. Just count lossage.
+      running_job[:log_throttle_bytes_skipped] += linesize
     end
 
-    true
+    if message
+      # Yes, write to logs, but use our "rate exceeded" message
+      # instead of the log message that exceeded the limit.
+      line.replace message
+      true
+    else
+      running_job[:log_throttle_is_open]
+    end
   end
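The rewritten rate_limit boils down to a windowed open/closed gate; a compact Python rendering (the limits are placeholders, not the Rails configuration values, and the whole-job byte cap is omitted):

    import time

    class LogThrottle(object):
        def __init__(self, period=60.0, max_lines=1024, max_bytes=65536):
            self.period = period
            self.max_lines = max_lines
            self.max_bytes = max_bytes
            self.reset_time = time.time() + period
            self.is_open = True
            self.lines = self.bytes = self.skipped = 0

        def admit(self, line):
            now = time.time()
            if now > self.reset_time:
                # Start a new period and reopen the gate.
                self.reset_time = now + self.period
                self.lines = self.bytes = self.skipped = 0
                self.is_open = True
            if self.is_open:
                self.lines += 1
                self.bytes += len(line)
                if self.lines > self.max_lines or self.bytes > self.max_bytes:
                    self.is_open = False   # silenced until the next reset
            if not self.is_open:
                self.skipped += len(line)
            return self.is_open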
 
   def read_pipes
@@ -403,59 +404,94 @@ class Dispatcher
       job = j[:job]
 
       now = Time.now
-      if (now - j[:log_throttle_timestamp]) > Rails.configuration.crunch_log_throttle_period
-        # It has been more than throttle_period seconds since the last checkpoint so reset the
-        # throttle
+      if now > j[:log_throttle_reset_time]
+        # It has been more than throttle_period seconds since the last
+        # checkpoint so reset the throttle
         if j[:log_throttle_bytes_skipped] > 0
-          j[:stderr_buf_to_flush] << "Skipped #{j[:log_throttle_bytes_skipped]} bytes of log"
+          message = "#{job_uuid} ! Skipped #{j[:log_throttle_bytes_skipped]} bytes of log"
+          $stderr.puts message
+          j[:stderr_buf_to_flush] << "#{Time.now.ctime.to_s} #{message}\n"
         end
 
-        j[:log_throttle_timestamp] = now
+        j[:log_throttle_reset_time] = now + Rails.configuration.crunch_log_throttle_period
         j[:log_throttle_bytes_so_far] = 0
         j[:log_throttle_lines_so_far] = 0
         j[:log_throttle_bytes_skipped] = 0
+        j[:log_throttle_is_open] = true
       end
 
       j[:buf].each do |stream, streambuf|
         # Read some data from the child stream
-        buf = false
+        buf = ''
         begin
-          buf = j[stream].read_nonblock(2**16)
+          # It's important to use a big enough buffer here. When we're
+          # being flooded with logs, we must read and discard many
+          # bytes at once. Otherwise, we can easily peg a CPU with
+          # time-checking and other loop overhead. (Quick tests show a
+          # 1MiB buffer working 2.5x as fast as a 64 KiB buffer.)
+          #
+          # So don't reduce this buffer size!
+          buf = j[stream].read_nonblock(2**20)
         rescue Errno::EAGAIN, EOFError
         end
 
-        if buf
-          # Add to the stream buffer
-          streambuf << buf
-
-          # Check for at least one complete line
-          if streambuf.index "\n"
-            lines = streambuf.lines("\n").to_a
-
-            # check if the last line is partial or not
-            streambuf.replace(if streambuf[-1] == "\n"
-                                ''        # ends on a newline
-                              else
-                                lines.pop # Put the partial line back into the buffer
-                              end)
-
-            # Now spool the lines to the log output buffer
-            lines.each do |line|
-              # rate_limit returns true or false as to whether to actually log
-              # the line or not.  It also modifies "line" in place to replace
-              # it with an error if a logging limit is tripped.
-              if rate_limit j, line
-                $stderr.print "#{job_uuid} ! " unless line.index(job_uuid)
-                $stderr.puts line
-                pub_msg = "#{Time.now.ctime.to_s} #{line.strip} \n"
-                j[:stderr_buf_to_flush] << pub_msg
-              end
-              # Send log output to the logs table
-              write_log j
+        # Short circuit the counting code if we're just going to throw
+        # away the data anyway.
+        if not j[:log_throttle_is_open]
+          j[:log_throttle_bytes_skipped] += streambuf.size + buf.size
+          streambuf.replace ''
+          next
+        elsif buf == ''
+          next
+        end
+
+        # Append to incomplete line from previous read, if any
+        streambuf << buf
+
+        bufend = ''
+        streambuf.each_line do |line|
+          if not line.end_with? $/
+            if line.size > Rails.configuration.crunch_log_throttle_bytes
+              # Without a limit here, we'll use 2x an arbitrary amount
+              # of memory, and waste a lot of time copying strings
+              # around, all without providing any feedback to anyone
+              # about what's going on _or_ hitting any of our throttle
+              # limits.
+              #
+              # Here we leave "line" alone, knowing it will never be
+              # sent anywhere: rate_limit() will reach
+              # crunch_log_throttle_bytes immediately. However, we'll
+              # leave [...] in bufend: if the trailing end of the long
+              # line does end up getting sent anywhere, it will have
+              # some indication that it is incomplete.
+              bufend = "[...]"
+            else
+              # If line length is sane, we'll wait for the rest of the
+              # line to appear in the next read_pipes() call.
+              bufend = line
+              break
             end
           end
+          # rate_limit returns true or false as to whether to actually log
+          # the line or not.  It also modifies "line" in place to replace
+          # it with an error if a logging limit is tripped.
+          if rate_limit j, line
+            $stderr.print "#{job_uuid} ! " unless line.index(job_uuid)
+            $stderr.puts line
+            pub_msg = "#{Time.now.ctime.to_s} #{line.strip}\n"
+            j[:stderr_buf_to_flush] << pub_msg
+          end
         end
+
+        # Leave the trailing incomplete line (if any) in streambuf for
+        # next time.
+        streambuf.replace bufend
       end
+      # Flush buffered logs to the logs table, if appropriate. We have
+      # to do this even if we didn't collect any new logs this time:
+      # otherwise, buffered data older than seconds_between_events
+      # won't get flushed until new data arrives.
+      write_log j
     end
   end
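The read loop's carry-over of an incomplete trailing line, reduced to a sketch (the "[...]" convention follows the Ruby above; names are illustrative):

    def split_complete_lines(streambuf, buf, max_line_bytes):
        # Return (complete_lines, carry).  An over-long partial line is
        # replaced by "[...]" so any later flush shows it was cut.
        streambuf += buf
        lines = streambuf.split("\n")
        carry = lines.pop()          # "" when buf ended on a newline
        if len(carry) > max_line_bytes:
            carry = "[...]"
        return lines, carry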
 
@@ -498,10 +534,12 @@ class Dispatcher
     $stderr.puts "dispatch: child #{pid_done} exit"
     $stderr.puts "dispatch: job #{job_done.uuid} end"
 
-    # Ensure every last drop of stdout and stderr is consumed
+    # Ensure every last drop of stdout and stderr is consumed.
     read_pipes
-    j_done[:stderr_flushed_at] = Time.new(0) # reset flush timestamp to make sure log gets written
-    write_log j_done # write any remaining logs
+    # Reset flush timestamp to make sure log gets written.
+    j_done[:stderr_flushed_at] = Time.new(0)
+    # Write any remaining logs.
+    write_log j_done
 
     j_done[:buf].each do |stream, streambuf|
       if streambuf != ''
@@ -611,7 +649,6 @@ class Dispatcher
   # send message to log table. we want these records to be transient
   def write_log running_job
     return if running_job[:stderr_buf_to_flush] == ''
-    return if running_job[:events_logged] > Rails.configuration.crunch_limit_log_events_per_job
 
     # Send out to log event if buffer size exceeds the bytes per event or if
     # it has been at least crunch_log_seconds_between_events seconds since
@@ -619,11 +656,6 @@ class Dispatcher
     if running_job[:stderr_buf_to_flush].size > Rails.configuration.crunch_log_bytes_per_event or
         (Time.now - running_job[:stderr_flushed_at]) >= Rails.configuration.crunch_log_seconds_between_events
       begin
-        # Just reached crunch_limit_log_events_per_job so replace log with notification.
-        if running_job[:events_logged] == Rails.configuration.crunch_limit_log_events_per_job
-          running_job[:stderr_buf_to_flush] =
-            "Exceeded live log limit #{Rails.configuration.crunch_limit_log_events_per_job} events (crunch_limit_log_events_per_job).  Live log will be truncated."
-        end
         log = Log.new(object_uuid: running_job[:job].uuid,
                       event_type: 'stderr',
                       owner_uuid: running_job[:job].owner_uuid,
index 32560c3397ffa9751155aff3da36f8988471ddea..3b5df3795c097bdd33b026a2599eb32f10cacb34 100644 (file)
@@ -49,6 +49,12 @@ project_viewer:
   api_token: projectviewertoken1234567890abcdefghijklmnopqrstuv
   expires_at: 2038-01-01 00:00:00
 
+subproject_admin:
+  api_client: untrusted
+  user: subproject_admin
+  api_token: subprojectadmintoken1234567890abcdefghijklmnopqrst
+  expires_at: 2038-01-01 00:00:00
+
 admin_vm:
   api_client: untrusted
   user: admin
@@ -180,3 +186,9 @@ user_foo_in_sharing_group:
   user: user_foo_in_sharing_group
   api_token: 2p1pou8p4ls208mcbedeewlotghppenobcyrmyhq8pyf51xd8u
   expires_at: 2038-01-01 00:00:00
+
+user1_with_load:
+  api_client: untrusted
+  user: user1_with_load
+  api_token: 1234k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi
+  expires_at: 2038-01-01 00:00:00
index 2fb235c7080dd674e59f5f8cf52d5f6df2a7c22d..045e1c71903aa121478001a7b4f79cd53ab6b221 100644 (file)
@@ -257,3 +257,52 @@ collection_owned_by_foo:
   owner_uuid: zzzzz-tpzed-81hsbo6mk8nl05c
   created_at: 2014-02-03T17:22:54Z
   name: collection_owned_by_foo
+
+collection_to_remove_from_subproject:
+  # The Workbench tests remove this from subproject.
+  uuid: zzzzz-4zz18-subprojgonecoll
+  portable_data_hash: 2386ca6e3fffd4be5e197a72c6c80fb2+51
+  manifest_text: ". 8258b505536a9ab47baa2f4281cb932a+9 0:9:missingno\n"
+  owner_uuid: zzzzz-j7d0g-axqo7eu9pwvna1x
+  created_at: 2014-10-15T10:45:00
+  name: Collection to remove from subproject
+
+collection_with_files_in_subdir:
+  uuid: zzzzz-4zz18-filesinsubdir00
+  name: collection_files_in_subdir
+  portable_data_hash: 85877ca2d7e05498dd3d109baf2df106+95
+  owner_uuid: zzzzz-tpzed-user1withloadab
+  created_at: 2014-02-03T17:22:54Z
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+  modified_at: 2014-02-03T17:22:54Z
+  updated_at: 2014-02-03T17:22:54Z
+  manifest_text: ". 85877ca2d7e05498dd3d109baf2df106+95+A3a4e26a366ee7e4ed3e476ccf05354761be2e4ae@545a9920 0:95:file_in_subdir1\n./subdir2/subdir3 2bbc341c702df4d8f42ec31f16c10120+64+A315d7e7bad2ce937e711fc454fae2d1194d14d64@545a9920 0:32:file1_in_subdir3.txt 32:32:file2_in_subdir3.txt\n./subdir2/subdir3/subdir4 2bbc341c702df4d8f42ec31f16c10120+64+A315d7e7bad2ce937e711fc454fae2d1194d14d64@545a9920 0:32:file1_in_subdir4.txt 32:32:file2_in_subdir4.txt"
+
+# Test Helper trims the rest of the file
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
+
+# collections in project_with_10_collections
+<% for i in 1..10 do %>
+collection_<%=i%>_of_10:
+  name: Collection_<%= i %>
+  portable_data_hash: ea10d51bcf88862dbcc36eb292017dfd+45
+  manifest_text: ". 73feffa4b7f6bb68e44cf984c85f6e88+3 0:3:baz\n"
+  uuid: zzzzz-4zz18-10gneyn6brkx<%= i.to_s.rjust(3, '0') %>
+  owner_uuid: zzzzz-j7d0g-0010collections
+  created_at: <%= i.minute.ago.to_s(:db) %>
+<% end %>
+
+# collections in project_with_201_collections
+<% for i in 1..201 do %>
+collection_<%=i%>_of_201:
+  name: Collection_<%= i %>
+  portable_data_hash: ea10d51bcf88862dbcc36eb292017dfd+45
+  manifest_text: ". 73feffa4b7f6bb68e44cf984c85f6e88+3 0:3:baz\n"
+  uuid: zzzzz-4zz18-201gneyn6brd<%= i.to_s.rjust(3, '0') %>
+  owner_uuid: zzzzz-j7d0g-0201collections
+  created_at: <%= i.minute.ago.to_s(:db) %>
+<% end %>
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
index 8f209e9b9c691b56da02c187aba8302932c0fe8e..764261daea7b898b857e39c09067c9ec09d97dfa 100644 (file)
@@ -130,10 +130,69 @@ active_user_has_can_manage:
   name: Active user has can_manage
 
 # Group for testing granting permission between users who share a group.
-#
 group_for_sharing_tests:
   uuid: zzzzz-j7d0g-t4ucgncwteul7zt
   owner_uuid: zzzzz-tpzed-000000000000000
   name: Group for sharing tests
   description: Users who can share objects with each other
   group_class: role
+
+project_with_10_collections:
+  uuid: zzzzz-j7d0g-0010collections
+  owner_uuid: zzzzz-tpzed-user1withloadab
+  created_at: 2014-04-21 15:37:48 -0400
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+  modified_at: 2014-04-21 15:37:48 -0400
+  updated_at: 2014-04-21 15:37:48 -0400
+  name: project with 10 collections
+  description: This will result in one page in the display
+  group_class: project
+
+project_with_201_collections:
+  uuid: zzzzz-j7d0g-0201collections
+  owner_uuid: zzzzz-tpzed-user1withloadab
+  created_at: 2014-04-21 15:37:48 -0400
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+  modified_at: 2014-04-21 15:37:48 -0400
+  updated_at: 2014-04-21 15:37:48 -0400
+  name: project with 201 collections
+  description: This will result in two pages in the display
+  group_class: project
+
+project_with_10_pipelines:
+  uuid: zzzzz-j7d0g-000010pipelines
+  owner_uuid: zzzzz-tpzed-user1withloadab
+  created_at: 2014-04-21 15:37:48 -0400
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+  modified_at: 2014-04-21 15:37:48 -0400
+  updated_at: 2014-04-21 15:37:48 -0400
+  name: project with 10 pipelines
+  description: project with 10 pipelines
+  group_class: project
+
+project_with_2_pipelines_and_200_jobs:
+  uuid: zzzzz-j7d0g-nnjobspipelines
+  owner_uuid: zzzzz-tpzed-user1withloadab
+  created_at: 2014-04-21 15:37:48 -0400
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+  modified_at: 2014-04-21 15:37:48 -0400
+  updated_at: 2014-04-21 15:37:48 -0400
+  name: project with 2 pipelines and 200 jobs
+  description: This will result in two pages in the display
+  group_class: project
+
+project_with_25_pipelines:
+  uuid: zzzzz-j7d0g-000025pipelines
+  owner_uuid: zzzzz-tpzed-user1withloadab
+  created_at: 2014-04-21 15:37:48 -0400
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+  modified_at: 2014-04-21 15:37:48 -0400
+  updated_at: 2014-04-21 15:37:48 -0400
+  name: project with 25 pipelines
+  description: project with 25 pipelines
+  group_class: project
index 2be5ef9066e9db7c125a6eccc219ad1c3d706785..1381078ee1b1c3f5db3288c663e624a56429c287 100644 (file)
@@ -260,3 +260,53 @@ job_with_real_log:
   log: 0b9a7787660e1fce4a93f33e01376ba6+81
   script_version: 7def43a4d3f20789dda4700f703b5514cc3ed250
   state: Complete
+
+cancelled:
+  uuid: zzzzz-8i9sb-4cf0abc123e809j
+  owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  cancelled_at: <%= 1.minute.ago.to_s(:db) %>
+  cancelled_by_user_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  cancelled_by_client_uuid: zzzzz-ozdt8-obw7foaks3qjyej
+  created_at: <%= 4.minute.ago.to_s(:db) %>
+  started_at: <%= 3.minute.ago.to_s(:db) %>
+  finished_at: ~
+  script_version: 1de84a854e2b440dc53bf42f8548afa4c17da332
+  running: false
+  success: ~
+  output: ~
+  priority: 0
+  log: ~
+  is_locked_by_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  tasks_summary:
+    failed: 0
+    todo: 3
+    running: 1
+    done: 1
+  runtime_constraints: {}
+  state: Cancelled
+
+job_in_subproject:
+  uuid: zzzzz-8i9sb-subprojectjob01
+  created_at: 2014-10-15 12:00:00
+  owner_uuid: zzzzz-j7d0g-axqo7eu9pwvna1x
+  log: ~
+  repository: foo
+  script: hash
+  script_version: 4fe459abe02d9b365932b8f5dc419439ab4e2577
+  state: Complete
+
+# Test Helper trims the rest of the file
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
+
+# jobs in project_with_2_pipelines_and_200_jobs
+<% for i in 1..200 do %>
+job_<%=i%>_of_200:
+  uuid: zzzzz-8i9sb-0vsrcqi7whch<%= i.to_s.rjust(3, '0') %>
+  created_at: <%= i.minute.ago.to_s(:db) %>
+  owner_uuid: zzzzz-j7d0g-nnjobspipelines
+  script_version: 7def43a4d3f20789dda4700f703b5514cc3ed250
+  state: Complete
+<% end %>
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
index 9e139714911f066a217468abebdbb841a03b95d4..899e9f0b3b615bb0df5a8c73f56c67db1f650b54 100644 (file)
@@ -377,6 +377,20 @@ project_viewer_can_read_project:
   head_uuid: zzzzz-j7d0g-v955i6s2oi1cbso
   properties: {}
 
+subproject_admin_can_manage_subproject:
+  uuid: zzzzz-o0j2j-subprojadminlnk
+  owner_uuid: zzzzz-tpzed-000000000000000
+  created_at: 2014-10-15 10:00:00 -0000
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  modified_at: 2014-10-15 10:00:00 -0000
+  updated_at: 2014-10-15 10:00:00 -0000
+  tail_uuid: zzzzz-tpzed-subprojectadmin
+  link_class: permission
+  name: can_manage
+  head_uuid: zzzzz-j7d0g-axqo7eu9pwvna1x
+  properties: {}
+
 foo_collection_tag:
   uuid: zzzzz-o0j2j-eedahfaho8aphiv
   owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
@@ -735,3 +749,16 @@ user_bar_is_in_sharing_group:
   name: can_read
   head_uuid: zzzzz-tpzed-n3oaj4sm5fcnwib
 
+user1-with-load_member_of_all_users_group:
+  uuid: zzzzz-o0j2j-user1-with-load
+  owner_uuid: zzzzz-tpzed-000000000000000
+  created_at: 2014-01-24 20:42:26 -0800
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-d9tiejq69daie8f
+  modified_at: 2014-01-24 20:42:26 -0800
+  updated_at: 2014-01-24 20:42:26 -0800
+  tail_uuid: zzzzz-tpzed-user1withloadab
+  link_class: permission
+  name: can_read
+  head_uuid: zzzzz-j7d0g-fffffffffffffff
+  properties: {}
index a3d372bbca1a6add5c5504911ff8e1119298f3d2..53305ade85aedcee93635dca04f090bed7e2b68c 100644 (file)
@@ -4,6 +4,12 @@ new_pipeline:
   owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
   created_at: <%= 1.minute.ago.to_s(:db) %>
 
+new_pipeline_in_subproject:
+  state: New
+  uuid: zzzzz-d1hrv-subprojpipeline
+  owner_uuid: zzzzz-j7d0g-axqo7eu9pwvna1x
+  created_at: <%= 1.minute.ago.to_s(:db) %>
+
 has_component_with_no_script_parameters:
   state: Ready
   uuid: zzzzz-d1hrv-1xfj6xkicf2muk2
@@ -143,3 +149,70 @@ pipeline_with_newer_template:
           required: true
           dataclass: Collection
           title: foo instance input
+
+# Test Helper trims the rest of the file
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
+
+# pipelines in project_with_10_pipelines
+<% for i in 0..9 do %>
+pipeline_<%=i%>_of_10:
+  name: pipeline_<%= i %>
+  state: Failed
+  uuid: zzzzz-d1hrv-10pipelines0<%= i.to_s.rjust(3, '0') %>
+  owner_uuid: zzzzz-j7d0g-000010pipelines
+  created_at: <%= (2*i).hour.ago.to_s(:db) %>
+  started_at: <%= (2*i).hour.ago.to_s(:db) %>
+  finished_at: <%= i.minute.ago.to_s(:db) %>
+  components:
+    foo:
+      script: foo
+      script_version: master
+      script_parameters:
+        input:
+          required: true
+          dataclass: Collection
+          title: foo instance input
+<% end %>
+
+# pipelines in project_with_2_pipelines_and_200_jobs
+<% for i in 0..1 do %>
+pipeline_<%=i%>_of_2_pipelines_and_200_jobs:
+  name: pipeline_<%= i %>
+  state: New
+  uuid: zzzzz-d1hrv-abcgneyn6brx<%= i.to_s.rjust(3, '0') %>
+  owner_uuid: zzzzz-j7d0g-nnjobspipelines
+  created_at: <%= i.minute.ago.to_s(:db) %>
+  components:
+    foo:
+      script: foo
+      script_version: master
+      script_parameters:
+        input:
+          required: true
+          dataclass: Collection
+          title: foo instance input
+<% end %>
+
+# pipelines in project_with_25_pipelines
+<% for i in 0..24 do %>
+pipeline_<%=i%>_of_25:
+  name: pipeline_<%=i%>
+  state: Failed
+  uuid: zzzzz-d1hrv-25pipelines0<%= i.to_s.rjust(3, '0') %>
+  owner_uuid: zzzzz-j7d0g-000025pipelines
+  created_at: <%= i.hour.ago.to_s(:db) %>
+  started_at: <%= i.hour.ago.to_s(:db) %>
+  finished_at: <%= i.minute.ago.to_s(:db) %>
+  components:
+    foo:
+      script: foo
+      script_version: master
+      script_parameters:
+        input:
+          required: true
+          dataclass: Collection
+          title: foo instance input
+<% end %>
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
index 2dd72ab4b18ed0a7f64bd1c3cf8db19ca48faacc..17ae82e901246bab0471efa26cc6fa454ff27cdd 100644 (file)
@@ -85,6 +85,20 @@ future_project_user:
       organization: example.com
       role: Computational biologist
 
+subproject_admin:
+  owner_uuid: zzzzz-tpzed-000000000000000
+  uuid: zzzzz-tpzed-subprojectadmin
+  email: subproject-admin@arvados.local
+  first_name: Subproject
+  last_name: Admin
+  identity_url: https://subproject-admin.openid.local
+  is_active: true
+  is_admin: false
+  prefs:
+    profile:
+      organization: example.com
+      role: Computational biologist
+
 spectator:
   owner_uuid: zzzzz-tpzed-000000000000000
   uuid: zzzzz-tpzed-l1s2piq4t4mps8r
@@ -200,3 +214,17 @@ user_bar_in_sharing_group:
   identity_url: https://user_bar_in_sharing_group.openid.local
   is_active: true
   is_admin: false
+
+user1_with_load:
+  owner_uuid: zzzzz-tpzed-000000000000000
+  uuid: zzzzz-tpzed-user1withloadab
+  email: user1_with_load@arvados.local
+  first_name: user1_with_load
+  last_name: User
+  identity_url: https://user1_with_load.openid.local
+  is_active: true
+  is_admin: false
+  prefs:
+    profile:
+      organization: example.com
+      role: IT
index e648523323a0cd0bf1f8f0bdfcddb2e33ee2067a..07e7f840a1bafedcd456d2d674e1ad243b59ec0a 100644 (file)
@@ -101,7 +101,38 @@ class Arvados::V1::JobsControllerTest < ActionController::TestCase
                  'trigger file should be created when job is cancelled')
   end
 
-  test "cancelling a cancelled jobs stays cancelled" do
+  [
+   [:put, :update, {job:{cancelled_at: Time.now}}, :success],
+   [:put, :update, {job:{cancelled_at: nil}}, :unprocessable_entity],
+   [:put, :update, {job:{state: 'Cancelled'}}, :success],
+   [:put, :update, {job:{state: 'Queued'}}, :unprocessable_entity],
+   [:put, :update, {job:{state: 'Running'}}, :unprocessable_entity],
+   [:put, :update, {job:{state: 'Failed'}}, :unprocessable_entity],
+   [:put, :update, {job:{state: 'Complete'}}, :unprocessable_entity],
+   [:post, :cancel, {}, :success],
+  ].each do |http_method, action, params, expected_response|
+    test "cancelled job stays cancelled after #{[http_method, action, params].inspect}" do
+      # We need to verify that "cancel" creates a trigger file, so first
+      # let's make sure there is no stale trigger file.
+      begin
+        File.unlink(Rails.configuration.crunch_refresh_trigger)
+      rescue Errno::ENOENT
+      end
+
+      authorize_with :active
+      self.send http_method, action, { id: jobs(:cancelled).uuid }.merge(params)
+      assert_response expected_response
+      if expected_response == :success
+        job = json_response
+        assert_not_nil job['cancelled_at'], "job cancelled again using #{params.inspect} did not have cancelled_at value"
+        assert_equal 'Cancelled', job['state'], "state of job cancelled again using #{params.inspect} should stay Cancelled"
+      end
+      # Verify database record still says Cancelled
+      assert_equal 'Cancelled', Job.find(jobs(:cancelled).id).state, 'job was un-cancelled'
+    end
+  end
+
+  test "cancelled job updated to any other state change results in error" do
     # We need to verify that "cancel" creates a trigger file, so first
     # let's make sure there is no stale trigger file.
     begin
index dd942b691ed177bf09abb5993ac19aa8a92bde56..d74450587dc222524a73c7a604a59643bdd6e826 100644 (file)
@@ -68,7 +68,7 @@ class Arvados::V1::NodesControllerTest < ActionController::TestCase
 
   test "create node" do
     authorize_with :admin
-    post :create
+    post :create, {node: {}}
     assert_response :success
     assert_not_nil json_response['uuid']
     assert_not_nil json_response['info'].is_a? Hash
index 9c4d18b24152012f8161ec8ab0c0d1521c611413..2d26370b749f5b07dc866855563c62cd31f9c03a 100644 (file)
@@ -779,6 +779,16 @@ class Arvados::V1::UsersControllerTest < ActionController::TestCase
     assert_equal false, found_email, 'Expected no email after updating profile'
   end
 
+  test "user API response includes writable_by" do
+    authorize_with :active
+    get :current
+    assert_response :success
+    assert_includes(json_response["writable_by"], users(:active).uuid,
+                    "user's writable_by should include self")
+    assert_includes(json_response["writable_by"], users(:active).owner_uuid,
+                    "user's writable_by should include its owner_uuid")
+  end
+
 
   NON_ADMIN_USER_DATA = ["uuid", "kind", "is_active", "email", "first_name",
                          "last_name"].sort
index 925d879906030be0b106437d39b0f1f8561cdaa5..fbc18c50c77171c64fae4c647517d3eba56f699e 100644 (file)
@@ -240,6 +240,42 @@ class WebsocketTest < ActionDispatch::IntegrationTest
     assert_equal human.uuid, human_ev_uuid
   end
 
+
+  test "connect, subscribe, compound filter" do
+    state = 1
+    t1 = nil
+
+    authorize_with :admin
+
+    ws_helper :admin do |ws|
+      ws.on :open do |event|
+        ws.send ({method: 'subscribe', filters: [['object_uuid', 'is_a', 'arvados#trait'], ['event_type', '=', 'update']]}.to_json)
+      end
+
+      ws.on :message do |event|
+        d = Oj.load event.data
+        case state
+        when 1
+          assert_equal 200, d["status"]
+          t1 = Trait.create("name" => "foo")
+          t1.name = "bar"
+          t1.save!
+          state = 2
+        when 2
+          assert_equal 'update', d['event_type']
+          state = 3
+          ws.close
+        when 3
+          assert false, "Should not get any more events"
+        end
+      end
+
+    end
+
+    assert_equal 3, state
+    assert_not_nil t1
+  end
+
   test "connect, subscribe, ask events starting at seq num" do
     state = 1
     human = nil
index 3413f6486a43a3c412cfd18d442f8ac92e480521..d59b44926fa88a857ce10b34eaca1b4a5d3cc769 100644 (file)
@@ -88,7 +88,7 @@ class ActionController::TestCase
   def check_counter action
     @counter += 1
     if @counter == 2
-     # assert_equal 1, 2, "Multiple actions in functional test"
+      assert_equal 1, 2, "Multiple actions in functional test"
     end
   end
 
index 7adfaaa67879d78288b24306410d14c0f02f5670..0cfdfa88ee168186b11958d92742098cf7dba6c6 100644 (file)
@@ -2,6 +2,8 @@ package main
 
 import (
        "bufio"
+       "bytes"
+       "errors"
        "flag"
        "fmt"
        "io"
@@ -10,217 +12,338 @@ import (
        "os"
        "os/exec"
        "os/signal"
+       "strconv"
        "strings"
        "syscall"
        "time"
 )
 
-func ReadLineByLine(inp io.ReadCloser, out chan string, finish chan bool) {
-       s := bufio.NewScanner(inp)
+/*
+#include <unistd.h>
+#include <sys/types.h>
+#include <pwd.h>
+#include <stdlib.h>
+*/
+import "C"
+
+// The above cgo block lets us call C.sysconf(C._SC_CLK_TCK) to look up
+// user_hz, the kernel's clock-tick rate used in cpuacct.stat.
+
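+// Cgroup identifies a container's cgroup by its root mount point,
+// parent group, and container ID.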
+type Cgroup struct {
+       root   string
+       parent string
+       cid    string
+}
+
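+// CopyPipeToChan relays in to out one line at a time, then sends true
+// on done when in is exhausted.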
+func CopyPipeToChan(in io.Reader, out chan string, done chan<- bool) {
+       s := bufio.NewScanner(in)
        for s.Scan() {
                out <- s.Text()
        }
-       finish <- true
+       done <- true
 }
 
-func OutputChannel(stdout chan string, stderr chan string) {
-       for {
-               select {
-               case s, ok := <-stdout:
-                       if ok {
-                               fmt.Fprintln(os.Stdout, s)
-                       } else {
-                               return
-                       }
-               case s, ok := <-stderr:
-                       if ok {
-                               fmt.Fprintln(os.Stderr, s)
-                       } else {
-                               return
-                       }
-               }
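+// CopyChanToPipe writes each string received on in to out as one line,
+// returning when in is closed.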
+func CopyChanToPipe(in <-chan string, out io.Writer) {
+       for s := range in {
+               fmt.Fprintln(out, s)
        }
 }
 
-func FindStat(cgroup_root string, cgroup_parent string, container_id string, statgroup string, stat string) string {
-       var path string
-       path = fmt.Sprintf("%s/%s/%s/%s/%s.%s", cgroup_root, statgroup, cgroup_parent, container_id, statgroup, stat)
-       if _, err := os.Stat(path); err == nil {
-               return path
+var logChan chan string
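+
+// LogPrintf sends a "crunchstat:"-prefixed message to logChan, or
+// drops it if logChan has not been set up.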
+func LogPrintf(format string, args ...interface{}) {
+       if logChan == nil {
+               return
        }
-       path = fmt.Sprintf("%s/%s/%s/%s.%s", cgroup_root, cgroup_parent, container_id, statgroup, stat)
-       if _, err := os.Stat(path); err == nil {
-               return path
+       logChan <- fmt.Sprintf("crunchstat: " + format, args...)
+}
+
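+// ReadAllOrWarn reads all of in, logging a warning via LogPrintf if
+// the read fails.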
+func ReadAllOrWarn(in *os.File) ([]byte, error) {
+       content, err := ioutil.ReadAll(in)
+       if err != nil {
+               LogPrintf("read %s: %s", in.Name(), err)
        }
-       path = fmt.Sprintf("%s/%s/%s.%s", cgroup_root, statgroup, statgroup, stat)
-       if _, err := os.Stat(path); err == nil {
-               return path
+       return content, err
+}
+
+var reportedStatFile = map[string]string{}
+
+// Open the cgroup stats file in /sys/fs corresponding to the target
+// cgroup, and return an *os.File. If no stats file is available,
+// return nil.
+//
+// TODO: Instead of trying all options, choose a process in the
+// container, and read /proc/PID/cgroup to determine the appropriate
+// cgroup root for the given statgroup. (This will avoid falling back
+// to host-level stats during container setup and teardown.)
+func OpenStatFile(cgroup Cgroup, statgroup string, stat string) (*os.File, error) {
+       var paths = []string{
+               fmt.Sprintf("%s/%s/%s/%s/%s", cgroup.root, statgroup, cgroup.parent, cgroup.cid, stat),
+               fmt.Sprintf("%s/%s/%s/%s", cgroup.root, cgroup.parent, cgroup.cid, stat),
+               fmt.Sprintf("%s/%s/%s", cgroup.root, statgroup, stat),
+               fmt.Sprintf("%s/%s", cgroup.root, stat),
+       }
+       var path string
+       var file *os.File
+       var err error
+       for _, path = range paths {
+               file, err = os.Open(path)
+               if err == nil {
+                       break
+               } else {
+                       path = ""
+               }
        }
-       path = fmt.Sprintf("%s/%s.%s", cgroup_root, statgroup, stat)
-       if _, err := os.Stat(path); err == nil {
-               return path
+       if pathWas, ok := reportedStatFile[stat]; !ok || pathWas != path {
+               // Log whenever we start using a new/different cgroup
+               // stat file for a given statistic. This typically
+               // happens 1 to 3 times per statistic, depending on
+               // whether we happen to collect stats [a] before any
+               // processes have been created in the container and
+               // [b] after all contained processes have exited.
+               reportedStatFile[stat] = path
+               if path == "" {
+                       LogPrintf("did not find stats file: stat %s, statgroup %s, cid %s, parent %s, root %s", stat, statgroup, cgroup.cid, cgroup.parent, cgroup.root)
+               } else {
+                       LogPrintf("reading stats from %s", path)
+               }
        }
-       return ""
+       return file, err
 }
 
-func PollCgroupStats(cgroup_root string, cgroup_parent string, container_id string, stderr chan string, poll int64) {
-       //var last_usage int64 = 0
-       var last_user int64 = 0
-       var last_sys int64 = 0
-       var last_cpucount int64 = 0
-
-       type Disk struct {
-               last_read  int64
-               next_read  int64
-               last_write int64
-               next_write int64
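+// GetContainerNetStats picks one process in the container and returns
+// the contents of its /proc/PID/net/dev.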
+func GetContainerNetStats(cgroup Cgroup) (io.Reader, error) {
+       procsFile, err := OpenStatFile(cgroup, "cpuacct", "cgroup.procs")
+       if err != nil {
+               return nil, err
        }
+       defer procsFile.Close()
+       reader := bufio.NewScanner(procsFile)
+       for reader.Scan() {
+               taskPid := reader.Text()
+               statsFilename := fmt.Sprintf("/proc/%s/net/dev", taskPid)
+               stats, err := ioutil.ReadFile(statsFilename)
+               if err != nil {
+                       LogPrintf("read %s: %s", statsFilename, err)
+                       continue
+               }
+               return strings.NewReader(string(stats)), nil
+       }
+       return nil, errors.New("Could not read stats for any proc in container")
+}
 
-       disk := make(map[string]*Disk)
-
-       //cpuacct_usage := FindStat(cgroup_path, "cpuacct", "usage")
-       cpuacct_stat := FindStat(cgroup_root, cgroup_parent, container_id, "cpuacct", "stat")
-       blkio_io_service_bytes := FindStat(cgroup_root, cgroup_parent, container_id, "blkio", "io_service_bytes")
-       cpuset_cpus := FindStat(cgroup_root, cgroup_parent, container_id, "cpuset", "cpus")
-       memory_stat := FindStat(cgroup_root, cgroup_parent, container_id, "memory", "stat")
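+// IoSample is one sample of a device's cumulative transmit/receive
+// (or write/read) byte counters.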
+type IoSample struct {
+       sampleTime time.Time
+       txBytes    int64
+       rxBytes    int64
+}
 
-       if cpuacct_stat != "" {
-               stderr <- fmt.Sprintf("crunchstat: reading stats from %s", cpuacct_stat)
-       }
-       if blkio_io_service_bytes != "" {
-               stderr <- fmt.Sprintf("crunchstat: reading stats from %s", blkio_io_service_bytes)
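+// DoBlkIoStats logs cumulative per-device read/write byte counts from
+// blkio.io_service_bytes, with deltas against lastSample.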
+func DoBlkIoStats(cgroup Cgroup, lastSample map[string]IoSample) {
+       c, err := OpenStatFile(cgroup, "blkio", "blkio.io_service_bytes")
+       if err != nil {
+               return
        }
-       if cpuset_cpus != "" {
-               stderr <- fmt.Sprintf("crunchstat: reading stats from %s", cpuset_cpus)
+       defer c.Close()
+       b := bufio.NewScanner(c)
+       var sampleTime = time.Now()
+       newSamples := make(map[string]IoSample)
+       for b.Scan() {
+               var device, op string
+               var val int64
+               if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &val); err != nil {
+                       continue
+               }
+               var thisSample IoSample
+               var ok bool
+               if thisSample, ok = newSamples[device]; !ok {
+                       thisSample = IoSample{sampleTime, -1, -1}
+               }
+               switch op {
+               case "Read":
+                       thisSample.rxBytes = val
+               case "Write":
+                       thisSample.txBytes = val
+               }
+               newSamples[device] = thisSample
        }
-       if memory_stat != "" {
-               stderr <- fmt.Sprintf("crunchstat: reading stats from %s", memory_stat)
+       for dev, sample := range newSamples {
+               if sample.txBytes < 0 || sample.rxBytes < 0 {
+                       continue
+               }
+               delta := ""
+               if prev, ok := lastSample[dev]; ok {
+                       delta = fmt.Sprintf(" -- interval %.4f seconds %d write %d read",
+                               sample.sampleTime.Sub(prev.sampleTime).Seconds(),
+                               sample.txBytes-prev.txBytes,
+                               sample.rxBytes-prev.rxBytes)
+               }
+               LogPrintf("blkio:%s %d write %d read%s", dev, sample.txBytes, sample.rxBytes, delta)
+               lastSample[dev] = sample
        }
+}
 
-       var elapsed int64 = poll
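+// MemSample is one snapshot of the cgroup's memory.stat counters.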
+type MemSample struct {
+       sampleTime time.Time
+       memStat    map[string]int64
+}
 
-       for {
-               /*{
-                       c, _ := os.Open(cpuacct_usage)
-                       b, _ := ioutil.ReadAll(c)
-                       var next int64
-                       fmt.Sscanf(string(b), "%d", &next)
-                       if last_usage != 0 {
-                               stderr <- fmt.Sprintf("crunchstat: cpuacct.usage %v", (next-last_usage)/10000000)
-                       }
-                       //fmt.Printf("usage %d %d %d %d%%\n", last_usage, next, next-last_usage, (next-last_usage)/10000000)
-                       last_usage = next
-                       c.Close()
-               }*/
-               var cpus int64 = 0
-               if cpuset_cpus != "" {
-                       c, _ := os.Open(cpuset_cpus)
-                       b, _ := ioutil.ReadAll(c)
-                       sp := strings.Split(string(b), ",")
-                       for _, v := range sp {
-                               var min, max int64
-                               n, _ := fmt.Sscanf(v, "%d-%d", &min, &max)
-                               if n == 2 {
-                                       cpus += (max - min) + 1
-                               } else {
-                                       cpus += 1
-                               }
-                       }
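+// DoMemoryStats logs the cache, swap, pgmajfault, and rss counters
+// from the cgroup's memory.stat.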
+func DoMemoryStats(cgroup Cgroup) {
+       c, err := OpenStatFile(cgroup, "memory", "memory.stat")
+       if err != nil {
+               return
+       }
+       defer c.Close()
+       b := bufio.NewScanner(c)
+       thisSample := MemSample{time.Now(), make(map[string]int64)}
+       wantStats := [...]string{"cache", "swap", "pgmajfault", "rss"}
+       for b.Scan() {
+               var stat string
+               var val int64
+               if _, err := fmt.Sscanf(string(b.Text()), "%s %d", &stat, &val); err != nil {
+                       continue
+               }
+               thisSample.memStat[stat] = val
+       }
+       var outstat bytes.Buffer
+       for _, key := range wantStats {
+               if val, ok := thisSample.memStat[key]; ok {
+                       outstat.WriteString(fmt.Sprintf(" %d %s", val, key))
+               }
+       }
+       LogPrintf("mem%s", outstat.String())
+}
 
-                       if cpus != last_cpucount {
-                               stderr <- fmt.Sprintf("crunchstat: cpuset.cpus %v", cpus)
-                       }
-                       last_cpucount = cpus
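+// DoNetworkStats logs cumulative tx/rx byte counts for each network
+// interface other than loopback, with deltas against lastSample.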
+func DoNetworkStats(cgroup Cgroup, lastSample map[string]IoSample) {
+       sampleTime := time.Now()
+       stats, err := GetContainerNetStats(cgroup)
+       if err != nil {
+               return
+       }
 
-                       c.Close()
+       scanner := bufio.NewScanner(stats)
+       for scanner.Scan() {
+               var ifName string
+               var rx, tx int64
+               words := strings.Fields(scanner.Text())
+               if len(words) != 17 {
+                       // Skip lines with wrong format
+                       continue
                }
-               if cpus == 0 {
-                       cpus = 1
+               ifName = strings.TrimRight(words[0], ":")
+               if ifName == "lo" || ifName == "" {
+                       // Skip loopback interface and lines with wrong format
+                       continue
                }
-               if cpuacct_stat != "" {
-                       c, _ := os.Open(cpuacct_stat)
-                       b, _ := ioutil.ReadAll(c)
-                       var next_user int64
-                       var next_sys int64
-                       fmt.Sscanf(string(b), "user %d\nsystem %d", &next_user, &next_sys)
-                       c.Close()
-
-                       if last_user != 0 {
-                               user_diff := next_user - last_user
-                               sys_diff := next_sys - last_sys
-                               // Assume we're reading stats based on 100
-                               // jiffies per second.  Because the elapsed
-                               // time is in milliseconds, we need to boost
-                               // that to 1000 jiffies per second, then boost
-                               // it by another 100x to get a percentage, then
-                               // finally divide by the actual elapsed time
-                               // and the number of cpus to get average load
-                               // over the polling period.
-                               user_pct := (user_diff * 10 * 100) / (elapsed * cpus)
-                               sys_pct := (sys_diff * 10 * 100) / (elapsed * cpus)
-
-                               stderr <- fmt.Sprintf("crunchstat: cpuacct.stat user %v", user_pct)
-                               stderr <- fmt.Sprintf("crunchstat: cpuacct.stat sys %v", sys_pct)
-                       }
-
-                       /*fmt.Printf("user %d %d %d%%\n", last_user, next_user, next_user-last_user)
-                       fmt.Printf("sys %d %d %d%%\n", last_sys, next_sys, next_sys-last_sys)
-                       fmt.Printf("sum %d%%\n", (next_user-last_user)+(next_sys-last_sys))*/
-                       last_user = next_user
-                       last_sys = next_sys
+               if tx, err = strconv.ParseInt(words[9], 10, 64); err != nil {
+                       continue
                }
-               if blkio_io_service_bytes != "" {
-                       c, _ := os.Open(blkio_io_service_bytes)
-                       b := bufio.NewScanner(c)
-                       var device, op string
-                       var next int64
-                       for b.Scan() {
-                               if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &next); err == nil {
-                                       if disk[device] == nil {
-                                               disk[device] = new(Disk)
-                                       }
-                                       if op == "Read" {
-                                               disk[device].last_read = disk[device].next_read
-                                               disk[device].next_read = next
-                                               if disk[device].last_read > 0 && (disk[device].next_read != disk[device].last_read) {
-                                                       stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s read %v", device, disk[device].next_read-disk[device].last_read)
-                                               }
-                                       }
-                                       if op == "Write" {
-                                               disk[device].last_write = disk[device].next_write
-                                               disk[device].next_write = next
-                                               if disk[device].last_write > 0 && (disk[device].next_write != disk[device].last_write) {
-                                                       stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s write %v", device, disk[device].next_write-disk[device].last_write)
-                                               }
-                                       }
-                               }
-                       }
-                       c.Close()
+               if rx, err = strconv.ParseInt(words[1], 10, 64); err != nil {
+                       continue
                }
+               nextSample := IoSample{}
+               nextSample.sampleTime = sampleTime
+               nextSample.txBytes = tx
+               nextSample.rxBytes = rx
+               var delta string
+               if prev, ok := lastSample[ifName]; ok {
+                       interval := nextSample.sampleTime.Sub(prev.sampleTime).Seconds()
+                       delta = fmt.Sprintf(" -- interval %.4f seconds %d tx %d rx",
+                               interval,
+                               tx-prev.txBytes,
+                               rx-prev.rxBytes)
+               }
+               LogPrintf("net:%s %d tx %d rx%s", ifName, tx, rx, delta)
+               lastSample[ifName] = nextSample
+       }
+}
 
-               if memory_stat != "" {
-                       c, _ := os.Open(memory_stat)
-                       b := bufio.NewScanner(c)
-                       var stat string
-                       var val int64
-                       for b.Scan() {
-                               if _, err := fmt.Sscanf(string(b.Text()), "%s %d", &stat, &val); err == nil {
-                                       if stat == "rss" {
-                                               stderr <- fmt.Sprintf("crunchstat: memory.stat rss %v", val)
-                                       }
-                               }
-                       }
-                       c.Close()
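+// CpuSample is one sample of cumulative user/system CPU time (in
+// seconds) plus the number of CPUs available to the container.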
+type CpuSample struct {
+       hasData    bool // to distinguish the zero value from real data
+       sampleTime time.Time
+       user       float64
+       sys        float64
+       cpus       int64
+}
+
+// Return the number of CPUs available in the container. Return 0 if
+// we can't figure out the real number of CPUs.
+func GetCpuCount(cgroup Cgroup) int64 {
+       cpusetFile, err := OpenStatFile(cgroup, "cpuset", "cpuset.cpus")
+       if err != nil {
+               return 0
+       }
+       defer cpusetFile.Close()
+       b, err := ReadAllOrWarn(cpusetFile)
+       if err != nil {
+               return 0
+       }
+       sp := strings.Split(string(b), ",")
+       cpus := int64(0)
+       for _, v := range sp {
+               var min, max int64
+               n, _ := fmt.Sscanf(v, "%d-%d", &min, &max)
+               if n == 2 {
+                       cpus += (max - min) + 1
+               } else {
+                       cpus += 1
                }
+       }
+       return cpus
+}
 
-               bedtime := time.Now()
-               time.Sleep(time.Duration(poll) * time.Millisecond)
-               morning := time.Now()
-               elapsed = morning.Sub(bedtime).Nanoseconds() / int64(time.Millisecond)
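+// DoCpuStats logs user/system CPU time consumed by the cgroup,
+// converting cpuacct.stat ticks to seconds via user_hz.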
+func DoCpuStats(cgroup Cgroup, lastSample *CpuSample) {
+       statFile, err := OpenStatFile(cgroup, "cpuacct", "cpuacct.stat")
+       if err != nil {
+               return
        }
+       defer statFile.Close()
+       b, err := ReadAllOrWarn(statFile)
+       if err != nil {
+               return
+       }
+
+       nextSample := CpuSample{true, time.Now(), 0, 0, GetCpuCount(cgroup)}
+       var userTicks, sysTicks int64
+       fmt.Sscanf(string(b), "user %d\nsystem %d", &userTicks, &sysTicks)
+       user_hz := float64(C.sysconf(C._SC_CLK_TCK))
+       nextSample.user = float64(userTicks) / user_hz
+       nextSample.sys = float64(sysTicks) / user_hz
+
+       delta := ""
+       if lastSample.hasData {
+               delta = fmt.Sprintf(" -- interval %.4f seconds %.4f user %.4f sys",
+                       nextSample.sampleTime.Sub(lastSample.sampleTime).Seconds(),
+                       nextSample.user-lastSample.user,
+                       nextSample.sys-lastSample.sys)
+       }
+       LogPrintf("cpu %.4f user %.4f sys %d cpus%s",
+               nextSample.user, nextSample.sys, nextSample.cpus, delta)
+       *lastSample = nextSample
 }
 
-func main() {
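+// PollCgroupStats emits memory, CPU, disk, and network stats every
+// poll milliseconds until a value arrives on stop_poll_chan.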
+func PollCgroupStats(cgroup Cgroup, poll int64, stop_poll_chan <-chan bool) {
+       var lastNetSample = map[string]IoSample{}
+       var lastDiskSample = map[string]IoSample{}
+       var lastCpuSample = CpuSample{}
+
+       poll_chan := make(chan bool, 1)
+       go func() {
+               // Send periodic poll events.
+               poll_chan <- true
+               for {
+                       time.Sleep(time.Duration(poll) * time.Millisecond)
+                       poll_chan <- true
+               }
+       }()
+       for {
+               select {
+               case <-stop_poll_chan:
+                       return
+               case <-poll_chan:
+                       // Emit stats, then select again.
+               }
+               DoMemoryStats(cgroup)
+               DoCpuStats(cgroup, &lastCpuSample)
+               DoBlkIoStats(cgroup, lastDiskSample)
+               DoNetworkStats(cgroup, lastNetSample)
+       }
+}
+
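+// run executes the command given on the command line with its stderr
+// funneled through logChan, polls its cgroup for stats while it runs,
+// and returns the child's exit error, if any.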
+func run(logger *log.Logger) error {
 
        var (
                cgroup_root    string
@@ -238,21 +361,16 @@ func main() {
 
        flag.Parse()
 
-       logger := log.New(os.Stderr, "crunchstat: ", 0)
-
        if cgroup_root == "" {
                logger.Fatal("Must provide -cgroup-root")
        }
 
-       // Make output channel
-       stdout_chan := make(chan string)
-       stderr_chan := make(chan string)
+       logChan = make(chan string, 1)
+       defer close(logChan)
        finish_chan := make(chan bool)
-       defer close(stdout_chan)
-       defer close(stderr_chan)
        defer close(finish_chan)
 
-       go OutputChannel(stdout_chan, stderr_chan)
+       go CopyChanToPipe(logChan, os.Stderr)
 
        var cmd *exec.Cmd
 
@@ -262,9 +380,10 @@ func main() {
 
                logger.Print("Running ", flag.Args())
 
-               // Child process will read from our stdin pipe (we
-               // close our copy below)
+               // Child process will use our stdin and stdout pipes
+               // (we close our copies below)
                cmd.Stdin = os.Stdin
+               cmd.Stdout = os.Stdout
 
                // Forward SIGINT and SIGTERM to inner process
                term := make(chan os.Signal, 1)
@@ -273,69 +392,73 @@ func main() {
                        if cmd.Process != nil {
                                cmd.Process.Signal(catch)
                        }
-                       logger.Print("caught signal:", catch)
+                       logger.Print("caught signal: ", catch)
                }(term)
                signal.Notify(term, syscall.SIGTERM)
                signal.Notify(term, syscall.SIGINT)
 
-               // Funnel stdout and stderr from subprocess to output channels
-               stdout_pipe, err := cmd.StdoutPipe()
-               if err != nil {
-                       logger.Fatal(err)
-               }
-               go ReadLineByLine(stdout_pipe, stdout_chan, finish_chan)
-
+               // Funnel stderr through our channel
                stderr_pipe, err := cmd.StderrPipe()
                if err != nil {
                        logger.Fatal(err)
                }
-               go ReadLineByLine(stderr_pipe, stderr_chan, finish_chan)
+               go CopyPipeToChan(stderr_pipe, logChan, finish_chan)
 
                // Run subprocess
                if err := cmd.Start(); err != nil {
                        logger.Fatal(err)
                }
-       }
 
-       // Close standard input in this (parent) process
-       os.Stdin.Close()
+               // Close stdin/stdout in this (parent) process
+               os.Stdin.Close()
+               os.Stdout.Close()
+       }
 
        // Read the cid file
        var container_id string
        if cgroup_cidfile != "" {
                // wait up to 'wait' seconds for the cid file to appear
+               ok := false
                var i time.Duration
                for i = 0; i < time.Duration(wait)*time.Second; i += (100 * time.Millisecond) {
-                       f, err := os.Open(cgroup_cidfile)
-                       if err == nil {
-                               cid, err2 := ioutil.ReadAll(f)
-                               if err2 == nil && len(cid) > 0 {
-                                       container_id = string(cid)
-                                       f.Close()
-                                       break
-                               }
+                       cid, err := ioutil.ReadFile(cgroup_cidfile)
+                       if err == nil && len(cid) > 0 {
+                               ok = true
+                               container_id = string(cid)
+                               break
                        }
                        time.Sleep(100 * time.Millisecond)
                }
-               if cgroup_root == "" {
+               if !ok {
                        logger.Printf("Could not read cid file %s", cgroup_cidfile)
                }
        }
 
-       go PollCgroupStats(cgroup_root, cgroup_parent, container_id, stderr_chan, poll)
+       stop_poll_chan := make(chan bool, 1)
+       cgroup := Cgroup{cgroup_root, cgroup_parent, container_id}
+       go PollCgroupStats(cgroup, poll, stop_poll_chan)
 
-       // Wait for each of stdout and stderr to drain
-       <-finish_chan
+       // When the child exits, tell the polling goroutine to stop.
+       defer func() { stop_poll_chan <- true }()
+
+       // Wait for CopyPipeToChan to consume child's stderr pipe
        <-finish_chan
 
-       if err := cmd.Wait(); err != nil {
+       return cmd.Wait()
+}
+
+func main() {
+       logger := log.New(os.Stderr, "crunchstat: ", 0)
+       if err := run(logger); err != nil {
                if exiterr, ok := err.(*exec.ExitError); ok {
                        // The program has exited with an exit code != 0
 
-                       // This works on both Unix and Windows. Although package
-                       // syscall is generally platform dependent, WaitStatus is
-                       // defined for both Unix and Windows and in both cases has
-                       // an ExitStatus() method with the same signature.
+                       // This works on both Unix and
+                       // Windows. Although package syscall is
+                       // generally platform dependent, WaitStatus is
+                       // defined for both Unix and Windows and in
+                       // both cases has an ExitStatus() method with
+                       // the same signature.
                        if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
                                os.Exit(status.ExitStatus())
                        }
diff --git a/services/crunchstat/crunchstat_test.go b/services/crunchstat/crunchstat_test.go
new file mode 100644 (file)
index 0000000..48988a1
--- /dev/null
@@ -0,0 +1,50 @@
+package main
+
+import (
+       "os"
+       "regexp"
+       "testing"
+)
+
+func TestReadAllOrWarnFail(t *testing.T) {
+       logChan = make(chan string)
+       go func() {
+               defer close(logChan)
+               // The special file /proc/self/mem can be opened for
+               // reading, but reading from byte 0 returns an error.
+               f, err := os.Open("/proc/self/mem")
+               if err != nil {
+                       t.Fatalf("Opening /proc/self/mem: %s", err)
+               }
+               if x, err := ReadAllOrWarn(f); err == nil {
+                       t.Fatalf("Expected error, got %v", x)
+               }
+       }()
+       if _, ok := <-logChan; !ok {
+               t.Fatalf("Expected a warning about the failed read")
+       }
+       if msg, ok := <-logChan; ok {
+               t.Fatalf("Expected channel to close, got %s", msg)
+       }
+}
+
+func TestReadAllOrWarnSuccess(t *testing.T) {
+       logChan = make(chan string)
+       go func() {
+               defer close(logChan)
+               f, err := os.Open("./crunchstat_test.go")
+               if err != nil {
+                       t.Fatalf("Opening ./crunchstat_test.go: %s", err)
+               }
+               data, err := ReadAllOrWarn(f)
+               if err != nil {
+                       t.Fatalf("got error %s", err)
+               }
+               if matched, err := regexp.MatchString("^package main\n", string(data)); err != nil || !matched {
+                       t.Fatalf("data failed regexp match (err=%v)", err)
+               }
+       }()
+       if msg, ok := <-logChan; ok {
+               t.Fatalf("Expected channel to close, got %s", msg)
+       }
+}
index c261bc31972079b1119f300b506945e893e0e4aa..ce7e42b9750138c7f6ac2353b66513cec70b52bd 100644 (file)
@@ -17,6 +17,7 @@ import apiclient
 import json
 import logging
 import time
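+# Pre-import _strptime so that later time.strptime() calls from worker
+# threads do not trip over CPython's thread-unsafe lazy import of this
+# module (Python issue 7980).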
+import _strptime
 import calendar
 import threading
 from arvados.util import portable_data_hash_pattern, uuid_pattern, collection_uuid_pattern, group_uuid_pattern, user_uuid_pattern, link_uuid_pattern
index a927b87ea6575812c1b5fb728e50e946b6b46589..de4ccafc28dd90f6ddbab4690bcc02976dcfa4af 100644 (file)
@@ -80,12 +80,12 @@ func main() {
 
        if pidfile != "" {
                f, err := os.Create(pidfile)
-               if err == nil {
-                       fmt.Fprint(f, os.Getpid())
-                       f.Close()
-               } else {
-                       log.Printf("Error writing pid file (%s): %s", pidfile, err.Error())
+               if err != nil {
+                       log.Fatalf("Error writing pid file (%s): %s", pidfile, err.Error())
                }
+               fmt.Fprint(f, os.Getpid())
+               f.Close()
+               defer os.Remove(pidfile)
        }
 
        kc.Want_replicas = default_replicas
@@ -104,30 +104,17 @@ func main() {
                s := <-sig
                log.Println("caught signal:", s)
                listener.Close()
+               listener = nil
        }(term)
        signal.Notify(term, syscall.SIGTERM)
        signal.Notify(term, syscall.SIGINT)
 
-       if pidfile != "" {
-               f, err := os.Create(pidfile)
-               if err == nil {
-                       fmt.Fprint(f, os.Getpid())
-                       f.Close()
-               } else {
-                       log.Printf("Error writing pid file (%s): %s", pidfile, err.Error())
-               }
-       }
-
        log.Printf("Arvados Keep proxy started listening on %v with server list %v", listener.Addr(), kc.ServiceRoots())
 
        // Start listening for requests.
        http.Serve(listener, MakeRESTRouter(!no_get, !no_put, &kc))
 
        log.Println("shutting down")
-
-       if pidfile != "" {
-               os.Remove(pidfile)
-       }
 }
 
 type ApiTokenCache struct {
index 5944b2c21d0d27d776c6ab54cc3d444160aaa16f..f6d163c1f19fe7449adb764f7d98f728438b3fa8 100644 (file)
@@ -34,6 +34,25 @@ func pythonDir() string {
        return fmt.Sprintf("%s/../../sdk/python/tests", cwd)
 }
 
+// Wait (up to 1 second) for keepproxy to listen on a port. This
+// avoids a race condition where we hit a "connection refused" error
+// because we start testing the proxy too soon.
+func waitForListener() {
+       const ms = 5
+       for i := 0; listener == nil && i < 1000; i += ms {
+               time.Sleep(ms * time.Millisecond)
+       }
+       if listener == nil {
+               log.Fatalf("Timed out waiting for listener to start")
+       }
+}
+
+func closeListener() {
+       if listener != nil {
+               listener.Close()
+       }
+}
+
 func (s *ServerRequiredSuite) SetUpSuite(c *C) {
        cwd, _ := os.Getwd()
        defer os.Chdir(cwd)
@@ -155,7 +174,8 @@ func (s *ServerRequiredSuite) TestPutAskGet(c *C) {
        os.Setenv("ARVADOS_EXTERNAL_CLIENT", "")
        log.Print("keepclient created")
 
-       defer listener.Close()
+       waitForListener()
+       defer closeListener()
 
        hash := fmt.Sprintf("%x", md5.Sum([]byte("foo")))
        var hash2 string
@@ -199,7 +219,8 @@ func (s *ServerRequiredSuite) TestPutAskGetForbidden(c *C) {
        log.Print("TestPutAndGet start")
 
        kc := runProxy(c, []string{"keepproxy"}, "123abc", 29951)
-       defer listener.Close()
+       waitForListener()
+       defer closeListener()
 
        log.Print("keepclient created")
 
@@ -240,7 +261,8 @@ func (s *ServerRequiredSuite) TestGetDisabled(c *C) {
        log.Print("TestGetDisabled start")
 
        kc := runProxy(c, []string{"keepproxy", "-no-get"}, "4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h", 29952)
-       defer listener.Close()
+       waitForListener()
+       defer closeListener()
 
        hash := fmt.Sprintf("%x", md5.Sum([]byte("baz")))
 
@@ -279,7 +301,8 @@ func (s *ServerRequiredSuite) TestPutDisabled(c *C) {
        log.Print("TestPutDisabled start")
 
        kc := runProxy(c, []string{"keepproxy", "-no-put"}, "4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h", 29953)
-       defer listener.Close()
+       waitForListener()
+       defer closeListener()
 
        {
                hash2, rep, err := kc.PutB([]byte("quux"))
index 0829ce9f92eebfbc7ab77ee78eb4730f36050441..ca609157aa670cb13f3f6325fd1b596a85ec7f70 100644 (file)
@@ -224,21 +224,15 @@ func TestPutHandler(t *testing.T) {
 }
 
 // Test /index requests:
-//   - enforce_permissions off | unauthenticated /index request
-//   - enforce_permissions off | unauthenticated /index/prefix request
-//   - enforce_permissions off | authenticated /index request        | non-superuser
-//   - enforce_permissions off | authenticated /index/prefix request | non-superuser
-//   - enforce_permissions off | authenticated /index request        | superuser
-//   - enforce_permissions off | authenticated /index/prefix request | superuser
-//   - enforce_permissions on  | unauthenticated /index request
-//   - enforce_permissions on  | unauthenticated /index/prefix request
-//   - enforce_permissions on  | authenticated /index request        | non-superuser
-//   - enforce_permissions on  | authenticated /index/prefix request | non-superuser
-//   - enforce_permissions on  | authenticated /index request        | superuser
-//   - enforce_permissions on  | authenticated /index/prefix request | superuser
+//   - unauthenticated /index request
+//   - unauthenticated /index/prefix request
+//   - authenticated   /index request        | non-superuser
+//   - authenticated   /index/prefix request | non-superuser
+//   - authenticated   /index request        | superuser
+//   - authenticated   /index/prefix request | superuser
 //
 // The only /index requests that should succeed are those issued by the
-// superuser when enforce_permissions = true.
+// superuser. They should pass regardless of the value of enforce_permissions.
 //
 func TestIndexHandler(t *testing.T) {
        defer teardown()
@@ -289,95 +283,58 @@ func TestIndexHandler(t *testing.T) {
                api_token: data_manager_token,
        }
 
-       // ----------------------------
-       // enforce_permissions disabled
-       // All /index requests should fail.
-       enforce_permissions = false
+       // -------------------------------------------------------------
+       // Only the superuser should be allowed to issue /index requests.
+
+       // ---------------------------
+       // enforce_permissions enabled
+       // This setting should not affect the outcome of any of these requests.
+       enforce_permissions = true
 
        // unauthenticated /index request
-       // => PermissionError
+       // => UnauthorizedError
        response := IssueRequest(rest, unauthenticated_req)
        ExpectStatusCode(t,
-               "enforce_permissions off, unauthenticated request",
-               PermissionError.HTTPCode,
+               "enforce_permissions on, unauthenticated request",
+               UnauthorizedError.HTTPCode,
                response)
 
        // unauthenticated /index/prefix request
-       // => PermissionError
+       // => UnauthorizedError
        response = IssueRequest(rest, unauth_prefix_req)
        ExpectStatusCode(t,
-               "enforce_permissions off, unauthenticated /index/prefix request",
-               PermissionError.HTTPCode,
+               "permissions on, unauthenticated /index/prefix request",
+               UnauthorizedError.HTTPCode,
                response)
 
        // authenticated /index request, non-superuser
-       // => PermissionError
+       // => UnauthorizedError
        response = IssueRequest(rest, authenticated_req)
        ExpectStatusCode(t,
-               "enforce_permissions off, authenticated request, non-superuser",
-               PermissionError.HTTPCode,
+               "permissions on, authenticated request, non-superuser",
+               UnauthorizedError.HTTPCode,
                response)
 
        // authenticated /index/prefix request, non-superuser
-       // => PermissionError
+       // => UnauthorizedError
        response = IssueRequest(rest, auth_prefix_req)
        ExpectStatusCode(t,
-               "enforce_permissions off, authenticated /index/prefix request, non-superuser",
-               PermissionError.HTTPCode,
+               "permissions on, authenticated /index/prefix request, non-superuser",
+               UnauthorizedError.HTTPCode,
                response)
 
-       // authenticated /index request, superuser
-       // => PermissionError
+       // superuser /index request
+       // => OK
        response = IssueRequest(rest, superuser_req)
        ExpectStatusCode(t,
-               "enforce_permissions off, superuser request",
-               PermissionError.HTTPCode,
-               response)
-
-       // superuser /index/prefix request
-       // => PermissionError
-       response = IssueRequest(rest, superuser_prefix_req)
-       ExpectStatusCode(t,
-               "enforce_permissions off, superuser /index/prefix request",
-               PermissionError.HTTPCode,
-               response)
-
-       // ---------------------------
-       // enforce_permissions enabled
-       // Only the superuser should be allowed to issue /index requests.
-       enforce_permissions = true
-
-       // unauthenticated /index request
-       // => PermissionError
-       response = IssueRequest(rest, unauthenticated_req)
-       ExpectStatusCode(t,
-               "enforce_permissions on, unauthenticated request",
-               PermissionError.HTTPCode,
-               response)
-
-       // unauthenticated /index/prefix request
-       // => PermissionError
-       response = IssueRequest(rest, unauth_prefix_req)
-       ExpectStatusCode(t,
-               "permissions on, unauthenticated /index/prefix request",
-               PermissionError.HTTPCode,
-               response)
-
-       // authenticated /index request, non-superuser
-       // => PermissionError
-       response = IssueRequest(rest, authenticated_req)
-       ExpectStatusCode(t,
-               "permissions on, authenticated request, non-superuser",
-               PermissionError.HTTPCode,
+               "permissions on, superuser request",
+               http.StatusOK,
                response)
 
-       // authenticated /index/prefix request, non-superuser
-       // => PermissionError
-       response = IssueRequest(rest, auth_prefix_req)
-       ExpectStatusCode(t,
-               "permissions on, authenticated /index/prefix request, non-superuser",
-               PermissionError.HTTPCode,
-               response)
+       // ----------------------------
+       // enforce_permissions disabled
+       // Valid requests should still succeed.
+       enforce_permissions = false
 
        // superuser /index request
        // => OK
@@ -387,6 +344,8 @@ func TestIndexHandler(t *testing.T) {
                http.StatusOK,
                response)
 
        expected := `^` + TEST_HASH + `\+\d+ \d+\n` +
                TEST_HASH_2 + `\+\d+ \d+\n$`
        match, _ := regexp.MatchString(expected, response.Body.String())
index 1ef991565d221b31fa1e83d365df21b2567e4db8..27d1e908c56ab46d535fc97bc1f2b79bf349387a 100644 (file)
@@ -244,18 +244,15 @@ func PutBlockHandler(resp http.ResponseWriter, req *http.Request) {
 //     A HandleFunc to address /index and /index/{prefix} requests.
 //
 func IndexHandler(resp http.ResponseWriter, req *http.Request) {
-       prefix := mux.Vars(req)["prefix"]
-
-       // Only the data manager may issue /index requests,
-       // and only if enforce_permissions is enabled.
-       // All other requests return 403 Forbidden.
-       api_token := GetApiToken(req)
-       if !enforce_permissions ||
-               api_token == "" ||
-               data_manager_token != api_token {
-               http.Error(resp, PermissionError.Error(), PermissionError.HTTPCode)
+       // Reject unauthorized requests.
+       if !IsDataManagerToken(GetApiToken(req)) {
+               http.Error(resp, UnauthorizedError.Error(), UnauthorizedError.HTTPCode)
+               log.Printf("%s %s: %s\n", req.Method, req.URL, UnauthorizedError.Error())
                return
        }
+
+       prefix := mux.Vars(req)["prefix"]
+
        var index string
        for _, vol := range KeepVM.Volumes() {
                index = index + vol.Index(prefix)
index 143815576b33f077a89fead6e46efb795de20f9d..1048f53130315525da0a80c1214207a9eb101cd5 100644 (file)
@@ -80,23 +80,24 @@ func SignLocator(blob_locator string, api_token string, expiry time.Time) string
                "@" + timestamp_hex
 }
 
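+// signedLocatorRe extracts the blob hash, signature, and expiry
+// timestamp from a locator's +A<signature>@<timestamp> hint.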
+var signedLocatorRe = regexp.MustCompile(`^([[:xdigit:]]{32}).*\+A([[:xdigit:]]{40})@([[:xdigit:]]{8})`)
+
 // VerifySignature returns true if the signature on the signed_locator
 // can be verified using the given api_token.
 func VerifySignature(signed_locator string, api_token string) bool {
-       if re, err := regexp.Compile(`^([a-f0-9]{32}(\+[0-9]+)?).*\+A[[:xdigit:]]+@([[:xdigit:]]{8})`); err == nil {
-               if matches := re.FindStringSubmatch(signed_locator); matches != nil {
-                       blob_locator := matches[1]
-                       timestamp_hex := matches[3]
-                       if expire_ts, err := ParseHexTimestamp(timestamp_hex); err == nil {
-                               // Fail signatures with expired timestamps.
-                               if expire_ts.Before(time.Now()) {
-                                       return false
-                               }
-                               return signed_locator == SignLocator(blob_locator, api_token, expire_ts)
-                       }
-               }
+       matches := signedLocatorRe.FindStringSubmatch(signed_locator)
+       if matches == nil {
+               // Could not find a permission signature at all
+               return false
+       }
+       blob_hash := matches[1]
+       sig_hex := matches[2]
+       exp_hex := matches[3]
+       if exp_time, err := ParseHexTimestamp(exp_hex); err != nil || exp_time.Before(time.Now()) {
+               // Signature is expired, or timestamp is unparseable
+               return false
        }
-       return false
+       return sig_hex == MakePermSignature(blob_hash, api_token, exp_hex)
 }
 
 func ParseHexTimestamp(timestamp_hex string) (ts time.Time, err error) {
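For reference, the locator grammar that the new signedLocatorRe accepts is easy to mirror outside Go. A small sketch of the parsing and expiry check (illustration only; the actual HMAC comparison lives in MakePermSignature, which is not shown in this hunk):

import re
import time

# Same shape as signedLocatorRe: 32-hex hash, anything (other hints),
# then "+A<40-hex signature>@<8-hex expiry timestamp>".
SIGNED_LOCATOR_RE = re.compile(
    r'^([0-9a-fA-F]{32}).*\+A([0-9a-fA-F]{40})@([0-9a-fA-F]{8})')

def parse_signed_locator(signed_locator):
    match = SIGNED_LOCATOR_RE.match(signed_locator)
    if match is None:
        return None  # no permission signature at all
    blob_hash, sig_hex, exp_hex = match.groups()
    if int(exp_hex, 16) < time.time():
        return None  # signature has expired
    return blob_hash, sig_hex, exp_hex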
index d1c6b50496d02a8cb982c1a295962bf5dcfa884a..d0081cd01014b69abf9910ffad059077d19549ee 100644 (file)
@@ -5,7 +5,7 @@ import (
        "time"
 )
 
-var (
+const (
        known_hash    = "acbd18db4cc2f85cedef654fccc4a4d8"
        known_locator = known_hash + "+3"
        known_token   = "hocfupkn2pjhrpgp2vxv8rsku7tvtx49arbc9s4bvu7p7wxqvk"
@@ -18,7 +18,8 @@ var (
                "786u5rw2a9gx743dj3fgq2irk"
        known_signature      = "257f3f5f5f0a4e4626a18fc74bd42ec34dcb228a"
        known_timestamp      = "7fffffff"
-       known_signed_locator = known_locator + "+A" + known_signature + "@" + known_timestamp
+       known_sig_hint       = "+A" + known_signature + "@" + known_timestamp
+       known_signed_locator = known_locator + known_sig_hint
 )
 
 func TestSignLocator(t *testing.T) {
@@ -43,14 +44,34 @@ func TestVerifySignature(t *testing.T) {
        }
 }
 
+func TestVerifySignatureExtraHints(t *testing.T) {
+       PermissionSecret = []byte(known_key)
+       defer func() { PermissionSecret = nil }()
+
+       if !VerifySignature(known_locator + "+K@xyzzy" + known_sig_hint, known_token) {
+               t.Fatal("Verify cannot handle hint before permission signature")
+       }
+
+       if !VerifySignature(known_locator + known_sig_hint + "+Zfoo", known_token) {
+               t.Fatal("Verify cannot handle hint after permission signature")
+       }
+
+       if !VerifySignature(known_locator + "+K@xyzzy" + known_sig_hint + "+Zfoo", known_token) {
+               t.Fatal("Verify cannot handle hints around permission signature")
+       }
+}
+
 // The size hint on the locator string should not affect signature validation.
 func TestVerifySignatureWrongSize(t *testing.T) {
        PermissionSecret = []byte(known_key)
        defer func() { PermissionSecret = nil }()
 
-       signed_locator_wrong_size := known_hash + "+999999+A" + known_signature + "@" + known_timestamp
-       if !VerifySignature(signed_locator_wrong_size, known_token) {
-               t.Fail()
+       if !VerifySignature(known_hash + "+999999" + known_sig_hint, known_token) {
+               t.Fatal("Verify cannot handle incorrect size hint")
+       }
+
+       if !VerifySignature(known_hash + known_sig_hint, known_token) {
+               t.Fatal("Verify cannot handle missing size hint")
        }
 }
 
index 75a75229a6861f83f45b9264bab9d577d7d0f880..4db2a5338400af7aa8c31089df63cf8cf71c502e 100644 (file)
@@ -103,6 +103,7 @@ func (v *UnixVolume) Touch(loc string) error {
        if err != nil {
                return err
        }
+       defer f.Close()
        if e := lockfile(f); e != nil {
                return e
        }
@@ -259,6 +260,7 @@ func (v *UnixVolume) Delete(loc string) error {
        if err != nil {
                return err
        }
+       defer f.Close()
        if e := lockfile(f); e != nil {
                return e
        }
index d6aeac618582a3c555d92b1515654fc8386a6795..7a10fc5c60a3db5c22001f9264a77561016a6181 100644 (file)
@@ -5,6 +5,7 @@ import (
        "fmt"
        "io/ioutil"
        "os"
+       "syscall"
        "testing"
        "time"
 )
@@ -110,32 +111,41 @@ func TestPutTouch(t *testing.T) {
        if err := v.Put(TEST_HASH, TEST_BLOCK); err != nil {
                t.Error(err)
        }
-       old_mtime, err := v.Mtime(TEST_HASH)
-       if err != nil {
-               t.Error(err)
-       }
-       if old_mtime.IsZero() {
-               t.Errorf("v.Mtime(%s) returned a zero mtime\n", TEST_HASH)
+
+       // We'll verify { t0 < threshold < t1 }, where t0 is the
+       // existing block's timestamp on disk before Put() and t1 is
+       // its timestamp after Put().
+       threshold := time.Now().Add(-time.Second)
+
+       // Set the stored block's mtime far enough in the past that we
+       // can see the difference between "timestamp didn't change"
+       // and "timestamp granularity is too low".
+       {
+               oldtime := time.Now().Add(-20 * time.Second).Unix()
+               if err := syscall.Utime(v.blockPath(TEST_HASH),
+                       &syscall.Utimbuf{oldtime, oldtime}); err != nil {
+                       t.Error(err)
+               }
+
+               // Make sure v.Mtime() agrees that the above Utime really worked.
+               if t0, err := v.Mtime(TEST_HASH); err != nil || t0.IsZero() || !t0.Before(threshold) {
+                       t.Errorf("Setting mtime failed: %v, %v", t0, err)
+               }
        }
-       // Sleep for 1s, then put the block again.  The volume
-       // should report a more recent mtime.
-       //
-       // TODO(twp): this would be better handled with a mock Time object.
-       // Alternatively, set the mtime manually to some moment in the past
-       // (maybe a v.SetMtime method?)
-       //
-       time.Sleep(time.Second)
+
+       // Write the same block again.
        if err := v.Put(TEST_HASH, TEST_BLOCK); err != nil {
                t.Error(err)
        }
-       new_mtime, err := v.Mtime(TEST_HASH)
+
+       // Verify threshold < t1
+       t1, err := v.Mtime(TEST_HASH)
        if err != nil {
                t.Error(err)
        }
-
-       if !new_mtime.After(old_mtime) {
-               t.Errorf("v.Put did not update the block mtime:\nold_mtime = %v\nnew_mtime = %v\n",
-                       old_mtime, new_mtime)
+       if t1.Before(threshold) {
+               t.Errorf("t1 %v must be >= threshold %v after v.Put",
+                       t1, threshold)
        }
 }
 
diff --git a/services/nodemanager/.gitignore b/services/nodemanager/.gitignore
new file mode 100644 (file)
index 0000000..d6bc7fe
--- /dev/null
@@ -0,0 +1,5 @@
+*.pyc
+*.egg
+*.egg-info
+build/
+dist/
diff --git a/services/nodemanager/README.rst b/services/nodemanager/README.rst
new file mode 100644 (file)
index 0000000..8713260
--- /dev/null
@@ -0,0 +1,39 @@
+====================
+Arvados Node Manager
+====================
+
+Overview
+--------
+
+This package provides ``arvados-node-manager``.  It dynamically starts
+and stops compute nodes on an Arvados_ cloud installation based on job
+demand.
+
+.. _Arvados: https://arvados.org/
+
+Setup
+-----
+
+1. Install the package.
+
+2. Write a configuration file.  ``doc/ec2.example.cfg`` documents all
+   of the options available, with specific tunables for EC2 clouds.
+
+3. Run ``arvados-node-manager --config YOURCONFIGFILE`` using whatever
+   supervisor you like (e.g., runit).
+
+Testing and Development
+-----------------------
+
+To run tests, just run::
+
+  python setup.py test
+
+Our `hacking guide
+<https://arvados.org/projects/arvados/wiki/Hacking_Node_Manager>`_
+provides an architectural overview of the Arvados Node Manager to help
+you find your way around the source.  The `Lifecycle of an Arvados
+compute node
+<https://arvados.org/projects/arvados/wiki/Lifecycle_of_an_Arvados_compute_node>`_
+page explains how it works in concert with other Arvados components to
+prepare a node for compute work.
diff --git a/services/nodemanager/arvnodeman/__init__.py b/services/nodemanager/arvnodeman/__init__.py
new file mode 100644 (file)
index 0000000..a1ecac7
--- /dev/null
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import _strptime  # See <http://bugs.python.org/issue7980#msg221094>.
+import logging
+
+logger = logging.getLogger('arvnodeman')
+logger.addHandler(logging.NullHandler())
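The package logger ends in a NullHandler, so importing applications see no output (and no "no handlers" warnings) until they opt in. A sketch of opting in from an embedding program (the packaged launcher does this through its Logging config section instead):

import logging

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
    '%(asctime)s %(name)s %(levelname)s: %(message)s'))
logging.getLogger('arvnodeman').addHandler(handler)
logging.getLogger('arvnodeman').setLevel(logging.INFO)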
diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
new file mode 100644 (file)
index 0000000..46a103e
--- /dev/null
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import logging
+import time
+
+import pykka
+
+from .config import actor_class
+
+def _notify_subscribers(response, subscribers):
+    """Send the response to all the subscriber methods.
+
+    If any of the subscriber actors have stopped, remove them from the
+    subscriber set.
+    """
+    dead_subscribers = set()
+    for subscriber in subscribers:
+        try:
+            subscriber(response)
+        except pykka.ActorDeadError:
+            dead_subscribers.add(subscriber)
+    subscribers.difference_update(dead_subscribers)
+
+class RemotePollLoopActor(actor_class):
+    """Abstract actor class to regularly poll a remote service.
+
+    This actor sends regular requests to a remote service, and sends each
+    response to subscribers.  It takes care of error handling, and retrying
+    requests with exponential backoff.
+
+    To use this actor, define CLIENT_ERRORS and the _send_request method.
+    If you also define an _item_key method, this class will support
+    subscribing to a specific item by key in responses.
+    """
+    CLIENT_ERRORS = ()
+
+    def __init__(self, client, timer_actor, poll_wait=60, max_poll_wait=180):
+        super(RemotePollLoopActor, self).__init__()
+        self._client = client
+        self._timer = timer_actor
+        self._logger = logging.getLogger(self.LOGGER_NAME)
+        self._later = self.actor_ref.proxy()
+        self._polling_started = False
+        self.log_prefix = "{} (at {})".format(self.__class__.__name__, id(self))
+        self.min_poll_wait = poll_wait
+        self.max_poll_wait = max_poll_wait
+        self.poll_wait = self.min_poll_wait
+        self.all_subscribers = set()
+        self.key_subscribers = {}
+        if hasattr(self, '_item_key'):
+            self.subscribe_to = self._subscribe_to
+
+    def _start_polling(self):
+        if not self._polling_started:
+            self._polling_started = True
+            self._later.poll()
+
+    def subscribe(self, subscriber):
+        self.all_subscribers.add(subscriber)
+        self._logger.debug("%r subscribed to all events", subscriber)
+        self._start_polling()
+
+    # __init__ exposes this method to the proxy if the subclass defines
+    # _item_key.
+    def _subscribe_to(self, key, subscriber):
+        self.key_subscribers.setdefault(key, set()).add(subscriber)
+        self._logger.debug("%r subscribed to events for '%s'", subscriber, key)
+        self._start_polling()
+
+    def _send_request(self):
+        raise NotImplementedError("subclasses must implement request method")
+
+    def _got_response(self, response):
+        self._logger.debug("%s got response with %d items",
+                           self.log_prefix, len(response))
+        self.poll_wait = self.min_poll_wait
+        _notify_subscribers(response, self.all_subscribers)
+        if hasattr(self, '_item_key'):
+            items = {self._item_key(x): x for x in response}
+            for key, subscribers in self.key_subscribers.iteritems():
+                _notify_subscribers(items.get(key), subscribers)
+
+    def _got_error(self, error):
+        self.poll_wait = min(self.poll_wait * 2, self.max_poll_wait)
+        return "{} got error: {} - waiting {} seconds".format(
+            self.log_prefix, error, self.poll_wait)
+
+    def poll(self, scheduled_start=None):
+        self._logger.debug("%s sending poll", self.log_prefix)
+        start_time = time.time()
+        if scheduled_start is None:
+            scheduled_start = start_time
+        try:
+            response = self._send_request()
+        except Exception as error:
+            errmsg = self._got_error(error)
+            if isinstance(error, self.CLIENT_ERRORS):
+                self._logger.warning(errmsg)
+            else:
+                self._logger.exception(errmsg)
+            next_poll = start_time + self.poll_wait
+        else:
+            self._got_response(response)
+            next_poll = scheduled_start + self.poll_wait
+        end_time = time.time()
+        if next_poll < end_time:  # We've drifted too much; start fresh.
+            next_poll = end_time + self.poll_wait
+        self._timer.schedule(next_poll, self._later.poll, next_poll)
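Concrete pollers only need to fill in the hooks named in the docstring. A minimal, hypothetical subclass as a sketch (the real examples are JobQueueMonitorActor below and the node-list actors in nodelist.py):

class WidgetListMonitorActor(RemotePollLoopActor):
    # Errors worth retrying quietly; anything else is logged as a bug.
    CLIENT_ERRORS = (IOError,)
    LOGGER_NAME = 'arvnodeman.widgets'

    def _send_request(self):
        # Return this poll's full item list; subscribers get it verbatim.
        return self._client.list_widgets()

    def _item_key(self, widget):
        # Defining this enables subscribe_to(key, subscriber).
        return widget['id']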
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
new file mode 100644 (file)
index 0000000..0d4ee7b
--- /dev/null
@@ -0,0 +1,395 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import functools
+import itertools
+import logging
+import time
+
+import pykka
+
+from ..clientactor import _notify_subscribers
+from .. import config
+
+def arvados_node_fqdn(arvados_node, default_hostname='dynamic.compute'):
+    hostname = arvados_node.get('hostname') or default_hostname
+    return '{}.{}'.format(hostname, arvados_node['domain'])
+
+def arvados_node_mtime(node):
+    return time.mktime(time.strptime(node['modified_at'] + 'UTC',
+                                     '%Y-%m-%dT%H:%M:%SZ%Z')) - time.timezone
+
+def timestamp_fresh(timestamp, fresh_time):
+    return (time.time() - timestamp) < fresh_time
+
+class BaseComputeNodeDriver(object):
+    """Abstract base class for compute node drivers.
+
+    libcloud abstracts away many of the differences between cloud providers,
+    but managing compute nodes requires some cloud-specific features (e.g.,
+    on EC2 we use tags to identify compute nodes).  Compute node drivers
+    are responsible for translating the node manager's cloud requests to a
+    specific cloud's vocabulary.
+
+    Subclasses must implement arvados_create_kwargs (to update node
+    creation kwargs with information about the specific Arvados node
+    record), sync_node, and node_start_time.
+    """
+    def __init__(self, auth_kwargs, list_kwargs, create_kwargs, driver_class):
+        self.real = driver_class(**auth_kwargs)
+        self.list_kwargs = list_kwargs
+        self.create_kwargs = create_kwargs
+
+    def __getattr__(self, name):
+        # Proxy non-extension methods to the real driver.
+        if (not name.startswith('_') and not name.startswith('ex_')
+              and hasattr(self.real, name)):
+            return getattr(self.real, name)
+        else:
+            # object has no __getattr__; raise the standard error directly
+            # so the missing attribute's name appears in the message.
+            raise AttributeError(name)
+
+    def search_for(self, term, list_method, key=lambda item: item.id):
+        cache_key = (list_method, term)
+        if cache_key not in self.SEARCH_CACHE:
+            results = [item for item in getattr(self.real, list_method)()
+                       if key(item) == term]
+            count = len(results)
+            if count != 1:
+                raise ValueError("{} returned {} results for '{}'".format(
+                        list_method, count, term))
+            self.SEARCH_CACHE[cache_key] = results[0]
+        return self.SEARCH_CACHE[cache_key]
+
+    def list_nodes(self):
+        return self.real.list_nodes(**self.list_kwargs)
+
+    def arvados_create_kwargs(self, arvados_node):
+        raise NotImplementedError("BaseComputeNodeDriver.arvados_create_kwargs")
+
+    def create_node(self, size, arvados_node):
+        kwargs = self.create_kwargs.copy()
+        kwargs.update(self.arvados_create_kwargs(arvados_node))
+        kwargs['size'] = size
+        return self.real.create_node(**kwargs)
+
+    def sync_node(self, cloud_node, arvados_node):
+        # When a compute node first pings the API server, the API server
+        # will automatically assign some attributes on the corresponding
+        # node record, like hostname.  This method should propagate that
+        # information back to the cloud node appropriately.
+        raise NotImplementedError("BaseComputeNodeDriver.sync_node")
+
+    @classmethod
+    def node_start_time(cls, node):
+        raise NotImplementedError("BaseComputeNodeDriver.node_start_time")
+
+
+ComputeNodeDriverClass = BaseComputeNodeDriver
+
+class ComputeNodeStateChangeBase(config.actor_class):
+    """Base class for actors that change a compute node's state.
+
+    This base class takes care of retrying changes and notifying
+    subscribers when the change is finished.
+    """
+    def __init__(self, logger_name, timer_actor, retry_wait, max_retry_wait):
+        super(ComputeNodeStateChangeBase, self).__init__()
+        self._later = self.actor_ref.proxy()
+        self._timer = timer_actor
+        self._logger = logging.getLogger(logger_name)
+        self.min_retry_wait = retry_wait
+        self.max_retry_wait = max_retry_wait
+        self.retry_wait = retry_wait
+        self.subscribers = set()
+
+    @staticmethod
+    def _retry(errors):
+        """Retry decorator for an actor method that makes remote requests.
+
+        Use this function to decorate an actor method, and pass in a
+        tuple of exceptions to catch.  This decorator will schedule
+        retries of that method with exponential backoff if the
+        original method raises any of the given errors.
+        """
+        def decorator(orig_func):
+            @functools.wraps(orig_func)
+            def wrapper(self, *args, **kwargs):
+                try:
+                    orig_func(self, *args, **kwargs)
+                except errors as error:
+                    self._logger.warning(
+                        "Client error: %s - waiting %s seconds",
+                        error, self.retry_wait)
+                    self._timer.schedule(self.retry_wait,
+                                         getattr(self._later,
+                                                 orig_func.__name__),
+                                         *args, **kwargs)
+                    self.retry_wait = min(self.retry_wait * 2,
+                                          self.max_retry_wait)
+                else:
+                    self.retry_wait = self.min_retry_wait
+            return wrapper
+        return decorator
+
+    def _finished(self):
+        _notify_subscribers(self._later, self.subscribers)
+        self.subscribers = None
+
+    def subscribe(self, subscriber):
+        if self.subscribers is None:
+            try:
+                subscriber(self._later)
+            except pykka.ActorDeadError:
+                pass
+        else:
+            self.subscribers.add(subscriber)
+
+
+class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
+    """Actor to create and set up a cloud compute node.
+
+    This actor prepares an Arvados node record for a new compute node
+    (either creating one or cleaning one passed in), then boots the
+    actual compute node.  It notifies subscribers when the cloud node
+    is successfully created (the last step in the process for Node
+    Manager to handle).
+    """
+    def __init__(self, timer_actor, arvados_client, cloud_client,
+                 cloud_size, arvados_node=None,
+                 retry_wait=1, max_retry_wait=180):
+        super(ComputeNodeSetupActor, self).__init__(
+            'arvnodeman.nodeup', timer_actor, retry_wait, max_retry_wait)
+        self._arvados = arvados_client
+        self._cloud = cloud_client
+        self.cloud_size = cloud_size
+        self.arvados_node = None
+        self.cloud_node = None
+        if arvados_node is None:
+            self._later.create_arvados_node()
+        else:
+            self._later.prepare_arvados_node(arvados_node)
+
+    @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
+    def create_arvados_node(self):
+        self.arvados_node = self._arvados.nodes().create(body={}).execute()
+        self._later.create_cloud_node()
+
+    @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
+    def prepare_arvados_node(self, node):
+        self.arvados_node = self._arvados.nodes().update(
+            uuid=node['uuid'],
+            body={'hostname': None,
+                  'ip_address': None,
+                  'slot_number': None,
+                  'first_ping_at': None,
+                  'last_ping_at': None,
+                  'info': {'ec2_instance_id': None,
+                           'last_action': "Prepared by Node Manager"}}
+            ).execute()
+        self._later.create_cloud_node()
+
+    @ComputeNodeStateChangeBase._retry(config.CLOUD_ERRORS)
+    def create_cloud_node(self):
+        self._logger.info("Creating cloud node with size %s.",
+                          self.cloud_size.name)
+        self.cloud_node = self._cloud.create_node(self.cloud_size,
+                                                  self.arvados_node)
+        self._logger.info("Cloud node %s created.", self.cloud_node.id)
+        self._finished()
+
+    def stop_if_no_cloud_node(self):
+        if self.cloud_node is None:
+            self.stop()
+
+
+class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
+    """Actor to shut down a compute node.
+
+    This actor simply destroys a cloud node, retrying as needed.
+    """
+    def __init__(self, timer_actor, cloud_client, cloud_node,
+                 retry_wait=1, max_retry_wait=180):
+        super(ComputeNodeShutdownActor, self).__init__(
+            'arvnodeman.nodedown', timer_actor, retry_wait, max_retry_wait)
+        self._cloud = cloud_client
+        self.cloud_node = cloud_node
+        self._later.shutdown_node()
+
+    @ComputeNodeStateChangeBase._retry(config.CLOUD_ERRORS)
+    def shutdown_node(self):
+        self._cloud.destroy_node(self.cloud_node)
+        self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
+        self._finished()
+
+
+class ComputeNodeUpdateActor(config.actor_class):
+    """Actor to dispatch one-off cloud management requests.
+
+    This actor receives requests for small cloud updates, and
+    dispatches them to a real driver.  ComputeNodeMonitorActors use
+    this to perform maintenance tasks on themselves.  Having a
+    dedicated actor for this gives us the opportunity to control the
+    flow of requests; e.g., by backing off when errors occur.
+
+    This actor is most like a "traditional" Pykka actor: there's no
+    subscribing, but instead methods return real driver results.  If
+    you're interested in those results, you should get them from the
+    Future that the proxy method returns.  Be prepared to handle exceptions
+    from the cloud driver when you do.
+    """
+    def __init__(self, cloud_factory, max_retry_wait=180):
+        super(ComputeNodeUpdateActor, self).__init__()
+        self._cloud = cloud_factory()
+        self.max_retry_wait = max_retry_wait
+        self.error_streak = 0
+        self.next_request_time = time.time()
+
+    def _throttle_errors(orig_func):
+        @functools.wraps(orig_func)
+        def wrapper(self, *args, **kwargs):
+            throttle_time = self.next_request_time - time.time()
+            if throttle_time > 0:
+                time.sleep(throttle_time)
+            self.next_request_time = time.time()
+            try:
+                result = orig_func(self, *args, **kwargs)
+            except config.CLOUD_ERRORS:
+                self.error_streak += 1
+                self.next_request_time += min(2 ** self.error_streak,
+                                              self.max_retry_wait)
+                raise
+            else:
+                self.error_streak = 0
+                return result
+        return wrapper
+
+    @_throttle_errors
+    def sync_node(self, cloud_node, arvados_node):
+        return self._cloud.sync_node(cloud_node, arvados_node)
+
+
+class ShutdownTimer(object):
+    """Keep track of a cloud node's shutdown windows.
+
+    Instantiate this class with a timestamp of when a cloud node started,
+    and a list of durations (in minutes), alternating between periods when
+    the node must not be shut down and periods when it may be.  The class
+    will tell you when a shutdown window is open, and when the next open
+    window will start.
+    """
+    def __init__(self, start_time, shutdown_windows):
+        # The implementation is easiest if we have an even number of windows,
+        # because then windows always alternate between open and closed.
+        # Rig that up: calculate the first shutdown window based on what's
+        # passed in.  Then, if we were given an odd number of windows, merge
+        # that first window into the last one, since they both represent
+        # closed state.
+        first_window = shutdown_windows[0]
+        shutdown_windows = list(shutdown_windows[1:])
+        self._next_opening = start_time + (60 * first_window)
+        if len(shutdown_windows) % 2:
+            shutdown_windows.append(first_window)
+        else:
+            shutdown_windows[-1] += first_window
+        self.shutdown_windows = itertools.cycle([60 * n
+                                                 for n in shutdown_windows])
+        self._open_start = self._next_opening
+        self._open_for = next(self.shutdown_windows)
+
+    def _advance_opening(self):
+        while self._next_opening < time.time():
+            self._open_start = self._next_opening
+            self._next_opening += self._open_for + next(self.shutdown_windows)
+            self._open_for = next(self.shutdown_windows)
+
+    def next_opening(self):
+        self._advance_opening()
+        return self._next_opening
+
+    def window_open(self):
+        self._advance_opening()
+        return 0 < (time.time() - self._open_start) < self._open_for
+
+
+class ComputeNodeMonitorActor(config.actor_class):
+    """Actor to manage a running compute node.
+
+    This actor gets updates about a compute node's cloud and Arvados records.
+    It uses this information to notify subscribers when the node is eligible
+    for shutdown.
+    """
+    def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
+                 timer_actor, update_actor, arvados_node=None,
+                 poll_stale_after=600, node_stale_after=3600):
+        super(ComputeNodeMonitorActor, self).__init__()
+        self._later = self.actor_ref.proxy()
+        self._logger = logging.getLogger('arvnodeman.computenode')
+        self._last_log = None
+        self._shutdowns = shutdown_timer
+        self._timer = timer_actor
+        self._update = update_actor
+        self.cloud_node = cloud_node
+        self.cloud_node_start_time = cloud_node_start_time
+        self.poll_stale_after = poll_stale_after
+        self.node_stale_after = node_stale_after
+        self.subscribers = set()
+        self.arvados_node = None
+        self._later.update_arvados_node(arvados_node)
+        self.last_shutdown_opening = None
+        self._later.consider_shutdown()
+
+    def subscribe(self, subscriber):
+        self.subscribers.add(subscriber)
+
+    def _debug(self, msg, *args):
+        if msg == self._last_log:
+            return
+        self._last_log = msg
+        self._logger.debug(msg, *args)
+
+    def _shutdown_eligible(self):
+        if self.arvados_node is None:
+            return timestamp_fresh(self.cloud_node_start_time,
+                                   self.node_stale_after)
+        else:
+            return (timestamp_fresh(arvados_node_mtime(self.arvados_node),
+                                    self.poll_stale_after) and
+                    (self.arvados_node['info'].get('slurm_state') == 'idle'))
+
+    def consider_shutdown(self):
+        next_opening = self._shutdowns.next_opening()
+        if self._shutdowns.window_open():
+            if self._shutdown_eligible():
+                self._debug("Node %s suggesting shutdown.", self.cloud_node.id)
+                _notify_subscribers(self._later, self.subscribers)
+            else:
+                self._debug("Node %s shutdown window open but node busy.",
+                            self.cloud_node.id)
+        else:
+            self._debug("Node %s shutdown window closed.  Next at %s.",
+                        self.cloud_node.id, time.ctime(next_opening))
+        if self.last_shutdown_opening != next_opening:
+            self._timer.schedule(next_opening, self._later.consider_shutdown)
+            self.last_shutdown_opening = next_opening
+
+    def offer_arvados_pair(self, arvados_node):
+        if self.arvados_node is not None:
+            return None
+        elif arvados_node['ip_address'] in self.cloud_node.private_ips:
+            self._later.update_arvados_node(arvados_node)
+            return self.cloud_node.id
+        else:
+            return None
+
+    def update_cloud_node(self, cloud_node):
+        if cloud_node is not None:
+            self.cloud_node = cloud_node
+            self._later.consider_shutdown()
+
+    def update_arvados_node(self, arvados_node):
+        if arvados_node is not None:
+            self.arvados_node = arvados_node
+            new_hostname = arvados_node_fqdn(self.arvados_node)
+            if new_hostname != self.cloud_node.name:
+                self._update.sync_node(self.cloud_node, self.arvados_node)
+            self._later.consider_shutdown()
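A quick feel for ShutdownTimer's alternating windows (a sketch; the window values are illustrative):

from __future__ import print_function
import time

# With windows [54, 5, 1]: closed for 54 minutes after boot, open for 5,
# closed for 1 + 54 (the odd leftover merges back into the closed state),
# and so on around the cycle.
timer = ShutdownTimer(time.time(), [54, 5, 1])
if timer.window_open():
    print("node may be shut down now")
else:
    print("next window opens at", time.ctime(timer.next_opening()))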
diff --git a/services/nodemanager/arvnodeman/computenode/dummy.py b/services/nodemanager/arvnodeman/computenode/dummy.py
new file mode 100644 (file)
index 0000000..6c39fea
--- /dev/null
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+
+import libcloud.compute.providers as cloud_provider
+import libcloud.compute.types as cloud_types
+
+from . import BaseComputeNodeDriver, arvados_node_fqdn
+
+class ComputeNodeDriver(BaseComputeNodeDriver):
+    """Compute node driver wrapper for libcloud's dummy driver.
+
+    This class provides the glue necessary to run the node manager with a
+    dummy cloud.  It's useful for testing.
+    """
+    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.DUMMY)
+    DEFAULT_REAL = DEFAULT_DRIVER('ComputeNodeDriver')
+    DUMMY_START_TIME = time.time()
+
+    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
+                 driver_class=DEFAULT_DRIVER):
+        super(ComputeNodeDriver, self).__init__(
+            auth_kwargs, list_kwargs, create_kwargs, driver_class)
+        if driver_class is self.DEFAULT_DRIVER:
+            self.real = self.DEFAULT_REAL
+
+    def _ensure_private_ip(self, node):
+        if not node.private_ips:
+            node.private_ips = ['10.10.0.{}'.format(node.id)]
+
+    def arvados_create_kwargs(self, arvados_node):
+        return {}
+
+    def list_nodes(self):
+        nodelist = super(ComputeNodeDriver, self).list_nodes()
+        for node in nodelist:
+            self._ensure_private_ip(node)
+        return nodelist
+
+    def create_node(self, size, arvados_node):
+        node = super(ComputeNodeDriver, self).create_node(size, arvados_node)
+        self._ensure_private_ip(node)
+        return node
+
+    def sync_node(self, cloud_node, arvados_node):
+        cloud_node.name = arvados_node_fqdn(arvados_node)
+
+    @classmethod
+    def node_start_time(cls, node):
+        return cls.DUMMY_START_TIME
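Exercising the dummy driver in a test might look like this (a sketch; 'creds' is assumed to be the single credential argument libcloud's dummy driver accepts, and note that all default-driver instances share DEFAULT_REAL):

from __future__ import print_function

driver = ComputeNodeDriver(auth_kwargs={'creds': 'dummy-creds'},
                           list_kwargs={}, create_kwargs={})
for node in driver.list_nodes():
    # _ensure_private_ip guarantees a 10.10.0.<id> address on every node.
    print(node.id, node.private_ips)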
diff --git a/services/nodemanager/arvnodeman/computenode/ec2.py b/services/nodemanager/arvnodeman/computenode/ec2.py
new file mode 100644 (file)
index 0000000..359bed4
--- /dev/null
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+
+import libcloud.compute.base as cloud_base
+import libcloud.compute.providers as cloud_provider
+import libcloud.compute.types as cloud_types
+from libcloud.compute.drivers import ec2 as cloud_ec2
+
+from . import BaseComputeNodeDriver, arvados_node_fqdn
+
+### Monkeypatch libcloud to support AWS' new SecurityGroup API.
+# These classes can be removed when libcloud supports specifying
+# security groups with the SecurityGroupId parameter.
+class ANMEC2Connection(cloud_ec2.EC2Connection):
+    def request(self, *args, **kwargs):
+        params = kwargs.get('params')
+        if (params is not None) and (params.get('Action') == 'RunInstances'):
+            for key in params.keys():
+                if key.startswith('SecurityGroup.'):
+                    new_key = key.replace('Group.', 'GroupId.', 1)
+                    params[new_key] = params.pop(key).id
+            kwargs['params'] = params
+        return super(ANMEC2Connection, self).request(*args, **kwargs)
+
+
+class ANMEC2NodeDriver(cloud_ec2.EC2NodeDriver):
+    connectionCls = ANMEC2Connection
+
+
+class ComputeNodeDriver(BaseComputeNodeDriver):
+    """Compute node driver wrapper for EC2.
+
+    This translates cloud driver requests to EC2's specific parameters.
+    """
+    DEFAULT_DRIVER = ANMEC2NodeDriver
+### End monkeypatch
+    SEARCH_CACHE = {}
+
+    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
+                 driver_class=DEFAULT_DRIVER):
+        # We need full lists of keys up front because these loops modify
+        # dictionaries in-place.
+        for key in list_kwargs.keys():
+            list_kwargs[key.replace('_', ':')] = list_kwargs.pop(key)
+        self.tags = {key[4:]: value
+                     for key, value in list_kwargs.iteritems()
+                     if key.startswith('tag:')}
+        super(ComputeNodeDriver, self).__init__(
+            auth_kwargs, {'ex_filters': list_kwargs}, create_kwargs,
+            driver_class)
+        for key in self.create_kwargs.keys():
+            init_method = getattr(self, '_init_' + key, None)
+            if init_method is not None:
+                new_pair = init_method(self.create_kwargs.pop(key))
+                if new_pair is not None:
+                    self.create_kwargs[new_pair[0]] = new_pair[1]
+
+    def _init_image_id(self, image_id):
+        return 'image', self.search_for(image_id, 'list_images')
+
+    def _init_ping_host(self, ping_host):
+        self.ping_host = ping_host
+
+    def _init_security_groups(self, group_names):
+        return 'ex_security_groups', [
+            self.search_for(gname.strip(), 'ex_get_security_groups')
+            for gname in group_names.split(',')]
+
+    def _init_subnet_id(self, subnet_id):
+        return 'ex_subnet', self.search_for(subnet_id, 'ex_list_subnets')
+
+    def _init_ssh_key(self, filename):
+        with open(filename) as ssh_file:
+            key = cloud_base.NodeAuthSSHKey(ssh_file.read())
+        return 'auth', key
+
+    def arvados_create_kwargs(self, arvados_node):
+        result = {'ex_metadata': self.tags.copy(),
+                  'name': arvados_node_fqdn(arvados_node)}
+        ping_secret = arvados_node['info'].get('ping_secret')
+        if ping_secret is not None:
+            ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
+                        format(self.ping_host, arvados_node['uuid'],
+                               ping_secret))
+            result['ex_userdata'] = ping_url
+        return result
+
+    def sync_node(self, cloud_node, arvados_node):
+        metadata = self.arvados_create_kwargs(arvados_node)
+        tags = metadata['ex_metadata']
+        tags['Name'] = metadata['name']
+        self.real.ex_create_tags(cloud_node, tags)
+
+    @classmethod
+    def node_start_time(cls, node):
+        time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
+        return time.mktime(time.strptime(
+                time_str, '%Y-%m-%dT%H:%M:%S%Z')) - time.timezone
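The key translation in __init__ is easiest to see with concrete values (all illustrative; auth_kwargs are passed straight through to libcloud's EC2 driver):

driver = ComputeNodeDriver(
    auth_kwargs={'key': 'AKIA...', 'secret': '...', 'region': 'us-east-1'},
    # '_' becomes ':' -> EC2 filter tag:arvados-class=dynamic-compute,
    # and the 'tag_*' entries double as tags applied to new nodes.
    list_kwargs={'tag_arvados-class': 'dynamic-compute'},
    create_kwargs={
        'image_id': 'ami-12345678',       # -> 'image' via list_images
        'security_groups': 'compute-sg',  # -> 'ex_security_groups'
        'subnet_id': 'subnet-1234abcd',   # -> 'ex_subnet'
        'ping_host': 'api.example.org',   # kept for building ping URLs
    })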
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
new file mode 100644 (file)
index 0000000..1699584
--- /dev/null
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import ConfigParser
+import importlib
+import logging
+import ssl
+
+import arvados
+import httplib2
+import libcloud.common.types as cloud_types
+import pykka
+from apiclient import errors as apierror
+
+# IOError is the base class for socket.error and friends.
+# It seems like it hits the sweet spot for operations we want to retry:
+# it's low-level, but unlikely to catch code bugs.
+NETWORK_ERRORS = (IOError, ssl.SSLError)
+ARVADOS_ERRORS = NETWORK_ERRORS + (apierror.Error,)
+CLOUD_ERRORS = NETWORK_ERRORS + (cloud_types.LibcloudError,)
+
+actor_class = pykka.ThreadingActor
+
+class NodeManagerConfig(ConfigParser.SafeConfigParser):
+    """Node Manager Configuration class.
+
+    This is a standard Python ConfigParser, with additional helper methods to
+    create objects instantiated with configuration information.
+    """
+
+    LOGGING_NONLEVELS = frozenset(['file'])
+
+    def __init__(self, *args, **kwargs):
+        # Can't use super() because SafeConfigParser is an old-style class.
+        ConfigParser.SafeConfigParser.__init__(self, *args, **kwargs)
+        for sec_name, settings in {
+            'Arvados': {'insecure': 'no',
+                        'timeout': '15'},
+            'Daemon': {'max_nodes': '1',
+                       'poll_time': '60',
+                       'max_poll_time': '300',
+                       'poll_stale_after': '600',
+                       'node_stale_after': str(60 * 60 * 2)},
+            'Logging': {'file': '/dev/stderr',
+                        'level': 'WARNING'},
+        }.iteritems():
+            if not self.has_section(sec_name):
+                self.add_section(sec_name)
+            for opt_name, value in settings.iteritems():
+                if not self.has_option(sec_name, opt_name):
+                    self.set(sec_name, opt_name, value)
+
+    def get_section(self, section, transformer=None):
+        result = self._dict()
+        for key, value in self.items(section):
+            if transformer is not None:
+                try:
+                    value = transformer(value)
+                except (TypeError, ValueError):
+                    pass
+            result[key] = value
+        return result
+
+    def log_levels(self):
+        return {key: getattr(logging, self.get('Logging', key).upper())
+                for key in self.options('Logging')
+                if key not in self.LOGGING_NONLEVELS}
+
+    def new_arvados_client(self):
+        if self.has_option('Daemon', 'certs_file'):
+            certs_file = self.get('Daemon', 'certs_file')
+        else:
+            certs_file = None
+        insecure = self.getboolean('Arvados', 'insecure')
+        http = httplib2.Http(timeout=self.getint('Arvados', 'timeout'),
+                             ca_certs=certs_file,
+                             disable_ssl_certificate_validation=insecure)
+        return arvados.api('v1',
+                           cache=False,  # Don't reuse an existing client.
+                           host=self.get('Arvados', 'host'),
+                           token=self.get('Arvados', 'token'),
+                           insecure=insecure,
+                           http=http)
+
+    def new_cloud_client(self):
+        module = importlib.import_module('arvnodeman.computenode.' +
+                                         self.get('Cloud', 'provider'))
+        auth_kwargs = self.get_section('Cloud Credentials')
+        if 'timeout' in auth_kwargs:
+            auth_kwargs['timeout'] = int(auth_kwargs['timeout'])
+        return module.ComputeNodeDriver(auth_kwargs,
+                                        self.get_section('Cloud List'),
+                                        self.get_section('Cloud Create'))
+
+    def node_sizes(self, all_sizes):
+        size_kwargs = {}
+        for sec_name in self.sections():
+            sec_words = sec_name.split(None, 2)
+            if sec_words[0] != 'Size':
+                continue
+            size_kwargs[sec_words[1]] = self.get_section(sec_name, int)
+        return [(size, size_kwargs[size.id]) for size in all_sizes
+                if size.id in size_kwargs]
+
+    def shutdown_windows(self):
+        return [int(n)
+                for n in self.get('Cloud', 'shutdown_windows').split(',')]
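Put together, a minimal configuration file for this class might look like the following sketch (values illustrative; doc/ec2.example.cfg is the full reference):

[Arvados]
host = api.example.org
token = example-token-here
insecure = no

[Daemon]
max_nodes = 8

[Logging]
file = /var/log/arvados/node-manager.log
level = INFO

[Cloud]
provider = ec2
shutdown_windows = 54, 5, 1

[Cloud Credentials]
key = AKIA...
secret = ...
region = us-east-1

[Cloud List]
tag_arvados-class = dynamic-compute

[Cloud Create]
image_id = ami-12345678

[Size m3.large]
cores = 2

Each "Size <id>" section name must match a size id from the provider's list_sizes(), and 'cores' is required by the jobqueue module's CloudSizeWrapper.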
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
new file mode 100644 (file)
index 0000000..6ea3cdf
--- /dev/null
@@ -0,0 +1,307 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import functools
+import logging
+import time
+
+import pykka
+
+from . import computenode as cnode
+from .config import actor_class
+
+class _ComputeNodeRecord(object):
+    def __init__(self, actor=None, cloud_node=None, arvados_node=None,
+                 assignment_time=float('-inf')):
+        self.actor = actor
+        self.cloud_node = cloud_node
+        self.arvados_node = arvados_node
+        self.assignment_time = assignment_time
+
+
+class _BaseNodeTracker(object):
+    def __init__(self):
+        self.nodes = {}
+        self.orphans = {}
+
+    def __getitem__(self, key):
+        return self.nodes[key]
+
+    def __len__(self):
+        return len(self.nodes)
+
+    def get(self, key, default=None):
+        return self.nodes.get(key, default)
+
+    def record_key(self, record):
+        return self.item_key(getattr(record, self.RECORD_ATTR))
+
+    def add(self, record):
+        self.nodes[self.record_key(record)] = record
+
+    def update_record(self, key, item):
+        setattr(self.nodes[key], self.RECORD_ATTR, item)
+
+    def update_from(self, response):
+        unseen = set(self.nodes.iterkeys())
+        for item in response:
+            key = self.item_key(item)
+            if key in unseen:
+                unseen.remove(key)
+                self.update_record(key, item)
+            else:
+                yield key, item
+        self.orphans = {key: self.nodes.pop(key) for key in unseen}
+
+    def unpaired(self):
+        return (record for record in self.nodes.itervalues()
+                if getattr(record, self.PAIR_ATTR) is None)
+
+
+class _CloudNodeTracker(_BaseNodeTracker):
+    RECORD_ATTR = 'cloud_node'
+    PAIR_ATTR = 'arvados_node'
+    item_key = staticmethod(lambda cloud_node: cloud_node.id)
+
+
+class _ArvadosNodeTracker(_BaseNodeTracker):
+    RECORD_ATTR = 'arvados_node'
+    PAIR_ATTR = 'cloud_node'
+    item_key = staticmethod(lambda arvados_node: arvados_node['uuid'])
+
+    def find_stale_node(self, stale_time):
+        for record in self.nodes.itervalues():
+            node = record.arvados_node
+            if (not cnode.timestamp_fresh(cnode.arvados_node_mtime(node),
+                                          stale_time) and
+                  not cnode.timestamp_fresh(record.assignment_time,
+                                            stale_time)):
+                return node
+        return None
+
+
+class NodeManagerDaemonActor(actor_class):
+    """Node Manager daemon.
+
+    This actor subscribes to all information polls about cloud nodes,
+    Arvados nodes, and the job queue.  It creates a ComputeNodeMonitorActor
+    for every cloud node, subscribing them to poll updates
+    appropriately.  It creates and destroys cloud nodes based on job queue
+    demand, and stops the corresponding ComputeNode actors when their work
+    is done.
+    """
+    def __init__(self, server_wishlist_actor, arvados_nodes_actor,
+                 cloud_nodes_actor, cloud_update_actor, timer_actor,
+                 arvados_factory, cloud_factory,
+                 shutdown_windows, max_nodes,
+                 poll_stale_after=600, node_stale_after=7200,
+                 node_setup_class=cnode.ComputeNodeSetupActor,
+                 node_shutdown_class=cnode.ComputeNodeShutdownActor,
+                 node_actor_class=cnode.ComputeNodeMonitorActor):
+        super(NodeManagerDaemonActor, self).__init__()
+        self._node_setup = node_setup_class
+        self._node_shutdown = node_shutdown_class
+        self._node_actor = node_actor_class
+        self._cloud_updater = cloud_update_actor
+        self._timer = timer_actor
+        self._new_arvados = arvados_factory
+        self._new_cloud = cloud_factory
+        self._cloud_driver = self._new_cloud()
+        self._logger = logging.getLogger('arvnodeman.daemon')
+        self._later = self.actor_ref.proxy()
+        self.shutdown_windows = shutdown_windows
+        self.max_nodes = max_nodes
+        self.poll_stale_after = poll_stale_after
+        self.node_stale_after = node_stale_after
+        self.last_polls = {}
+        for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
+            poll_actor = locals()[poll_name + '_actor']
+            poll_actor.subscribe(getattr(self._later, 'update_' + poll_name))
+            setattr(self, '_{}_actor'.format(poll_name), poll_actor)
+            self.last_polls[poll_name] = -self.poll_stale_after
+        self.cloud_nodes = _CloudNodeTracker()
+        self.arvados_nodes = _ArvadosNodeTracker()
+        self.booting = {}       # Actor IDs to ComputeNodeSetupActors
+        self.booted = {}        # Cloud node IDs to _ComputeNodeRecords
+        self.shutdowns = {}     # Cloud node IDs to ComputeNodeShutdownActors
+        self._logger.debug("Daemon initialized")
+
+    def _update_poll_time(self, poll_key):
+        self.last_polls[poll_key] = time.time()
+
+    def _pair_nodes(self, node_record, arvados_node):
+        self._logger.info("Cloud node %s has associated with Arvados node %s",
+                          node_record.cloud_node.id, arvados_node['uuid'])
+        self._arvados_nodes_actor.subscribe_to(
+            arvados_node['uuid'], node_record.actor.update_arvados_node)
+        node_record.arvados_node = arvados_node
+        self.arvados_nodes.add(node_record)
+
+    def _new_node(self, cloud_node):
+        start_time = self._cloud_driver.node_start_time(cloud_node)
+        shutdown_timer = cnode.ShutdownTimer(start_time,
+                                             self.shutdown_windows)
+        actor = self._node_actor.start(
+            cloud_node=cloud_node,
+            cloud_node_start_time=start_time,
+            shutdown_timer=shutdown_timer,
+            update_actor=self._cloud_updater,
+            timer_actor=self._timer,
+            arvados_node=None,
+            poll_stale_after=self.poll_stale_after,
+            node_stale_after=self.node_stale_after).proxy()
+        actor.subscribe(self._later.node_can_shutdown)
+        self._cloud_nodes_actor.subscribe_to(cloud_node.id,
+                                             actor.update_cloud_node)
+        record = _ComputeNodeRecord(actor, cloud_node)
+        return record
+
+    def update_cloud_nodes(self, nodelist):
+        self._update_poll_time('cloud_nodes')
+        for key, node in self.cloud_nodes.update_from(nodelist):
+            self._logger.info("Registering new cloud node %s", key)
+            if key in self.booted:
+                record = self.booted.pop(key)
+            else:
+                record = self._new_node(node)
+            self.cloud_nodes.add(record)
+            for arv_rec in self.arvados_nodes.unpaired():
+                if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
+                    self._pair_nodes(record, arv_rec.arvados_node)
+                    break
+        for key, record in self.cloud_nodes.orphans.iteritems():
+            record.actor.stop()
+            self.shutdowns.pop(key, None)
+
+    def update_arvados_nodes(self, nodelist):
+        self._update_poll_time('arvados_nodes')
+        for key, node in self.arvados_nodes.update_from(nodelist):
+            self._logger.info("Registering new Arvados node %s", key)
+            record = _ComputeNodeRecord(arvados_node=node)
+            self.arvados_nodes.add(record)
+        for arv_rec in self.arvados_nodes.unpaired():
+            arv_node = arv_rec.arvados_node
+            for cloud_rec in self.cloud_nodes.unpaired():
+                if cloud_rec.actor.offer_arvados_pair(arv_node).get():
+                    self._pair_nodes(cloud_rec, arv_node)
+                    break
+
+    def _node_count(self):
+        up = sum(len(nodelist) for nodelist in
+                 [self.cloud_nodes, self.booted, self.booting])
+        return up - len(self.shutdowns)
+
+    def _nodes_wanted(self):
+        return len(self.last_wishlist) - self._node_count()
+
+    def _nodes_excess(self):
+        return -self._nodes_wanted()
+
+    def update_server_wishlist(self, wishlist):
+        self._update_poll_time('server_wishlist')
+        self.last_wishlist = wishlist[:self.max_nodes]
+        nodes_wanted = self._nodes_wanted()
+        if nodes_wanted > 0:
+            self._later.start_node()
+        elif (nodes_wanted < 0) and self.booting:
+            self._later.stop_booting_node()
+
+    def _check_poll_freshness(orig_func):
+        """Decorator to inhibit a method when poll information is stale.
+
+        This decorator checks the timestamps of all the poll information the
+        daemon has received.  The decorated method is only called if none
+        of the timestamps are considered stale.
+        """
+        @functools.wraps(orig_func)
+        def wrapper(self, *args, **kwargs):
+            now = time.time()
+            if all(now - t < self.poll_stale_after
+                   for t in self.last_polls.itervalues()):
+                return orig_func(self, *args, **kwargs)
+            else:
+                return None
+        return wrapper
+
+    @_check_poll_freshness
+    def start_node(self):
+        nodes_wanted = self._nodes_wanted()
+        if nodes_wanted < 1:
+            return None
+        arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
+        cloud_size = self.last_wishlist[nodes_wanted - 1]
+        self._logger.info("Want %s more nodes.  Booting a %s node.",
+                          nodes_wanted, cloud_size.name)
+        new_setup = self._node_setup.start(
+            timer_actor=self._timer,
+            arvados_client=self._new_arvados(),
+            arvados_node=arvados_node,
+            cloud_client=self._new_cloud(),
+            cloud_size=cloud_size).proxy()
+        self.booting[new_setup.actor_ref.actor_urn] = new_setup
+        if arvados_node is not None:
+            self.arvados_nodes[arvados_node['uuid']].assignment_time = (
+                time.time())
+        new_setup.subscribe(self._later.node_up)
+        if nodes_wanted > 1:
+            self._later.start_node()
+
+    def _actor_nodes(self, node_actor):
+        return pykka.get_all([node_actor.cloud_node, node_actor.arvados_node])
+
+    def node_up(self, setup_proxy):
+        cloud_node, arvados_node = self._actor_nodes(setup_proxy)
+        del self.booting[setup_proxy.actor_ref.actor_urn]
+        setup_proxy.stop()
+        record = self.cloud_nodes.get(cloud_node.id)
+        if record is None:
+            record = self._new_node(cloud_node)
+            self.booted[cloud_node.id] = record
+        self._pair_nodes(record, arvados_node)
+
+    @_check_poll_freshness
+    def stop_booting_node(self):
+        nodes_excess = self._nodes_excess()
+        if (nodes_excess < 1) or not self.booting:
+            return None
+        for key, node in self.booting.iteritems():
+            node.stop_if_no_cloud_node().get()
+            if not node.actor_ref.is_alive():
+                del self.booting[key]
+                if nodes_excess > 1:
+                    self._later.stop_booting_node()
+                break
+
+    @_check_poll_freshness
+    def node_can_shutdown(self, node_actor):
+        if self._nodes_excess() < 1:
+            return None
+        cloud_node, arvados_node = self._actor_nodes(node_actor)
+        if cloud_node.id in self.shutdowns:
+            return None
+        shutdown = self._node_shutdown.start(timer_actor=self._timer,
+                                             cloud_client=self._new_cloud(),
+                                             cloud_node=cloud_node).proxy()
+        self.shutdowns[cloud_node.id] = shutdown
+        shutdown.subscribe(self._later.node_finished_shutdown)
+
+    def node_finished_shutdown(self, shutdown_actor):
+        cloud_node_id = shutdown_actor.cloud_node.get().id
+        shutdown_actor.stop()
+        if cloud_node_id in self.booted:
+            self.booted.pop(cloud_node_id).actor.stop()
+            del self.shutdowns[cloud_node_id]
+
+    def shutdown(self):
+        self._logger.info("Shutting down after signal.")
+        self.poll_stale_after = -1  # Inhibit starting/stopping nodes
+        for bootnode in self.booting.itervalues():
+            bootnode.stop_if_no_cloud_node()
+        self._later.await_shutdown()
+
+    def await_shutdown(self):
+        if any(node.actor_ref.is_alive() for node in self.booting.itervalues()):
+            self._timer.schedule(time.time() + 1, self._later.await_shutdown)
+        else:
+            self.stop()
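The constructor arguments spell out the daemon's wiring. A rough assembly sketch (the packaged launcher does the real version; the three poller proxies here stand in for what launch_pollers builds):

timer = TimedCallBackActor.start().proxy()
cloud_updater = ComputeNodeUpdateActor.start(config.new_cloud_client).proxy()
node_daemon = NodeManagerDaemonActor.start(
    job_queue_poller, arvados_node_poller, cloud_node_poller,
    cloud_updater, timer,
    config.new_arvados_client, config.new_cloud_client,
    config.shutdown_windows(),
    config.getint('Daemon', 'max_nodes')).proxy()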
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
new file mode 100644 (file)
index 0000000..0eb5b79
--- /dev/null
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import logging
+
+from . import clientactor
+from .config import ARVADOS_ERRORS
+
+class ServerCalculator(object):
+    """Generate cloud server wishlists from an Arvados job queue.
+
+    Instantiate this class with a list of cloud node sizes you're willing to
+    use, plus keyword overrides from the configuration.  Then you can pass
+    job queues to servers_for_queue.  It will return a list of node sizes
+    that would best satisfy the jobs, choosing the cheapest size that
+    satisfies each job, and ignoring jobs that can't be satisfied.
+    """
+
+    class CloudSizeWrapper(object):
+        def __init__(self, real_size, **kwargs):
+            self.real = real_size
+            for name in ['id', 'name', 'ram', 'disk', 'bandwidth', 'price',
+                         'extra']:
+                setattr(self, name, getattr(self.real, name))
+            self.cores = kwargs.pop('cores')
+            self.scratch = self.disk
+            for name, override in kwargs.iteritems():
+                if not hasattr(self, name):
+                    raise ValueError("unrecognized size field '%s'" % (name,))
+                setattr(self, name, override)
+
+        def meets_constraints(self, **kwargs):
+            for name, want_value in kwargs.iteritems():
+                have_value = getattr(self, name)
+                if (have_value != 0) and (have_value < want_value):
+                    return False
+            return True
+
+
+    def __init__(self, server_list, max_nodes=None):
+        self.cloud_sizes = [self.CloudSizeWrapper(s, **kws)
+                            for s, kws in server_list]
+        self.cloud_sizes.sort(key=lambda s: s.price)
+        self.max_nodes = max_nodes or float('inf')
+        self.logger = logging.getLogger('arvnodeman.jobqueue')
+        self.logged_jobs = set()
+
+    @staticmethod
+    def coerce_int(x, fallback):
+        try:
+            return int(x)
+        except (TypeError, ValueError):
+            return fallback
+
+    def cloud_size_for_constraints(self, constraints):
+        want_value = lambda key: self.coerce_int(constraints.get(key), 0)
+        wants = {'cores': want_value('min_cores_per_node'),
+                 'ram': want_value('min_ram_mb_per_node'),
+                 'scratch': want_value('min_scratch_mb_per_node')}
+        for size in self.cloud_sizes:
+            if size.meets_constraints(**wants):
+                return size
+        return None
+
+    def servers_for_queue(self, queue):
+        servers = []
+        seen_jobs = set()
+        for job in queue:
+            seen_jobs.add(job['uuid'])
+            constraints = job['runtime_constraints']
+            want_count = self.coerce_int(constraints.get('min_nodes'), 1)
+            cloud_size = self.cloud_size_for_constraints(constraints)
+            if cloud_size is None:
+                if job['uuid'] not in self.logged_jobs:
+                    self.logged_jobs.add(job['uuid'])
+                    self.logger.debug("job %s not satisfiable", job['uuid'])
+            elif (want_count < self.max_nodes):
+                servers.extend([cloud_size.real] * max(1, want_count))
+        self.logged_jobs.intersection_update(seen_jobs)
+        return servers
+
+
+class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
+    """Actor to generate server wishlists from the job queue.
+
+    This actor regularly polls Arvados' job queue, and uses the provided
+    ServerCalculator to turn that into a list of requested node sizes.  That
+    list is sent to subscribers on every poll.
+    """
+
+    CLIENT_ERRORS = ARVADOS_ERRORS
+    LOGGER_NAME = 'arvnodeman.jobqueue'
+
+    def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
+        super(JobQueueMonitorActor, self).__init__(
+            client, timer_actor, *args, **kwargs)
+        self._calculator = server_calc
+
+    def _send_request(self):
+        return self._client.jobs().queue().execute()['items']
+
+    def _got_response(self, queue):
+        server_list = self._calculator.servers_for_queue(queue)
+        self._logger.debug("Sending server wishlist: %s",
+                           ', '.join(s.name for s in server_list) or "(empty)")
+        return super(JobQueueMonitorActor, self)._got_response(server_list)
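A minimal sketch of how ServerCalculator maps a job queue onto a node-size wishlist. FakeSize is a made-up stand-in for a libcloud NodeSize, and the queue entry mimics an Arvados job record; neither comes from this patch.

    from arvnodeman.jobqueue import ServerCalculator

    class FakeSize(object):
        """Stand-in for a libcloud NodeSize (attribute names assumed)."""
        def __init__(self, name, price):
            self.id = name
            self.name = name
            self.price = price
            self.ram = 0        # 0 reads as "unconstrained" in meets_constraints
            self.disk = 0
            self.bandwidth = 0
            self.extra = {}

    calc = ServerCalculator([(FakeSize('small', 1), {'cores': 1}),
                             (FakeSize('big', 4), {'cores': 8})])
    queue = [{'uuid': 'zzzzz-8i9sb-000000000000001',
              'runtime_constraints': {'min_nodes': 2,
                                      'min_cores_per_node': 4}}]
    # The job needs 4 cores per node, so the cheapest satisfying size is
    # 'big', repeated once per requested node:
    print([size.name for size in calc.servers_for_queue(queue)])
    # -> ['big', 'big']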
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
new file mode 100644 (file)
index 0000000..87f2dda
--- /dev/null
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import argparse
+import logging
+import signal
+import sys
+import time
+
+import daemon
+import pykka
+
+from . import config as nmconfig
+from .computenode import \
+    ComputeNodeSetupActor, ComputeNodeShutdownActor, ComputeNodeUpdateActor, \
+    ShutdownTimer
+from .daemon import NodeManagerDaemonActor
+from .jobqueue import JobQueueMonitorActor, ServerCalculator
+from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
+from .timedcallback import TimedCallBackActor
+
+node_daemon = None
+
+def abort(msg, code=1):
+    print("arvados-node-manager: " + msg)
+    sys.exit(code)
+
+def parse_cli(args):
+    parser = argparse.ArgumentParser(
+        prog='arvados-node-manager',
+        description="Dynamically allocate Arvados cloud compute nodes")
+    parser.add_argument(
+        '--foreground', action='store_true', default=False,
+        help="Run in the foreground.  Don't daemonize.")
+    parser.add_argument(
+        '--config', help="Path to configuration file")
+    return parser.parse_args(args)
+
+def load_config(path):
+    if not path:
+        abort("No --config file specified", 2)
+    config = nmconfig.NodeManagerConfig()
+    try:
+        with open(path) as config_file:
+            config.readfp(config_file)
+    except (IOError, OSError) as error:
+        abort("Error reading configuration file {}: {}".format(path, error))
+    return config
+
+def setup_logging(path, level, **sublevels):
+    handler = logging.FileHandler(path)
+    handler.setFormatter(logging.Formatter(
+            '%(asctime)s %(name)s[%(process)d] %(levelname)s: %(message)s',
+            '%Y-%m-%d %H:%M:%S'))
+    root_logger = logging.getLogger()
+    root_logger.addHandler(handler)
+    root_logger.setLevel(level)
+    for logger_name, sublevel in sublevels.iteritems():
+        sublogger = logging.getLogger(logger_name)
+        sublogger.setLevel(sublevel)
+
+def launch_pollers(config):
+    cloud_client = config.new_cloud_client()
+    arvados_client = config.new_arvados_client()
+    cloud_size_list = config.node_sizes(cloud_client.list_sizes())
+    if not cloud_size_list:
+        abort("No valid node sizes configured")
+
+    server_calculator = ServerCalculator(
+        cloud_size_list, config.getint('Daemon', 'max_nodes'))
+    poll_time = config.getint('Daemon', 'poll_time')
+    max_poll_time = config.getint('Daemon', 'max_poll_time')
+
+    timer = TimedCallBackActor.start(poll_time / 10.0).proxy()
+    cloud_node_poller = CloudNodeListMonitorActor.start(
+        cloud_client, timer, poll_time, max_poll_time).proxy()
+    arvados_node_poller = ArvadosNodeListMonitorActor.start(
+        arvados_client, timer, poll_time, max_poll_time).proxy()
+    job_queue_poller = JobQueueMonitorActor.start(
+        config.new_arvados_client(), timer, server_calculator,
+        poll_time, max_poll_time).proxy()
+    return timer, cloud_node_poller, arvados_node_poller, job_queue_poller
+
+_caught_signals = {}
+def shutdown_signal(signal_code, frame):
+    current_count = _caught_signals.get(signal_code, 0)
+    _caught_signals[signal_code] = current_count + 1
+    if node_daemon is None:
+        pykka.ActorRegistry.stop_all()
+        sys.exit(-signal_code)
+    elif current_count == 0:
+        node_daemon.shutdown()
+    elif current_count == 1:
+        pykka.ActorRegistry.stop_all()
+    else:
+        sys.exit(-signal_code)
+
+def main(args=None):
+    global node_daemon
+    args = parse_cli(args)
+    config = load_config(args.config)
+
+    if not args.foreground:
+        daemon.DaemonContext().open()
+    for sigcode in [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]:
+        signal.signal(sigcode, shutdown_signal)
+
+    setup_logging(config.get('Logging', 'file'), **config.log_levels())
+    timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
+        launch_pollers(config)
+    cloud_node_updater = ComputeNodeUpdateActor.start(
+        config.new_cloud_client).proxy()
+    node_daemon = NodeManagerDaemonActor.start(
+        job_queue_poller, arvados_node_poller, cloud_node_poller,
+        cloud_node_updater, timer,
+        config.new_arvados_client, config.new_cloud_client,
+        config.shutdown_windows(), config.getint('Daemon', 'max_nodes'),
+        config.getint('Daemon', 'poll_stale_after'),
+        config.getint('Daemon', 'node_stale_after')).proxy()
+
+    signal.pause()
+    daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
+    while not daemon_stopped():
+        time.sleep(1)
+    pykka.ActorRegistry.stop_all()
+
+
+if __name__ == '__main__':
+    main()
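Note that the signal handler escalates: the first SIGINT/SIGQUIT/SIGTERM asks the daemon actor to shut down cleanly, a second stops all actors, and a third exits outright. A minimal usage sketch, assuming the example config path from this patch:

    from arvnodeman.launcher import main

    # Equivalent to: arvados-node-manager --foreground --config <path>
    main(['--foreground', '--config', 'doc/local.example.cfg'])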
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
new file mode 100644 (file)
index 0000000..7ddfb7c
--- /dev/null
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+from . import clientactor
+from . import config
+
+class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
+    """Actor to poll the Arvados node list.
+
+    This actor regularly polls the list of Arvados node records, and
+    sends it to subscribers.
+    """
+
+    CLIENT_ERRORS = config.ARVADOS_ERRORS
+    LOGGER_NAME = 'arvnodeman.arvados_nodes'
+
+    def _item_key(self, node):
+        return node['uuid']
+
+    def _send_request(self):
+        return self._client.nodes().list(limit=10000).execute()['items']
+
+
+class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
+    """Actor to poll the cloud node list.
+
+    This actor regularly polls the cloud to get a list of running compute
+    nodes, and sends it to subscribers.
+    """
+
+    CLIENT_ERRORS = config.CLOUD_ERRORS
+    LOGGER_NAME = 'arvnodeman.cloud_nodes'
+
+    def _item_key(self, node):
+        return node.id
+
+    def _send_request(self):
+        return self._client.list_nodes()
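A sketch of wiring one of these monitors by hand, reusing the constructor arguments launcher.py passes (client, timer proxy, poll times). api_client, timer_proxy, handle_node_list, and on_node_update are assumed to exist already.

    monitor = ArvadosNodeListMonitorActor.start(
        api_client, timer_proxy, 60, 300).proxy()
    # Plain subscribers get the full node list on every poll:
    monitor.subscribe(handle_node_list)
    # Keyed subscribers get just the record whose _item_key matches:
    monitor.subscribe_to('zzzzz-7ekkf-000000000000001', on_node_update)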
diff --git a/services/nodemanager/arvnodeman/timedcallback.py b/services/nodemanager/arvnodeman/timedcallback.py
new file mode 100644 (file)
index 0000000..615f798
--- /dev/null
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import heapq
+import time
+
+import pykka
+
+from .config import actor_class
+
+class TimedCallBackActor(actor_class):
+    """Send messages to other actors on a schedule.
+
+    Other actors can call the schedule() method to schedule delivery of a
+    message at a later time.  This actor runs the necessary event loop for
+    delivery.
+    """
+    def __init__(self, max_sleep=1):
+        super(TimedCallBackActor, self).__init__()
+        self._proxy = self.actor_ref.proxy()
+        self.messages = []
+        self.max_sleep = max_sleep
+
+    def schedule(self, delivery_time, receiver, *args, **kwargs):
+        if not self.messages:
+            self._proxy.deliver()
+        heapq.heappush(self.messages, (delivery_time, receiver, args, kwargs))
+
+    def deliver(self):
+        if not self.messages:
+            return None
+        til_next = self.messages[0][0] - time.time()
+        if til_next < 0:
+            t, receiver, args, kwargs = heapq.heappop(self.messages)
+            try:
+                receiver(*args, **kwargs)
+            except pykka.ActorDeadError:
+                pass
+        else:
+            time.sleep(min(til_next, self.max_sleep))
+        self._proxy.deliver()
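A usage sketch mirroring this actor's tests: schedule a callable for delivery roughly one minute out. pending_actor_proxy is an assumed stand-in for any pykka proxy.

    import time
    from arvnodeman.timedcallback import TimedCallBackActor

    timer = TimedCallBackActor.start().proxy()
    # Callbacks to actors that have since died raise ActorDeadError,
    # which deliver() drops, so stale messages disappear harmlessly.
    timer.schedule(time.time() + 60, pending_actor_proxy.poll)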
diff --git a/services/nodemanager/bin/arvados-node-manager b/services/nodemanager/bin/arvados-node-manager
new file mode 100644 (file)
index 0000000..3a91288
--- /dev/null
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+from arvnodeman.launcher import main
+main()
diff --git a/services/nodemanager/doc/ec2.example.cfg b/services/nodemanager/doc/ec2.example.cfg
new file mode 100644 (file)
index 0000000..a56e69e
--- /dev/null
@@ -0,0 +1,121 @@
+# EC2 configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Poll EC2 nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, leave it alone rather than shutting it down.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = ec2
+
+# It's usually most cost-effective to shut down compute nodes during narrow
+# windows of time.  For example, EC2 bills each node by the hour, so the best
+# time to shut down a node is right before a new hour of uptime starts.
+# Shutdown windows define these periods of time.  These are windows in
+# full minutes, separated by commas.  Counting from the time the node is
+# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
+# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
+# For example, "54, 5, 1" means the node may shut down from the 54th to the
+# 59th minute of each hour of uptime.
+# Specify at least two windows.  You can add as many as you need beyond that.
+shutdown_windows = 54, 5, 1
+
+[Cloud Credentials]
+key = KEY
+secret = SECRET_KEY
+region = us-east-1
+timeout = 60
+
+[Cloud List]
+# This section defines filters that find compute nodes.
+# Tags that you specify here will automatically be added to nodes you create.
+# Replace colons in Amazon filters with underscores
+# (e.g., write "tag:mytag" as "tag_mytag").
+instance-state-name = running
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+[Cloud Create]
+# New compute nodes will send pings to Arvados at this host.
+# You may specify a port, and use brackets to disambiguate IPv6 addresses.
+ping_host = hostname:port
+
+# Give the name of an SSH key on AWS...
+ex_keyname = string
+
+# ... or a file path for an SSH key that can log in to the compute node.
+# (One or the other, not both.)
+# ssh_key = path
+
+# The EC2 IDs of the image and subnet that compute nodes should use.
+image_id = idstring
+subnet_id = idstring
+
+# Comma-separated EC2 IDs for the security group(s) assigned to each
+# compute node.
+security_groups = idstring1, idstring2
+
+[Size t2.medium]
+# You can define any number of Size sections to list EC2 sizes you're
+# willing to use.  The Node Manager should boot the cheapest size(s) that
+# can run jobs in the queue (N.B.: defining more than one size has not been
+# tested yet).
+# Each size section MUST define the number of cores it has.  You may also
+# want to define the number of mebibytes of scratch space for Crunch jobs.
+# You can also override Amazon's provided data fields by setting the same
+# names here.
+cores = 2
+scratch = 100
\ No newline at end of file
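A worked sketch of the cycle the shutdown_windows comment describes, using the example value above. The cycle restarts closed after the last window, matching the ShutdownTimer tests elsewhere in this patch.

    windows = [54, 5, 1]   # minutes, from the example above
    minute = 0
    for cycle in range(2):
        may_shutdown = False
        for length in windows:
            state = "MAY shut down" if may_shutdown else "will NOT shut down"
            print("minutes {:3d}-{:3d}: {}".format(minute, minute + length, state))
            minute += length
            may_shutdown = not may_shutdown
    # -> open for shutdown during minutes 54-59 and 114-119 of uptime.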
diff --git a/services/nodemanager/doc/local.example.cfg b/services/nodemanager/doc/local.example.cfg
new file mode 100644 (file)
index 0000000..8a6e626
--- /dev/null
@@ -0,0 +1,41 @@
+# You can use this configuration to run a development Node Manager for
+# testing.  It uses libcloud's dummy driver and your own development API server.
+# When new cloud nodes are created, you'll need to simulate the ping that
+# they send to the Arvados API server.  The easiest way I've found to do that
+# is through the API server Rails console: load the Node object, set its
+# IP address to 10.10.0.N (where N is the cloud node's ID), and save.
+
+[Daemon]
+max_nodes = 8
+poll_time = 15
+max_poll_time = 60
+poll_stale_after = 600
+node_stale_after = 300
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+level = DEBUG
+pykka = DEBUG
+apiclient = WARNING
+
+[Arvados]
+host = localhost:3030
+# This is the token for the text fixture's admin user.
+token = 4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h
+insecure = yes
+timeout = 15
+
+[Cloud]
+provider = dummy
+shutdown_windows = 1, 1
+timeout = 15
+
+[Cloud Credentials]
+creds = dummycreds
+
+[Cloud List]
+[Cloud Create]
+
+[Size 2]
+cores = 4
+scratch = 1234
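As a hypothetical alternative to the Rails-console step described at the top of this file, the same ping simulation can be approximated through the Python SDK as the admin user. The node UUID and IP address below are stand-ins, and the SDK is assumed to accept connection parameters directly (otherwise set ARVADOS_API_HOST and friends in the environment).

    import arvados

    api = arvados.api('v1', host='localhost:3030', insecure=True,
                      token='4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h')
    api.nodes().update(uuid='zzzzz-7ekkf-000000000000001',
                       body={'ip_address': '10.10.0.1'}).execute()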
diff --git a/services/nodemanager/setup.py b/services/nodemanager/setup.py
new file mode 100644 (file)
index 0000000..44a35f5
--- /dev/null
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+import os
+import subprocess
+import time
+
+from setuptools import setup, find_packages
+
+SETUP_DIR = os.path.dirname(__file__) or "."
+cmd_opts = {'egg_info': {}}
+try:
+    git_tags = subprocess.check_output(
+        ['git', 'log', '--first-parent', '--max-count=1',
+         '--format=format:%ct %h', SETUP_DIR]).split()
+    assert len(git_tags) == 2
+except (AssertionError, OSError, subprocess.CalledProcessError):
+    pass
+else:
+    git_tags[0] = time.strftime('%Y%m%d%H%M%S', time.gmtime(int(git_tags[0])))
+    cmd_opts['egg_info']['tag_build'] = '.{}.{}'.format(*git_tags)
+
+setup(name='arvados-node-manager',
+      version='0.1',
+      description='Arvados compute node manager',
+      long_description=open(os.path.join(SETUP_DIR, 'README.rst')).read(),
+      author='Arvados',
+      author_email='info@arvados.org',
+      url="https://arvados.org",
+      license='GNU Affero General Public License, version 3.0',
+      packages=find_packages(),
+      install_requires=[
+        'apache-libcloud',
+        'arvados-python-client',
+        'pykka',
+        'python-daemon',
+        ],
+      scripts=['bin/arvados-node-manager'],
+      test_suite='tests',
+      tests_require=['mock>=1.0'],
+      zip_safe=False,
+      options=cmd_opts,
+      )
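The egg_info override stamps development builds with the latest commit's timestamp and abbreviated hash. A worked sketch of the resulting tag, with stand-in commit data:

    import time

    commit_time, commit_hash = '1414152405', 'abc1234'  # stand-in values
    stamp = time.strftime('%Y%m%d%H%M%S', time.gmtime(int(commit_time)))
    print('.{}.{}'.format(stamp, commit_hash))
    # -> .20141024120645.abc1234, so the build would be versioned
    #    0.1.20141024120645.abc1234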
diff --git a/services/nodemanager/tests/__init__.py b/services/nodemanager/tests/__init__.py
new file mode 100644 (file)
index 0000000..c5eaf76
--- /dev/null
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+
+import logging
+import os
+
+# Set the ANMTEST_LOGLEVEL environment variable to enable logging at that level.
+loglevel = os.environ.get('ANMTEST_LOGLEVEL', 'CRITICAL')
+logging.basicConfig(level=getattr(logging, loglevel.upper()))
+
+# Set the ANMTEST_TIMEOUT environment variable to the maximum amount of time to
+# wait for tested actors to respond to important messages.  The default value
+# is very conservative, because a small value may produce false negatives on
+# slower systems.  If you're debugging a known timeout issue, however, you may
+# want to set this lower to speed up tests.
+pykka_timeout = int(os.environ.get('ANMTEST_TIMEOUT', '10'))
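A hypothetical way to apply both settings; they must be in the environment before this package is imported.

    import os

    os.environ['ANMTEST_LOGLEVEL'] = 'DEBUG'  # show uncaught actor errors
    os.environ['ANMTEST_TIMEOUT'] = '5'       # fail stuck actors faster
    # ...then run the suite, e.g. with unittest discovery from the
    # services/nodemanager directory.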
diff --git a/services/nodemanager/tests/test_clientactor.py b/services/nodemanager/tests/test_clientactor.py
new file mode 100644 (file)
index 0000000..57a0d32
--- /dev/null
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import unittest
+
+import mock
+import pykka
+
+import arvnodeman.clientactor as clientactor
+from . import testutil
+
+class RemotePollLoopActorTestCase(testutil.RemotePollLoopActorTestMixin,
+                                  unittest.TestCase):
+    class MockClientError(Exception):
+        pass
+
+    class TestActor(clientactor.RemotePollLoopActor):
+        LOGGER_NAME = 'arvnodeman.testpoll'
+
+        def _send_request(self):
+            return self._client()
+    TestActor.CLIENT_ERRORS = (MockClientError,)
+    TEST_CLASS = TestActor
+
+
+    def build_monitor(self, side_effect, *args, **kwargs):
+        super(RemotePollLoopActorTestCase, self).build_monitor(*args, **kwargs)
+        self.client.side_effect = side_effect
+
+    def test_poll_loop_starts_after_subscription(self):
+        self.build_monitor(['test1'])
+        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
+        self.stop_proxy(self.monitor)
+        self.subscriber.assert_called_with('test1')
+        self.assertTrue(self.timer.schedule.called)
+
+    def test_poll_loop_continues_after_failure(self):
+        self.build_monitor(self.MockClientError)
+        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
+        self.assertTrue(self.stop_proxy(self.monitor),
+                        "poll loop died after error")
+        self.assertTrue(self.timer.schedule.called,
+                        "poll loop did not reschedule after error")
+        self.assertFalse(self.subscriber.called,
+                         "poll loop notified subscribers after error")
+
+    def test_late_subscribers_get_responses(self):
+        self.build_monitor(['pre_late_test', 'late_test'])
+        self.monitor.subscribe(lambda response: None).get(self.TIMEOUT)
+        self.monitor.subscribe(self.subscriber)
+        self.monitor.poll().get(self.TIMEOUT)
+        self.stop_proxy(self.monitor)
+        self.subscriber.assert_called_with('late_test')
+
+    def test_survive_dead_subscriptions(self):
+        self.build_monitor(['survive1', 'survive2'])
+        dead_subscriber = mock.Mock(name='dead_subscriber')
+        dead_subscriber.side_effect = pykka.ActorDeadError
+        self.monitor.subscribe(dead_subscriber)
+        self.monitor.subscribe(self.subscriber)
+        self.monitor.poll().get(self.TIMEOUT)
+        self.assertTrue(self.stop_proxy(self.monitor),
+                        "poll loop died from dead subscriber")
+        self.subscriber.assert_called_with('survive2')
+
+    def check_poll_timers(self, *test_times):
+        schedule_mock = self.timer.schedule
+        last_expect = None
+        with mock.patch('time.time') as time_mock:
+            for fake_time, expect_next in test_times:
+                time_mock.return_value = fake_time
+                self.monitor.poll(last_expect).get(self.TIMEOUT)
+                self.assertTrue(schedule_mock.called)
+                self.assertEqual(expect_next, schedule_mock.call_args[0][0])
+                schedule_mock.reset_mock()
+                last_expect = expect_next
+
+    def test_poll_timing_on_consecutive_successes_with_drift(self):
+        self.build_monitor(['1', '2'], poll_wait=3, max_poll_wait=14)
+        self.check_poll_timers((0, 3), (4, 6))
+
+    def test_poll_backoff_on_failures(self):
+        self.build_monitor(self.MockClientError, poll_wait=3, max_poll_wait=14)
+        self.check_poll_timers((0, 6), (6, 18), (18, 32))
+
+    def test_poll_timing_after_error_recovery(self):
+        self.build_monitor(['a', self.MockClientError(), 'b'],
+                           poll_wait=3, max_poll_wait=14)
+        self.check_poll_timers((0, 3), (4, 10), (10, 13))
+
+    def test_no_subscriptions_by_key_without_support(self):
+        self.build_monitor([])
+        with self.assertRaises(AttributeError):
+            self.monitor.subscribe_to('key')
+
+
+class RemotePollLoopActorWithKeysTestCase(testutil.RemotePollLoopActorTestMixin,
+                                          unittest.TestCase):
+    class TestActor(RemotePollLoopActorTestCase.TestActor):
+        def _item_key(self, item):
+            return item['key']
+    TEST_CLASS = TestActor
+
+
+    def build_monitor(self, side_effect, *args, **kwargs):
+        super(RemotePollLoopActorWithKeysTestCase, self).build_monitor(
+            *args, **kwargs)
+        self.client.side_effect = side_effect
+
+    def test_key_subscription(self):
+        self.build_monitor([[{'key': 1}, {'key': 2}]])
+        self.monitor.subscribe_to(2, self.subscriber).get(self.TIMEOUT)
+        self.stop_proxy(self.monitor)
+        self.subscriber.assert_called_with({'key': 2})
+
+    def test_survive_dead_key_subscriptions(self):
+        item = {'key': 3}
+        self.build_monitor([[item], [item]])
+        dead_subscriber = mock.Mock(name='dead_subscriber')
+        dead_subscriber.side_effect = pykka.ActorDeadError
+        self.monitor.subscribe_to(3, dead_subscriber)
+        self.monitor.subscribe_to(3, self.subscriber)
+        self.monitor.poll().get(self.TIMEOUT)
+        self.assertTrue(self.stop_proxy(self.monitor),
+                        "poll loop died from dead key subscriber")
+        self.subscriber.assert_called_with(item)
+
+    def test_mixed_subscriptions(self):
+        item = {'key': 4}
+        self.build_monitor([[item], [item]])
+        key_subscriber = mock.Mock(name='key_subscriber')
+        self.monitor.subscribe(self.subscriber)
+        self.monitor.subscribe_to(4, key_subscriber)
+        self.monitor.poll().get(self.TIMEOUT)
+        self.stop_proxy(self.monitor)
+        self.subscriber.assert_called_with([item])
+        key_subscriber.assert_called_with(item)
+
+    def test_subscription_to_missing_key(self):
+        self.build_monitor([[]])
+        self.monitor.subscribe_to('nonesuch', self.subscriber).get(self.TIMEOUT)
+        self.stop_proxy(self.monitor)
+        self.subscriber.assert_called_with(None)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
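The three timing tests above encode a simple backoff schedule, inferred from the expected timer values: after each consecutive failure the retry delay doubles from poll_wait, capped at max_poll_wait, counted from the failed poll.

    poll_wait, max_poll_wait = 3, 14
    now, delay = 0, poll_wait
    for _ in range(3):
        delay = min(delay * 2, max_poll_wait)
        print("poll fails at {:2d} -> retry at {:2d}".format(now, now + delay))
        now += delay
    # -> retries at 6, 18, and 32, matching test_poll_backoff_on_failures.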
diff --git a/services/nodemanager/tests/test_computenode.py b/services/nodemanager/tests/test_computenode.py
new file mode 100644 (file)
index 0000000..57a86fd
--- /dev/null
@@ -0,0 +1,288 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+import unittest
+
+import arvados.errors as arverror
+import httplib2
+import mock
+import pykka
+
+import arvnodeman.computenode as cnode
+from . import testutil
+
+class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
+    def make_mocks(self, arvados_effect=None, cloud_effect=None):
+        if arvados_effect is None:
+            arvados_effect = [testutil.arvados_node_mock()]
+        self.arvados_effect = arvados_effect
+        self.timer = testutil.MockTimer()
+        self.api_client = mock.MagicMock(name='api_client')
+        self.api_client.nodes().create().execute.side_effect = arvados_effect
+        self.api_client.nodes().update().execute.side_effect = arvados_effect
+        self.cloud_client = mock.MagicMock(name='cloud_client')
+        self.cloud_client.create_node.return_value = testutil.cloud_node_mock(1)
+
+    def make_actor(self, arv_node=None):
+        if not hasattr(self, 'timer'):
+            self.make_mocks(arvados_effect=[arv_node])
+        self.setup_actor = cnode.ComputeNodeSetupActor.start(
+            self.timer, self.api_client, self.cloud_client,
+            testutil.MockSize(1), arv_node).proxy()
+
+    def test_creation_without_arvados_node(self):
+        self.make_actor()
+        self.assertEqual(self.arvados_effect[-1],
+                         self.setup_actor.arvados_node.get(self.TIMEOUT))
+        self.assertTrue(self.api_client.nodes().create().execute.called)
+        self.assertEqual(self.cloud_client.create_node(),
+                         self.setup_actor.cloud_node.get(self.TIMEOUT))
+
+    def test_creation_with_arvados_node(self):
+        self.make_actor(testutil.arvados_node_mock())
+        self.assertEqual(self.arvados_effect[-1],
+                         self.setup_actor.arvados_node.get(self.TIMEOUT))
+        self.assertTrue(self.api_client.nodes().update().execute.called)
+        self.assertEqual(self.cloud_client.create_node(),
+                         self.setup_actor.cloud_node.get(self.TIMEOUT))
+
+    def test_failed_calls_retried(self):
+        self.make_mocks([
+                arverror.ApiError(httplib2.Response({'status': '500'}), ""),
+                testutil.arvados_node_mock(),
+                ])
+        self.make_actor()
+        self.wait_for_assignment(self.setup_actor, 'cloud_node')
+
+    def test_stop_when_no_cloud_node(self):
+        self.make_mocks(
+            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
+        self.make_actor()
+        self.setup_actor.stop_if_no_cloud_node()
+        self.assertTrue(
+            self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))
+
+    def test_no_stop_when_cloud_node(self):
+        self.make_actor()
+        self.wait_for_assignment(self.setup_actor, 'cloud_node')
+        self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT)
+        self.assertTrue(self.stop_proxy(self.setup_actor),
+                        "actor was stopped by stop_if_no_cloud_node")
+
+    def test_subscribe(self):
+        self.make_mocks(
+            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
+        self.make_actor()
+        subscriber = mock.Mock(name='subscriber_mock')
+        self.setup_actor.subscribe(subscriber)
+        self.api_client.nodes().create().execute.side_effect = [
+            testutil.arvados_node_mock()]
+        self.wait_for_assignment(self.setup_actor, 'cloud_node')
+        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
+                         subscriber.call_args[0][0].actor_ref.actor_urn)
+
+    def test_late_subscribe(self):
+        self.make_actor()
+        subscriber = mock.Mock(name='subscriber_mock')
+        self.wait_for_assignment(self.setup_actor, 'cloud_node')
+        self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
+        self.stop_proxy(self.setup_actor)
+        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
+                         subscriber.call_args[0][0].actor_ref.actor_urn)
+
+
+class ComputeNodeShutdownActorTestCase(testutil.ActorTestMixin,
+                                       unittest.TestCase):
+    def make_mocks(self, cloud_node=None):
+        self.timer = testutil.MockTimer()
+        self.cloud_client = mock.MagicMock(name='cloud_client')
+        if cloud_node is None:
+            cloud_node = testutil.cloud_node_mock()
+        self.cloud_node = cloud_node
+
+    def make_actor(self, arv_node=None):
+        if not hasattr(self, 'timer'):
+            self.make_mocks()
+        self.shutdown_actor = cnode.ComputeNodeShutdownActor.start(
+            self.timer, self.cloud_client, self.cloud_node).proxy()
+
+    def test_easy_shutdown(self):
+        self.make_actor()
+        self.shutdown_actor.cloud_node.get(self.TIMEOUT)
+        self.stop_proxy(self.shutdown_actor)
+        self.assertTrue(self.cloud_client.destroy_node.called)
+
+    def test_late_subscribe(self):
+        self.make_actor()
+        subscriber = mock.Mock(name='subscriber_mock')
+        self.shutdown_actor.subscribe(subscriber).get(self.TIMEOUT)
+        self.stop_proxy(self.shutdown_actor)
+        self.assertEqual(self.shutdown_actor.actor_ref.actor_urn,
+                         subscriber.call_args[0][0].actor_ref.actor_urn)
+
+
+class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
+                                     unittest.TestCase):
+    def make_actor(self):
+        self.driver = mock.MagicMock(name='driver_mock')
+        self.updater = cnode.ComputeNodeUpdateActor.start(self.driver).proxy()
+
+    def test_node_sync(self):
+        self.make_actor()
+        cloud_node = testutil.cloud_node_mock()
+        arv_node = testutil.arvados_node_mock()
+        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+        self.driver().sync_node.assert_called_with(cloud_node, arv_node)
+
+
+@mock.patch('time.time', return_value=1)
+class ShutdownTimerTestCase(unittest.TestCase):
+    def test_two_length_window(self, time_mock):
+        timer = cnode.ShutdownTimer(time_mock.return_value, [8, 2])
+        self.assertEqual(481, timer.next_opening())
+        self.assertFalse(timer.window_open())
+        time_mock.return_value += 500
+        self.assertEqual(1081, timer.next_opening())
+        self.assertTrue(timer.window_open())
+        time_mock.return_value += 200
+        self.assertEqual(1081, timer.next_opening())
+        self.assertFalse(timer.window_open())
+
+    def test_three_length_window(self, time_mock):
+        timer = cnode.ShutdownTimer(time_mock.return_value, [6, 3, 1])
+        self.assertEqual(361, timer.next_opening())
+        self.assertFalse(timer.window_open())
+        time_mock.return_value += 400
+        self.assertEqual(961, timer.next_opening())
+        self.assertTrue(timer.window_open())
+        time_mock.return_value += 200
+        self.assertEqual(961, timer.next_opening())
+        self.assertFalse(timer.window_open())
+
+
+class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
+                                      unittest.TestCase):
+    class MockShutdownTimer(object):
+        def _set_state(self, is_open, next_opening):
+            self.window_open = lambda: is_open
+            self.next_opening = lambda: next_opening
+
+
+    def make_mocks(self, node_num):
+        self.shutdowns = self.MockShutdownTimer()
+        self.shutdowns._set_state(False, 300)
+        self.timer = mock.MagicMock(name='timer_mock')
+        self.updates = mock.MagicMock(name='update_mock')
+        self.cloud_mock = testutil.cloud_node_mock(node_num)
+        self.subscriber = mock.Mock(name='subscriber_mock')
+
+    def make_actor(self, node_num=1, arv_node=None, start_time=None):
+        if not hasattr(self, 'cloud_mock'):
+            self.make_mocks(node_num)
+        if start_time is None:
+            start_time = time.time()
+        self.node_actor = cnode.ComputeNodeMonitorActor.start(
+            self.cloud_mock, start_time, self.shutdowns, self.timer,
+            self.updates, arv_node).proxy()
+        self.subscription = self.node_actor.subscribe(self.subscriber)
+
+    def test_init_shutdown_scheduling(self):
+        self.make_actor()
+        self.subscription.get(self.TIMEOUT)
+        self.assertTrue(self.timer.schedule.called)
+        self.assertEqual(300, self.timer.schedule.call_args[0][0])
+
+    def test_shutdown_subscription(self):
+        self.make_actor()
+        self.shutdowns._set_state(True, 600)
+        self.node_actor.consider_shutdown().get(self.TIMEOUT)
+        self.assertTrue(self.subscriber.called)
+        self.assertEqual(self.node_actor.actor_ref.actor_urn,
+                         self.subscriber.call_args[0][0].actor_ref.actor_urn)
+
+    def test_shutdown_without_arvados_node(self):
+        self.make_actor()
+        self.shutdowns._set_state(True, 600)
+        self.node_actor.consider_shutdown().get(self.TIMEOUT)
+        self.assertTrue(self.subscriber.called)
+
+    def test_no_shutdown_without_arvados_node_and_old_cloud_node(self):
+        self.make_actor(start_time=0)
+        self.shutdowns._set_state(True, 600)
+        self.node_actor.consider_shutdown().get(self.TIMEOUT)
+        self.assertFalse(self.subscriber.called)
+
+    def check_shutdown_rescheduled(self, window_open, next_window,
+                                   schedule_time=None):
+        self.shutdowns._set_state(window_open, next_window)
+        self.timer.schedule.reset_mock()
+        self.node_actor.consider_shutdown().get(self.TIMEOUT)
+        self.stop_proxy(self.node_actor)
+        self.assertTrue(self.timer.schedule.called)
+        if schedule_time is not None:
+            self.assertEqual(schedule_time, self.timer.schedule.call_args[0][0])
+        self.assertFalse(self.subscriber.called)
+
+    def test_shutdown_window_close_scheduling(self):
+        self.make_actor()
+        self.check_shutdown_rescheduled(False, 600, 600)
+
+    def test_no_shutdown_when_node_running_job(self):
+        self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
+        self.check_shutdown_rescheduled(True, 600)
+
+    def test_no_shutdown_when_node_state_unknown(self):
+        self.make_actor(5, testutil.arvados_node_mock(5, info={}))
+        self.check_shutdown_rescheduled(True, 600)
+
+    def test_no_shutdown_when_node_state_stale(self):
+        self.make_actor(6, testutil.arvados_node_mock(6, age=900))
+        self.check_shutdown_rescheduled(True, 600)
+
+    def test_arvados_node_match(self):
+        self.make_actor(2)
+        arv_node = testutil.arvados_node_mock(
+            2, hostname='compute-two.zzzzz.arvadosapi.com')
+        pair_id = self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)
+        self.assertEqual(self.cloud_mock.id, pair_id)
+        self.stop_proxy(self.node_actor)
+        self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)
+
+    def test_arvados_node_mismatch(self):
+        self.make_actor(3)
+        arv_node = testutil.arvados_node_mock(1)
+        self.assertIsNone(
+            self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
+
+    def test_update_cloud_node(self):
+        self.make_actor(1)
+        self.make_mocks(2)
+        self.cloud_mock.id = '1'
+        self.node_actor.update_cloud_node(self.cloud_mock)
+        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
+        self.assertEqual([testutil.ip_address_mock(2)],
+                         current_cloud.private_ips)
+
+    def test_missing_cloud_node_update(self):
+        self.make_actor(1)
+        self.node_actor.update_cloud_node(None)
+        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
+        self.assertEqual([testutil.ip_address_mock(1)],
+                         current_cloud.private_ips)
+
+    def test_update_arvados_node(self):
+        self.make_actor(3)
+        job_uuid = 'zzzzz-jjjjj-updatejobnode00'
+        new_arvados = testutil.arvados_node_mock(3, job_uuid)
+        self.node_actor.update_arvados_node(new_arvados)
+        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
+        self.assertEqual(job_uuid, current_arvados['job_uuid'])
+
+    def test_missing_arvados_node_update(self):
+        self.make_actor(4, testutil.arvados_node_mock(4))
+        self.node_actor.update_arvados_node(None)
+        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
+        self.assertEqual(testutil.ip_address_mock(4),
+                         current_arvados['ip_address'])
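The ShutdownTimer expectations above come straight from window arithmetic: windows are minutes counted from the (mocked) start time, alternating closed then open. For test_two_length_window:

    start = 1                  # mocked time.time() at construction
    closed_min, open_min = 8, 2
    first_open = start + closed_min * 60          # 481
    first_close = first_open + open_min * 60      # 601
    second_open = first_close + closed_min * 60   # 1081
    print((first_open, first_close, second_open))  # -> (481, 601, 1081)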
diff --git a/services/nodemanager/tests/test_computenode_ec2.py b/services/nodemanager/tests/test_computenode_ec2.py
new file mode 100644 (file)
index 0000000..d1c9e43
--- /dev/null
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+import unittest
+
+import mock
+
+import arvnodeman.computenode.ec2 as ec2
+from . import testutil
+
+class EC2ComputeNodeDriverTestCase(unittest.TestCase):
+    def setUp(self):
+        self.driver_mock = mock.MagicMock(name='driver_mock')
+
+    def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
+        create_kwargs.setdefault('ping_host', '100::')
+        return ec2.ComputeNodeDriver(
+            auth_kwargs, list_kwargs, create_kwargs,
+            driver_class=self.driver_mock)
+
+    def test_driver_instantiation(self):
+        kwargs = {'key': 'testkey'}
+        driver = self.new_driver(auth_kwargs=kwargs)
+        self.assertTrue(self.driver_mock.called)
+        self.assertEqual(kwargs, self.driver_mock.call_args[1])
+
+    def test_list_kwargs_become_filters(self):
+        # We're also testing tag name translation.
+        driver = self.new_driver(list_kwargs={'tag_test': 'true'})
+        driver.list_nodes()
+        list_method = self.driver_mock().list_nodes
+        self.assertTrue(list_method.called)
+        self.assertEqual({'tag:test': 'true'},
+                          list_method.call_args[1].get('ex_filters'))
+
+    def test_create_location_loaded_at_initialization(self):
+        kwargs = {'location': 'testregion'}
+        driver = self.new_driver(create_kwargs=kwargs)
+        self.assertTrue(self.driver_mock().list_locations.called)
+
+    def test_create_image_loaded_at_initialization(self):
+        kwargs = {'image': 'testimage'}
+        driver = self.new_driver(create_kwargs=kwargs)
+        self.assertTrue(self.driver_mock().list_images.called)
+
+    def test_create_includes_ping_secret(self):
+        arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
+        driver = self.new_driver()
+        driver.create_node(testutil.MockSize(1), arv_node)
+        create_method = self.driver_mock().create_node
+        self.assertTrue(create_method.called)
+        self.assertIn('ping_secret=ssshh',
+                      create_method.call_args[1].get('ex_userdata',
+                                                     'arg missing'))
+
+    def test_tags_created_from_arvados_node(self):
+        arv_node = testutil.arvados_node_mock(8)
+        cloud_node = testutil.cloud_node_mock(8)
+        driver = self.new_driver(list_kwargs={'tag:list': 'test'})
+        self.assertEqual({'ex_metadata': {'list': 'test'},
+                          'name': 'compute8.zzzzz.arvadosapi.com'},
+                         driver.arvados_create_kwargs(arv_node))
+
+    def test_tags_set_default_hostname_from_new_arvados_node(self):
+        arv_node = testutil.arvados_node_mock(hostname=None)
+        driver = self.new_driver()
+        actual = driver.arvados_create_kwargs(arv_node)
+        self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
+                         actual['name'])
+
+    def test_sync_node(self):
+        arv_node = testutil.arvados_node_mock(1)
+        cloud_node = testutil.cloud_node_mock(2)
+        driver = self.new_driver()
+        driver.sync_node(cloud_node, arv_node)
+        tag_mock = self.driver_mock().ex_create_tags
+        self.assertTrue(tag_mock.called)
+        self.assertEqual('compute1.zzzzz.arvadosapi.com',
+                         tag_mock.call_args[0][1].get('Name', 'no name'))
+
+    def test_node_create_time(self):
+        refsecs = int(time.time())
+        reftuple = time.gmtime(refsecs)
+        node = testutil.cloud_node_mock()
+        node.extra = {'launch_time': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
+                                                   reftuple)}
+        self.assertEqual(refsecs, ec2.ComputeNodeDriver.node_start_time(node))
diff --git a/services/nodemanager/tests/test_config.py b/services/nodemanager/tests/test_config.py
new file mode 100644 (file)
index 0000000..3aa9541
--- /dev/null
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import io
+import logging
+import unittest
+
+import arvnodeman.config as nmconfig
+
+class NodeManagerConfigTestCase(unittest.TestCase):
+    TEST_CONFIG = u"""
+[Cloud]
+provider = dummy
+shutdown_windows = 52, 6, 2
+
+[Cloud Credentials]
+creds = dummy_creds
+
+[Cloud List]
+[Cloud Create]
+
+[Size 1]
+cores = 1
+
+[Logging]
+file = /dev/null
+level = DEBUG
+testlogger = INFO
+"""
+
+    def load_config(self, config=None, config_str=None):
+        if config is None:
+            config = nmconfig.NodeManagerConfig()
+        if config_str is None:
+            config_str = self.TEST_CONFIG
+        with io.StringIO(config_str) as config_fp:
+            config.readfp(config_fp)
+        return config
+
+    def test_seeded_defaults(self):
+        config = nmconfig.NodeManagerConfig()
+        sec_names = set(config.sections())
+        self.assertIn('Arvados', sec_names)
+        self.assertIn('Daemon', sec_names)
+        self.assertFalse(any(name.startswith('Size ') for name in sec_names))
+
+    def test_list_sizes(self):
+        config = self.load_config()
+        client = config.new_cloud_client()
+        sizes = config.node_sizes(client.list_sizes())
+        self.assertEqual(1, len(sizes))
+        size, kwargs = sizes[0]
+        self.assertEqual('Small', size.name)
+        self.assertEqual(1, kwargs['cores'])
+
+    def test_shutdown_windows(self):
+        config = self.load_config()
+        self.assertEqual([52, 6, 2], config.shutdown_windows())
+
+    def test_log_levels(self):
+        config = self.load_config()
+        self.assertEqual({'level': logging.DEBUG,
+                          'testlogger': logging.INFO},
+                         config.log_levels())
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
new file mode 100644 (file)
index 0000000..869ae4a
--- /dev/null
@@ -0,0 +1,238 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+import unittest
+
+import mock
+
+import arvnodeman.daemon as nmdaemon
+from . import testutil
+
+class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
+                                     unittest.TestCase):
+    def make_daemon(self, cloud_nodes=[], arvados_nodes=[], want_sizes=[]):
+        for name in ['cloud_nodes', 'arvados_nodes', 'server_wishlist']:
+            setattr(self, name + '_poller', mock.MagicMock(name=name + '_mock'))
+        self.arv_factory = mock.MagicMock(name='arvados_mock')
+        self.cloud_factory = mock.MagicMock(name='cloud_mock')
+        self.cloud_factory().node_start_time.return_value = time.time()
+        self.cloud_updates = mock.MagicMock(name='updates_mock')
+        self.timer = testutil.MockTimer()
+        self.node_factory = mock.MagicMock(name='factory_mock')
+        self.node_setup = mock.MagicMock(name='setup_mock')
+        self.node_shutdown = mock.MagicMock(name='shutdown_mock')
+        self.daemon = nmdaemon.NodeManagerDaemonActor.start(
+            self.server_wishlist_poller, self.arvados_nodes_poller,
+            self.cloud_nodes_poller, self.cloud_updates, self.timer,
+            self.arv_factory, self.cloud_factory,
+            [54, 5, 1], 8, 600, 3600,
+            self.node_setup, self.node_shutdown, self.node_factory).proxy()
+        if cloud_nodes is not None:
+            self.daemon.update_cloud_nodes(cloud_nodes).get(self.TIMEOUT)
+        if arvados_nodes is not None:
+            self.daemon.update_arvados_nodes(arvados_nodes).get(self.TIMEOUT)
+        if want_sizes is not None:
+            self.daemon.update_server_wishlist(want_sizes).get(self.TIMEOUT)
+
+    def test_easy_node_creation(self):
+        size = testutil.MockSize(1)
+        self.make_daemon(want_sizes=[size])
+        self.stop_proxy(self.daemon)
+        self.assertTrue(self.node_setup.start.called)
+
+    def test_node_pairing(self):
+        cloud_node = testutil.cloud_node_mock(1)
+        arv_node = testutil.arvados_node_mock(1)
+        self.make_daemon([cloud_node], [arv_node])
+        self.stop_proxy(self.daemon)
+        self.node_factory.start().proxy().offer_arvados_pair.assert_called_with(
+            arv_node)
+
+    def test_node_pairing_after_arvados_update(self):
+        cloud_node = testutil.cloud_node_mock(2)
+        arv_node = testutil.arvados_node_mock(2, ip_address=None)
+        self.make_daemon([cloud_node], None)
+        pair_func = self.node_factory.start().proxy().offer_arvados_pair
+        pair_func().get.return_value = None
+        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
+        pair_func.assert_called_with(arv_node)
+
+        pair_func().get.return_value = cloud_node.id
+        pair_func.reset_mock()
+        arv_node = testutil.arvados_node_mock(2)
+        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
+        pair_func.assert_called_with(arv_node)
+
+    def test_old_arvados_node_not_double_assigned(self):
+        arv_node = testutil.arvados_node_mock(3, age=9000)
+        size = testutil.MockSize(3)
+        self.make_daemon(arvados_nodes=[arv_node])
+        setup_ref = self.node_setup.start().proxy().actor_ref
+        setup_ref.actor_urn = 0
+        self.node_setup.start.reset_mock()
+        self.daemon.update_server_wishlist([size]).get(self.TIMEOUT)
+        self.daemon.max_nodes.get(self.TIMEOUT)
+        setup_ref.actor_urn += 1
+        self.daemon.update_server_wishlist([size, size]).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        used_nodes = [call[1].get('arvados_node')
+                      for call in self.node_setup.start.call_args_list]
+        self.assertEqual(2, len(used_nodes))
+        self.assertIn(arv_node, used_nodes)
+        self.assertIn(None, used_nodes)
+
+    def test_node_count_satisfied(self):
+        self.make_daemon([testutil.cloud_node_mock()],
+                         want_sizes=[testutil.MockSize(1)])
+        self.stop_proxy(self.daemon)
+        self.assertFalse(self.node_setup.called)
+
+    def test_booting_nodes_counted(self):
+        cloud_node = testutil.cloud_node_mock(1)
+        arv_node = testutil.arvados_node_mock(1)
+        server_wishlist = [testutil.MockSize(1)] * 2
+        self.make_daemon([cloud_node], [arv_node], server_wishlist)
+        self.daemon.max_nodes.get(self.TIMEOUT)
+        self.assertTrue(self.node_setup.start.called)
+        self.daemon.update_server_wishlist(server_wishlist).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertEqual(1, self.node_setup.start.call_count)
+
+    def mock_setup_actor(self, cloud_node, arv_node):
+        setup = mock.MagicMock(name='setup_node_mock')
+        setup.actor_ref = self.node_setup.start().proxy().actor_ref
+        self.node_setup.reset_mock()
+        setup.actor_urn = cloud_node.id
+        setup.cloud_node.get.return_value = cloud_node
+        setup.arvados_node.get.return_value = arv_node
+        return setup
+
+    def start_node_boot(self, cloud_node=None, arv_node=None, id_num=1):
+        if cloud_node is None:
+            cloud_node = testutil.cloud_node_mock(id_num)
+        if arv_node is None:
+            arv_node = testutil.arvados_node_mock(id_num)
+        self.make_daemon(want_sizes=[testutil.MockSize(id_num)])
+        self.daemon.max_nodes.get(self.TIMEOUT)
+        self.assertEqual(1, self.node_setup.start.call_count)
+        return self.mock_setup_actor(cloud_node, arv_node)
+
+    def test_no_duplication_when_booting_node_listed_fast(self):
+        # Test that we don't start two ComputeNodeMonitorActors when
+        # we learn about a booting node through a listing before we
+        # get the "node up" message from CloudNodeSetupActor.
+        cloud_node = testutil.cloud_node_mock(1)
+        setup = self.start_node_boot(cloud_node)
+        self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
+        self.assertTrue(self.node_factory.start.called)
+        self.daemon.node_up(setup).get(self.TIMEOUT)
+        self.assertEqual(1, self.node_factory.start.call_count)
+
+    def test_no_duplication_when_booted_node_listed(self):
+        cloud_node = testutil.cloud_node_mock(2)
+        setup = self.start_node_boot(cloud_node, id_num=2)
+        self.daemon.node_up(setup)
+        self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
+        self.assertEqual(1, self.node_factory.start.call_count)
+
+    def test_node_counted_after_boot_with_slow_listing(self):
+        # Test that, after we boot a compute node, we assume it exists
+        # even if it doesn't appear in the listing (e.g., because of delays
+        # propagating tags).
+        setup = self.start_node_boot()
+        self.daemon.node_up(setup).get(self.TIMEOUT)
+        self.assertTrue(self.node_factory.start.called,
+                        "daemon not monitoring booted node")
+        self.daemon.update_cloud_nodes([])
+        self.stop_proxy(self.daemon)
+        self.assertEqual(1, self.node_factory.start.call_count,
+                         "daemon has duplicate monitors for booted node")
+        self.assertFalse(self.node_factory.start().proxy().stop.called,
+                         "daemon prematurely stopped monitoring a new node")
+
+    def test_booted_unlisted_node_counted(self):
+        setup = self.start_node_boot(id_num=1)
+        self.daemon.node_up(setup)
+        self.daemon.update_server_wishlist(
+            [testutil.MockSize(1)]).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertFalse(self.node_setup.start.called,
+                         "daemon did not count booted node toward wishlist")
+
+    def test_booted_node_can_shutdown(self):
+        setup = self.start_node_boot()
+        self.daemon.node_up(setup)
+        self.daemon.update_server_wishlist([])
+        self.daemon.node_can_shutdown(
+            self.node_factory.start().proxy()).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertTrue(self.node_shutdown.start.called,
+                        "daemon did not shut down booted node on offer")
+
+    def test_booted_node_lifecycle(self):
+        cloud_node = testutil.cloud_node_mock(6)
+        setup = self.start_node_boot(cloud_node, id_num=6)
+        monitor = self.node_factory.start().proxy()
+        monitor.cloud_node.get.return_value = cloud_node
+        self.daemon.node_up(setup)
+        self.daemon.update_server_wishlist([])
+        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
+        self.assertTrue(self.node_shutdown.start.called,
+                        "daemon did not shut down booted node on offer")
+        shutdown = self.node_shutdown.start().proxy()
+        shutdown.cloud_node.get.return_value = cloud_node
+        self.daemon.node_finished_shutdown(shutdown).get(self.TIMEOUT)
+        self.assertTrue(shutdown.stop.called,
+                        "shutdown actor not stopped after finishing")
+        self.assertTrue(monitor.stop.called,
+                        "monitor for booted node not stopped after shutdown")
+        self.daemon.update_server_wishlist(
+            [testutil.MockSize(2)]).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertTrue(self.node_setup.start.called,
+                        "second node not started after booted node stopped")
+
+    def test_booting_nodes_shut_down(self):
+        self.make_daemon(want_sizes=[testutil.MockSize(1)])
+        self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertTrue(
+            self.node_setup.start().proxy().stop_if_no_cloud_node.called)
+
+    def test_shutdown_declined_at_wishlist_capacity(self):
+        cloud_node = testutil.cloud_node_mock(1)
+        size = testutil.MockSize(1)
+        self.make_daemon(cloud_nodes=[cloud_node], want_sizes=[size])
+        self.daemon.node_can_shutdown(
+            self.node_factory.start().proxy()).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertFalse(self.node_shutdown.start.called)
+
+    def test_shutdown_accepted_below_capacity(self):
+        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
+        node_actor = self.node_factory().proxy()
+        self.daemon.node_can_shutdown(node_actor).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertTrue(self.node_shutdown.start.called)
+
+    def test_clean_shutdown_waits_for_node_setup_finish(self):
+        self.make_daemon(want_sizes=[testutil.MockSize(1)])
+        self.daemon.max_nodes.get(self.TIMEOUT)
+        self.assertTrue(self.node_setup.start.called)
+        new_node = self.node_setup.start().proxy()
+        self.daemon.shutdown().get(self.TIMEOUT)
+        self.assertTrue(new_node.stop_if_no_cloud_node.called)
+        self.daemon.node_up(new_node).get(self.TIMEOUT)
+        self.assertTrue(new_node.stop.called)
+        self.assertTrue(
+            self.daemon.actor_ref.actor_stopped.wait(self.TIMEOUT))
+
+    def test_wishlist_ignored_after_shutdown(self):
+        size = testutil.MockSize(2)
+        self.make_daemon(want_sizes=[size])
+        self.daemon.shutdown().get(self.TIMEOUT)
+        self.daemon.update_server_wishlist([size] * 2).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertEqual(1, self.node_setup.start.call_count)
diff --git a/services/nodemanager/tests/test_jobqueue.py b/services/nodemanager/tests/test_jobqueue.py
new file mode 100644 (file)
index 0000000..0a4d136
--- /dev/null
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import unittest
+
+import arvnodeman.jobqueue as jobqueue
+from . import testutil
+
+class ServerCalculatorTestCase(unittest.TestCase):
+    def make_calculator(self, factors, **kwargs):
+        return jobqueue.ServerCalculator(
+            [(testutil.MockSize(n), {'cores': n}) for n in factors], **kwargs)
+
+    def calculate(self, servcalc, *constraints):
+        return servcalc.servers_for_queue(
+            [{'uuid': 'zzzzz-jjjjj-{:015x}'.format(index),
+              'runtime_constraints': cdict}
+             for index, cdict in enumerate(constraints)])
+
+    def test_empty_queue_needs_no_servers(self):
+        servcalc = self.make_calculator([1])
+        self.assertEqual([], servcalc.servers_for_queue([]))
+
+    def test_easy_server_count(self):
+        servcalc = self.make_calculator([1])
+        servlist = self.calculate(servcalc, {'min_nodes': 3})
+        self.assertEqual(3, len(servlist))
+
+    def test_implicit_server_count(self):
+        servcalc = self.make_calculator([1])
+        servlist = self.calculate(servcalc, {}, {'min_nodes': 3})
+        self.assertEqual(4, len(servlist))
+
+    def test_bad_min_nodes_override(self):
+        servcalc = self.make_calculator([1])
+        servlist = self.calculate(servcalc,
+                                  {'min_nodes': -2}, {'min_nodes': 'foo'})
+        self.assertEqual(2, len(servlist))
+
+    def test_ignore_unsatisfiable_jobs(self):
+        servcalc = self.make_calculator([1], max_nodes=9)
+        servlist = self.calculate(servcalc,
+                                  {'min_cores_per_node': 2},
+                                  {'min_ram_mb_per_node': 256},
+                                  {'min_nodes': 6},
+                                  {'min_nodes': 12},
+                                  {'min_scratch_mb_per_node': 200})
+        self.assertEqual(6, len(servlist))
+
+
+class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
+                                   unittest.TestCase):
+    TEST_CLASS = jobqueue.JobQueueMonitorActor
+
+    class MockCalculator(object):
+        @staticmethod
+        def servers_for_queue(queue):
+            return [testutil.MockSize(n) for n in queue]
+
+
+    def build_monitor(self, side_effect, *args, **kwargs):
+        super(JobQueueMonitorActorTestCase, self).build_monitor(*args, **kwargs)
+        self.client.jobs().queue().execute.side_effect = side_effect
+
+    def test_subscribers_get_server_lists(self):
+        self.build_monitor([{'items': [1, 2]}], self.MockCalculator())
+        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
+        self.stop_proxy(self.monitor)
+        self.subscriber.assert_called_with([testutil.MockSize(1),
+                                            testutil.MockSize(2)])
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/services/nodemanager/tests/test_nodelist.py b/services/nodemanager/tests/test_nodelist.py
new file mode 100644 (file)
index 0000000..5346e7a
--- /dev/null
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import unittest
+
+import arvnodeman.nodelist as nodelist
+from . import testutil
+
+class ArvadosNodeListMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
+                                          unittest.TestCase):
+    TEST_CLASS = nodelist.ArvadosNodeListMonitorActor
+
+    def build_monitor(self, side_effect, *args, **kwargs):
+        super(ArvadosNodeListMonitorActorTestCase, self).build_monitor(
+            *args, **kwargs)
+        self.client.nodes().list().execute.side_effect = side_effect
+
+    def test_uuid_is_subscription_key(self):
+        node = testutil.arvados_node_mock()
+        self.build_monitor([{'items': [node]}])
+        self.monitor.subscribe_to(node['uuid'],
+                                  self.subscriber).get(self.TIMEOUT)
+        self.stop_proxy(self.monitor)
+        self.subscriber.assert_called_with(node)
+
+
+class CloudNodeListMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
+                                        unittest.TestCase):
+    TEST_CLASS = nodelist.CloudNodeListMonitorActor
+
+    class MockNode(object):
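+        # Minimal stand-in for a libcloud node, with just the
+        # attributes the poll loop inspects.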
+        def __init__(self, count):
+            self.id = str(count)
+            self.name = 'test{}.example.com'.format(count)
+            self.private_ips = ['10.0.0.{}'.format(count)]
+            self.public_ips = []
+            self.size = None
+            self.state = 0
+
+
+    def build_monitor(self, side_effect, *args, **kwargs):
+        super(CloudNodeListMonitorActorTestCase, self).build_monitor(
+            *args, **kwargs)
+        self.client.list_nodes.side_effect = side_effect
+
+    def test_id_is_subscription_key(self):
+        node = self.MockNode(1)
+        self.build_monitor([[node]])
+        self.monitor.subscribe_to('1', self.subscriber).get(self.TIMEOUT)
+        self.stop_proxy(self.monitor)
+        self.subscriber.assert_called_with(node)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/services/nodemanager/tests/test_timedcallback.py b/services/nodemanager/tests/test_timedcallback.py
new file mode 100644 (file)
index 0000000..1d1e6c3
--- /dev/null
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+import unittest
+
+import mock
+import pykka
+
+import arvnodeman.timedcallback as timedcallback
+from . import testutil
+
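+# no_sleep stubs out time.sleep so the timed callbacks can be tested
+# without real delays.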
+@testutil.no_sleep
+class TimedCallBackActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
+    def test_immediate_turnaround(self):
+        receiver = mock.Mock()
+        deliverer = timedcallback.TimedCallBackActor.start().proxy()
+        deliverer.schedule(time.time() - 1, receiver,
+                           'immediate').get(self.TIMEOUT)
+        self.stop_proxy(deliverer)
+        receiver.assert_called_with('immediate')
+
+    def test_delayed_turnaround(self):
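+        # With time.time() mocked, a callback fires only once the fake
+        # clock passes its scheduled time; the 'failure' callbacks are
+        # scheduled past the clock and must never be delivered.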
+        receiver = mock.Mock()
+        with mock.patch('time.time', return_value=0) as mock_now:
+            deliverer = timedcallback.TimedCallBackActor.start().proxy()
+            deliverer.schedule(1, receiver, 'delayed')
+            deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
+            self.assertFalse(receiver.called)
+            mock_now.return_value = 2
+            deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
+            self.stop_proxy(deliverer)
+        receiver.assert_called_with('delayed')
+
+    def test_out_of_order_scheduling(self):
+        receiver = mock.Mock()
+        with mock.patch('time.time', return_value=1.5) as mock_now:
+            deliverer = timedcallback.TimedCallBackActor.start().proxy()
+            deliverer.schedule(2, receiver, 'second')
+            deliverer.schedule(1, receiver, 'first')
+            deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
+            receiver.assert_called_with('first')
+            mock_now.return_value = 2.5
+            deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
+            self.stop_proxy(deliverer)
+        receiver.assert_called_with('second')
+
+    def test_dead_actors_ignored(self):
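+        # A receiver that raises ActorDeadError must not take down the
+        # TimedCallBackActor itself.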
+        receiver = mock.Mock(name='dead_actor', spec=pykka.ActorRef)
+        receiver.tell.side_effect = pykka.ActorDeadError
+        deliverer = timedcallback.TimedCallBackActor.start().proxy()
+        deliverer.schedule(time.time() - 1, receiver.tell,
+                           'error').get(self.TIMEOUT)
+        self.assertTrue(self.stop_proxy(deliverer), "deliverer died")
+        receiver.tell.assert_called_with('error')
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
new file mode 100644 (file)
index 0000000..0c63db3
--- /dev/null
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+
+import mock
+import pykka
+
+from . import pykka_timeout
+
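+# Patch time.sleep to a no-op; use as a decorator on timing-sensitive
+# test cases.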
+no_sleep = mock.patch('time.sleep', lambda n: None)
+
+def arvados_node_mock(node_num=99, job_uuid=None, age=0, **kwargs):
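+    # Build a dict shaped like an Arvados node record.  job_uuid=True
+    # substitutes a placeholder job UUID; age backdates modified_at by
+    # that many seconds.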
+    if job_uuid is True:
+        job_uuid = 'zzzzz-jjjjj-jobjobjobjobjob'
+    slurm_state = 'idle' if (job_uuid is None) else 'alloc'
+    node = {'uuid': 'zzzzz-yyyyy-12345abcde67890',
+            'created_at': '2014-01-01T01:02:03Z',
+            'modified_at': time.strftime('%Y-%m-%dT%H:%M:%SZ',
+                                         time.gmtime(time.time() - age)),
+            'hostname': 'compute{}'.format(node_num),
+            'domain': 'zzzzz.arvadosapi.com',
+            'ip_address': ip_address_mock(node_num),
+            'job_uuid': job_uuid,
+            'info': {'slurm_state': slurm_state}}
+    node.update(kwargs)
+    return node
+
+def cloud_node_mock(node_num=99):
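+    # A non-callable mock restricted to the attribute names a cloud
+    # node object exposes.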
+    node = mock.NonCallableMagicMock(
+        ['id', 'name', 'state', 'public_ips', 'private_ips', 'driver', 'size',
+         'image', 'extra'],
+        name='cloud_node')
+    node.id = str(node_num)
+    node.name = node.id
+    node.public_ips = []
+    node.private_ips = [ip_address_mock(node_num)]
+    return node
+
+def ip_address_mock(last_octet):
+    return '10.20.30.{}'.format(last_octet)
+
+class MockSize(object):
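+    # Fake cloud size with resources that scale linearly with factor.
+    # Sizes compare equal by id so they can be matched in mock
+    # assertions.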
+    def __init__(self, factor):
+        self.id = 'z{}.test'.format(factor)
+        self.name = self.id
+        self.ram = 128 * factor
+        self.disk = 100 * factor
+        self.bandwidth = 16 * factor
+        self.price = float(factor)
+        self.extra = {}
+
+    def __eq__(self, other):
+        return self.id == other.id
+
+
+class MockTimer(object):
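+    # Timer stand-in that invokes callbacks immediately instead of
+    # scheduling them.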
+    def schedule(self, want_time, callback, *args, **kwargs):
+        return callback(*args, **kwargs)
+
+
+class ActorTestMixin(object):
+    FUTURE_CLASS = pykka.ThreadingFuture
+    TIMEOUT = pykka_timeout
+
+    def tearDown(self):
+        pykka.ActorRegistry.stop_all()
+
+    def stop_proxy(self, proxy):
+        return proxy.actor_ref.stop(timeout=self.TIMEOUT)
+
+    def wait_for_assignment(self, proxy, attr_name, unassigned=None,
+                            timeout=TIMEOUT):
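+        # Poll the proxy's attribute until it holds something other
+        # than `unassigned`, failing the test if the timeout elapses.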
+        deadline = time.time() + timeout
+        while True:
+            loop_timeout = deadline - time.time()
+            if loop_timeout <= 0:
+                self.fail("actor did not assign {} in time".format(attr_name))
+            result = getattr(proxy, attr_name).get(loop_timeout)
+            if result is not unassigned:
+                return result
+
+
+class RemotePollLoopActorTestMixin(ActorTestMixin):
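+    # Sets up the polled actor under test with a mock API client,
+    # timer, and subscriber.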
+    def build_monitor(self, *args, **kwargs):
+        self.timer = mock.MagicMock(name='timer_mock')
+        self.client = mock.MagicMock(name='client_mock')
+        self.subscriber = mock.Mock(name='subscriber_mock')
+        self.monitor = self.TEST_CLASS.start(
+            self.client, self.timer, *args, **kwargs).proxy()