Merge branch '11221-always-restart-services'
author Tom Clegg <tom@curoverse.com>
Thu, 23 Mar 2017 18:51:21 +0000 (14:51 -0400)
committer Tom Clegg <tom@curoverse.com>
Thu, 23 Mar 2017 18:51:21 +0000 (14:51 -0400)
closes #11221

49 files changed:
apps/workbench/app/controllers/application_controller.rb
apps/workbench/app/controllers/search_controller.rb
apps/workbench/app/views/projects/_show_dashboard.html.erb
apps/workbench/app/views/work_units/_show_child.html.erb
apps/workbench/test/controllers/disabled_api_test.rb
doc/_config.yml
doc/_includes/_compute_ping_rb.liquid [new file with mode: 0644]
doc/install/crunch2-slurm/install-compute-node.html.textile.liquid
doc/install/crunch2-slurm/install-dispatch.html.textile.liquid
doc/install/crunch2-slurm/install-slurm.html.textile.liquid [new file with mode: 0644]
doc/install/install-compute-ping.html.textile.liquid [new file with mode: 0644]
doc/install/install-nodemanager.html.textile.liquid [new file with mode: 0644]
doc/install/migrate-docker19.html.textile.liquid [new file with mode: 0644]
docker/migrate-docker19/Dockerfile [new file with mode: 0644]
docker/migrate-docker19/build.sh [new file with mode: 0755]
docker/migrate-docker19/dnd.sh [new file with mode: 0755]
docker/migrate-docker19/migrate.sh [new file with mode: 0755]
sdk/cli/arvados-cli.gemspec
sdk/cli/test/test_arv-collection-create.rb
sdk/cwl/arvados_cwl/arvdocker.py
sdk/cwl/tests/test_container.py
sdk/python/arvados/api.py
sdk/python/arvados/cache.py [new file with mode: 0644]
sdk/python/arvados/collection.py
sdk/python/arvados/commands/keepdocker.py
sdk/python/arvados/commands/migrate19.py [new file with mode: 0644]
sdk/python/bin/arv-migrate-docker19
sdk/python/tests/run_test_server.py
sdk/python/tests/test_arv_keepdocker.py
sdk/python/tests/test_cache.py [new file with mode: 0644]
services/api/app/controllers/arvados/v1/groups_controller.rb
services/api/app/models/collection.rb
services/api/app/models/container_request.rb
services/api/app/models/job.rb
services/api/config/application.default.yml
services/api/lib/crunch_dispatch.rb
services/api/test/functional/arvados/v1/filters_test.rb
services/api/test/unit/container_request_test.rb
services/api/test/unit/crunch_dispatch_test.rb
services/api/test/unit/fail_jobs_test.rb
services/api/test/unit/job_test.rb
services/crunch-run/crunchrun.go
services/crunch-run/crunchrun_test.go
services/fuse/arvados_fuse/command.py
services/fuse/tests/test_mount_type.py [new file with mode: 0644]
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
services/nodemanager/arvnodeman/daemon.py
tools/arvbox/bin/arvbox
tools/arvbox/lib/arvbox/docker/service/websockets/run-service

index ee3ac4d6810588b7d630705a5efe4cd6e08bd6ae..23b8788bcb17520b4e8ca71f493b514c0fcda9f3 100644 (file)
@@ -232,11 +232,18 @@ class ApplicationController < ActionController::Base
       objects = @objects
     end
     if objects.respond_to?(:result_offset) and
-        objects.respond_to?(:result_limit) and
-        objects.respond_to?(:items_available)
+        objects.respond_to?(:result_limit)
       next_offset = objects.result_offset + objects.result_limit
-      if next_offset < objects.items_available
+      if objects.respond_to?(:items_available) and (next_offset < objects.items_available)
         next_offset
+      elsif @objects.results.size > 0 and (params[:count] == 'none' or
+           (params[:controller] == 'search' and params[:action] == 'choose'))
+        last_object_class = @objects.last.class
+        if params['last_object_class'].nil? or params['last_object_class'] == last_object_class.to_s
+          next_offset
+        else
+          @objects.select{|obj| obj.class == last_object_class}.size
+        end
       else
         nil
       end
index 447f416aa25e438ab8fd5cdba3cf0d7d9827ebb7..2511ab08188fbb4aa2e17d250b1feecafff03bd9 100644 (file)
@@ -15,6 +15,8 @@ class SearchController < ApplicationController
     end
     @objects = search_what.contents(limit: @limit,
                                     offset: @offset,
+                                    count: 'none',
+                                    last_object_class: params["last_object_class"],
                                     filters: @filters)
     super
   end
@@ -22,6 +24,7 @@ class SearchController < ApplicationController
   def next_page_href with_params={}
     super with_params.merge(last_object_class: @objects.last.class.to_s,
                             project_uuid: params[:project_uuid],
+                            count: 'none',
                             filters: @filters.to_json)
   end
 end
index ab6eb16f5153862061a40c6b59f85bd89819c4f5..bcfeb6a466e544926dc4a8c7555dcbcd0cdd4ab7 100644 (file)
 %>
 
 <%
+  recent_procs_panel_width = 6
   if !PipelineInstance.api_exists?(:index)
     recent_procs_title = 'Recent processes'
     run_proc_title = 'Choose a workflow to run:'
+    show_node_status = false
+    # Recent processes panel should take the entire width when it is the only
+    # one being rendered.
+    if !Rails.configuration.show_recent_collections_on_dashboard
+      recent_procs_panel_width = 12
+    end
   else
     recent_procs_title = 'Recent pipelines and processes'
     run_proc_title = 'Choose a pipeline or workflow to run:'
+    show_node_status = true
   end
 %>
 
   <div class="row">
-    <div class="col-md-6">
+    <div class="col-md-<%= recent_procs_panel_width %>">
       <div class="panel panel-default" style="min-height: 10.5em">
         <div class="panel-heading">
           <span class="panel-title"><%=recent_procs_title%></span>
     </div>
 
     <div class="col-md-6">
+      <% if show_node_status %>
       <% nodes = Node.filter([["last_ping_at", ">", Time.now - 3600]]).results %>
       <div class="panel panel-default" style="min-height: 10.5em">
         <div class="panel-heading"><span class="panel-title">Compute node status</span>
           </div>
         </div>
       </div>
+      <% end %>
       <% if Rails.configuration.show_recent_collections_on_dashboard %>
       <div class="panel panel-default">
         <div class="panel-heading"><span class="panel-title">Recent collections</span>
index d0db8673f14f6b9c9229647e6a9299b5ff3362b8..03bb518301a32fefe8c7c788fab4985a622171f3 100644 (file)
@@ -3,8 +3,8 @@
       <div class="row">
         <div class="col-md-3" style="word-break:break-all;">
           <h4 class="panel-title">
-            <a class="component-detail-panel fa fa-caret-down" data-toggle="collapse" href="#collapse<%= i %>">
-              <%= current_obj.label %>
+            <a class="component-detail-panel" data-toggle="collapse" href="#collapse<%= i %>">
+              <%= current_obj.label %> <span class="caret" href="#collapse<%= i %>"></span>
             </a>
           </h4>
         </div>
index 47276c02e835419cf89601e78c153f4f5431df21..334b89c4da7aa5ec578a353dcafd180d7bc09240 100644 (file)
@@ -15,6 +15,17 @@ class DisabledApiTest < ActionController::TestCase
     assert_includes @response.body, "Run a process"
   end
 
+  test "dashboard compute node status not shown when pipeline_instance index API is disabled" do
+    @controller = ProjectsController.new
+
+    dd = ArvadosApiClient.new_or_current.discovery.deep_dup
+    dd[:resources][:pipeline_instances][:methods].delete(:index)
+    ArvadosApiClient.any_instance.stubs(:discovery).returns(dd)
+
+    get :index, {}, session_for(:active)
+    assert_not_includes @response.body, "compute-node-summary-pane"
+  end
+
   [
     [:jobs, JobsController.new],
     [:job_tasks, JobTasksController.new],
index 22ae21400db12ef795341e1ead3ae57666dd2d98..8c3d42a397691b3a745b77b1a9de5f4f61316c24 100644 (file)
@@ -162,12 +162,17 @@ navbar:
       - install/install-keep-balance.html.textile.liquid
     - Containers API support on SLURM:
       - install/crunch2-slurm/install-prerequisites.html.textile.liquid
+      - install/crunch2-slurm/install-slurm.html.textile.liquid
       - install/crunch2-slurm/install-compute-node.html.textile.liquid
       - install/crunch2-slurm/install-dispatch.html.textile.liquid
       - install/crunch2-slurm/install-test.html.textile.liquid
+      - install/install-nodemanager.html.textile.liquid
+      - install/install-compute-ping.html.textile.liquid
     - Jobs API support (deprecated):
       - install/install-crunch-dispatch.html.textile.liquid
       - install/install-compute-node.html.textile.liquid
     - Helpful hints:
       - install/copy_pipeline_from_curoverse.html.textile.liquid
       - install/cheat_sheet.html.textile.liquid
+    - Migrating from Docker 1.9:
+      - install/migrate-docker19.html.textile.liquid
diff --git a/doc/_includes/_compute_ping_rb.liquid b/doc/_includes/_compute_ping_rb.liquid
new file mode 100644 (file)
index 0000000..edd18e3
--- /dev/null
@@ -0,0 +1,285 @@
+#!/usr/bin/env ruby
+
+require 'rubygems'
+
+require 'cgi'
+require 'fileutils'
+require 'json'
+require 'net/https'
+require 'socket'
+require 'syslog'
+
+class ComputeNodePing
+  @@NODEDATA_DIR = "/var/tmp/arv-node-data"
+  @@PUPPET_CONFFILE = "/etc/puppet/puppet.conf"
+  @@HOST_STATEFILE = "/var/run/arvados-compute-ping-hoststate.json"
+
+  def initialize(args, stdout, stderr)
+    @stdout = stdout
+    @stderr = stderr
+    @stderr_loglevel = ((args.first == "quiet") ?
+                        Syslog::LOG_ERR : Syslog::LOG_DEBUG)
+    @puppet_disabled = false
+    @syslog = Syslog.open("arvados-compute-ping",
+                          Syslog::LOG_CONS | Syslog::LOG_PID,
+                          Syslog::LOG_DAEMON)
+    @puppetless = File.exist?('/compute-node.puppetless')
+
+    begin
+      prepare_ping
+      load_puppet_conf unless @puppetless
+      begin
+        @host_state = JSON.parse(IO.read(@@HOST_STATEFILE))
+      rescue Errno::ENOENT
+        @host_state = nil
+      end
+    rescue
+      @syslog.close
+      raise
+    end
+  end
+
+  def send
+    pong = send_raw_ping
+
+    if pong["hostname"] and pong["domain"] and pong["first_ping_at"]
+      if @host_state.nil?
+        @host_state = {
+          "fqdn" => (Socket.gethostbyname(Socket.gethostname).first rescue nil),
+          "resumed_slurm" =>
+            ["busy", "idle"].include?(pong["crunch_worker_state"]),
+        }
+        update_host_state({})
+      end
+
+      if hostname_changed?(pong)
+        disable_puppet unless @puppetless
+        rename_host(pong)
+        update_host_state("fqdn" => fqdn_from_pong(pong),
+                          "resumed_slurm" => false)
+      end
+
+      unless @host_state["resumed_slurm"]
+        run_puppet_agent unless @puppetless
+        resume_slurm_node(pong["hostname"])
+        update_host_state("resumed_slurm" => true)
+      end
+    end
+
+    log("Last ping at #{pong['last_ping_at']}")
+  end
+
+  def cleanup
+    enable_puppet if @puppet_disabled and not @puppetless
+    @syslog.close
+  end
+
+  private
+
+  def log(message, level=Syslog::LOG_INFO)
+    @syslog.log(level, message)
+    if level <= @stderr_loglevel
+      @stderr.write("#{Time.now.strftime("%Y-%m-%d %H:%M:%S")} #{message}\n")
+    end
+  end
+
+  def abort(message, code=1)
+    log(message, Syslog::LOG_ERR)
+    exit(code)
+  end
+
+  def run_and_check(cmd_a, accept_codes, io_opts, &block)
+    result = IO.popen(cmd_a, "r", io_opts, &block)
+    unless accept_codes.include?($?.exitstatus)
+      abort("#{cmd_a} exited #{$?.exitstatus}")
+    end
+    result
+  end
+
+  DEFAULT_ACCEPT_CODES=[0]
+  def check_output(cmd_a, accept_codes=DEFAULT_ACCEPT_CODES, io_opts={})
+    # Run a command, check the exit status, and return its stdout as a string.
+    run_and_check(cmd_a, accept_codes, io_opts) do |pipe|
+      pipe.read
+    end
+  end
+
+  def check_command(cmd_a, accept_codes=DEFAULT_ACCEPT_CODES, io_opts={})
+    # Run a command, send stdout to syslog, and check the exit status.
+    run_and_check(cmd_a, accept_codes, io_opts) do |pipe|
+      pipe.each_line do |line|
+        line.chomp!
+        log("#{cmd_a.first}: #{line}") unless line.empty?
+      end
+    end
+  end
+
+  def replace_file(path, body)
+    open(path, "w") { |f| f.write(body) }
+  end
+
+  def update_host_state(updates_h)
+    @host_state.merge!(updates_h)
+    replace_file(@@HOST_STATEFILE, @host_state.to_json)
+  end
+
+  def disable_puppet
+    check_command(["puppet", "agent", "--disable"])
+    @puppet_disabled = true
+    loop do
+      # Wait for any running puppet agents to finish.
+      check_output(["pgrep", "puppet"], 0..1)
+      break if $?.exitstatus == 1
+      sleep(1)
+    end
+  end
+
+  def enable_puppet
+    check_command(["puppet", "agent", "--enable"])
+    @puppet_disabled = false
+  end
+
+  def prepare_ping
+    begin
+      ping_uri_s = File.read(File.join(@@NODEDATA_DIR, "arv-ping-url"))
+    rescue Errno::ENOENT
+      abort("ping URL file is not present yet, skipping run")
+    end
+
+    ping_uri = URI.parse(ping_uri_s)
+    payload_h = CGI.parse(ping_uri.query)
+
+    # Collect all extra data to be sent
+    dirname = File.join(@@NODEDATA_DIR, "meta-data")
+    Dir.open(dirname).each do |basename|
+      filename = File.join(dirname, basename)
+      if File.file?(filename)
+        payload_h[basename.gsub('-', '_')] = File.read(filename).chomp
+      end
+    end
+
+    ping_uri.query = nil
+    @ping_req = Net::HTTP::Post.new(ping_uri.to_s)
+    @ping_req.set_form_data(payload_h)
+    @ping_client = Net::HTTP.new(ping_uri.host, ping_uri.port)
+    @ping_client.use_ssl = ping_uri.scheme == 'https'
+  end
+
+  def send_raw_ping
+    begin
+      response = @ping_client.start do |http|
+        http.request(@ping_req)
+      end
+      if response.is_a? Net::HTTPSuccess
+        pong = JSON.parse(response.body)
+      else
+        raise "response was a #{response}"
+      end
+    rescue JSON::ParserError => error
+      abort("Error sending ping: could not parse JSON response: #{error}")
+    rescue => error
+      abort("Error sending ping: #{error}")
+    end
+
+    replace_file(File.join(@@NODEDATA_DIR, "pong.json"), response.body)
+    if pong["errors"] then
+      log(pong["errors"].join("; "), Syslog::LOG_ERR)
+      if pong["errors"].grep(/Incorrect ping_secret/).any?
+        system("halt")
+      end
+      exit(1)
+    end
+    pong
+  end
+
+  def load_puppet_conf
+    # Parse Puppet configuration suitable for rewriting.
+    # Save certnames in @puppet_certnames.
+    # Save other functional configuration lines in @puppet_conf.
+    @puppet_conf = []
+    @puppet_certnames = []
+    open(@@PUPPET_CONFFILE, "r") do |conffile|
+      conffile.each_line do |line|
+        key, value = line.strip.split(/\s*=\s*/, 2)
+        if key == "certname"
+          @puppet_certnames << value
+        elsif not (key.nil? or key.empty? or key.start_with?("#"))
+          @puppet_conf << line
+        end
+      end
+    end
+  end
+
+  def fqdn_from_pong(pong)
+    "#{pong['hostname']}.#{pong['domain']}"
+  end
+
+  def certname_from_pong(pong)
+    fqdn = fqdn_from_pong(pong).sub(".", ".compute.")
+    "#{pong['first_ping_at'].gsub(':', '-').downcase}.#{fqdn}"
+  end
+
+  def hostname_changed?(pong)
+    if @puppetless
+      (@host_state["fqdn"] != fqdn_from_pong(pong))
+    else
+      (@host_state["fqdn"] != fqdn_from_pong(pong)) or
+        (@puppet_certnames != [certname_from_pong(pong)])
+    end
+  end
+
+  def rename_host(pong)
+    new_fqdn = fqdn_from_pong(pong)
+    log("Renaming host from #{@host_state["fqdn"]} to #{new_fqdn}")
+
+    replace_file("/etc/hostname", "#{new_fqdn.split('.', 2).first}\n")
+    check_output(["hostname", new_fqdn])
+
+    ip_address = check_output(["facter", "ipaddress"]).chomp
+    esc_address = Regexp.escape(ip_address)
+    check_command(["sed", "-i", "/etc/hosts",
+                   "-e", "s/^#{esc_address}.*$/#{ip_address}\t#{new_fqdn}/"])
+
+    unless @puppetless
+      new_conflines = @puppet_conf + ["\n[agent]\n",
+                                      "certname=#{certname_from_pong(pong)}\n"]
+      replace_file(@@PUPPET_CONFFILE, new_conflines.join(""))
+      FileUtils.remove_entry_secure("/var/lib/puppet/ssl")
+    end
+  end
+
+  def run_puppet_agent
+    log("Running puppet agent")
+    enable_puppet
+    check_command(["puppet", "agent", "--onetime", "--no-daemonize",
+                   "--no-splay", "--detailed-exitcodes",
+                   "--ignorecache", "--no-usecacheonfailure"],
+                  [0, 2], {err: [:child, :out]})
+  end
+
+  def resume_slurm_node(node_name)
+    current_state = check_output(["sinfo", "--noheader", "-o", "%t",
+                                  "-n", node_name]).chomp
+    if %w(down drain drng).include?(current_state)
+      log("Resuming node in SLURM")
+      check_command(["scontrol", "update", "NodeName=#{node_name}",
+                     "State=RESUME"], [0], {err: [:child, :out]})
+    end
+  end
+end
+
+LOCK_DIRNAME = "/var/lock/arvados-compute-node.lock"
+begin
+  Dir.mkdir(LOCK_DIRNAME)
+rescue Errno::EEXIST
+  exit(0)
+end
+
+ping_sender = nil
+begin
+  ping_sender = ComputeNodePing.new(ARGV, $stdout, $stderr)
+  ping_sender.send
+ensure
+  Dir.rmdir(LOCK_DIRNAME)
+  ping_sender.cleanup unless ping_sender.nil?
+end
index 158616c36f55c8b9b296be6d291886dc948a144b..25e105b4e97cf2ed8cfcfccbeaa39fd9803c8b10 100644 (file)
@@ -30,3 +30,9 @@ On Debian-based systems:
 {% include 'install_compute_fuse' %}
 
 {% include 'install_docker_cleaner' %}
+
+h2. Set up SLURM
+
+Install SLURM on the compute node using the same process you used on the API server in the "previous step":install-slurm.html.
+
+The @slurm.conf@ and @/etc/munge/munge.key@ files must be identical on all SLURM nodes. Copy the files you created on the API server in the "previous step":install-slurm.html to each compute node.
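+
+For example, one way to push the files from the API server to a compute node (a sketch only; the node name @compute0@ and the Debian path @/etc/slurm-llnl/slurm.conf@ are assumptions, and service names vary by distribution):
+
+<notextile>
+<pre><code>~$ <span class="userinput">scp /etc/slurm-llnl/slurm.conf /etc/munge/munge.key compute0:/tmp/</span>
+compute0:~$ <span class="userinput">sudo install -o root -g root -m 644 /tmp/slurm.conf /etc/slurm-llnl/slurm.conf</span>
+compute0:~$ <span class="userinput">sudo install -o munge -g munge -m 600 /tmp/munge.key /etc/munge/munge.key</span>
+compute0:~$ <span class="userinput">sudo service munge restart</span>
+</code></pre>
+</notextile>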
index e6b8e1ea01d2deab5c408b1dd2c6015473d6d00c..f50534e1986d2b3ba1a77cf08ce1811dd5131e48 100644 (file)
@@ -120,6 +120,21 @@ You can work around this issue by disabling the Docker daemon's systemd integrat
 
 {% include 'notebox_end' %}
 
+h3. CrunchRunCommand: Using host networking for containers
+
+Older Linux kernels (prior to 3.18) have bugs in network namespace handling which can lead to compute node lockups.  This is indicated by blocked kernel tasks in "Workqueue: netns cleanup_net".  If you are experiencing this problem, you can work around it by disabling Docker's use of network namespaces across the cluster.  Be aware that this reduces container isolation, which may be a security risk.
+
+<notextile>
+<pre><code class="userinput">Client:
+  APIHost: <b>zzzzz.arvadosapi.com</b>
+  AuthToken: <b>zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz</b>
+CrunchRunCommand:
+- <b>crunch-run</b>
+- <b>"-container-enable-networking=always"</b>
+- <b>"-container-network-mode=host"</b>
+</code></pre>
+</notextile>
+
 h3. MinRetryPeriod: Rate-limit repeated attempts to start containers
 
 If SLURM is unable to run a container, the dispatcher will submit it again after the next PollPeriod. If PollPeriod is very short, this can be excessive. If MinRetryPeriod is set, the dispatcher will avoid submitting the same container to SLURM more than once in the given time span.
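
For example, to wait at least 30 seconds before resubmitting a given container (a sketch; 30s is an arbitrary value, tune it for your cluster):

<notextile>
<pre><code class="userinput">Client:
  APIHost: <b>zzzzz.arvadosapi.com</b>
  AuthToken: <b>zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz</b>
MinRetryPeriod: <b>30s</b>
</code></pre>
</notextile>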
diff --git a/doc/install/crunch2-slurm/install-slurm.html.textile.liquid b/doc/install/crunch2-slurm/install-slurm.html.textile.liquid
new file mode 100644 (file)
index 0000000..aa7e648
--- /dev/null
@@ -0,0 +1,110 @@
+---
+layout: default
+navsection: installguide
+title: Set up SLURM
+...
+
+h2(#slurm). Set up SLURM
+
+On the API server, install SLURM and munge, and generate a munge key.
+
+On Debian-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo /usr/bin/apt-get install slurm-llnl munge</span>
+~$ <span class="userinput">sudo /usr/sbin/create-munge-key</span>
+</code></pre>
+</notextile>
+
+On Red Hat-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo yum install slurm munge slurm-munge</span>
+</code></pre>
+</notextile>
+
+Now we need to give SLURM a configuration file.  On Debian-based systems, this is installed at @/etc/slurm-llnl/slurm.conf@.  On Red Hat-based systems, this is installed at @/etc/slurm/slurm.conf@.  Here's an example @slurm.conf@:
+
+<notextile>
+<pre>
+ControlMachine=uuid_prefix.your.domain
+SlurmctldPort=6817
+SlurmdPort=6818
+AuthType=auth/munge
+StateSaveLocation=/tmp
+SlurmdSpoolDir=/tmp/slurmd
+SwitchType=switch/none
+MpiDefault=none
+SlurmctldPidFile=/var/run/slurmctld.pid
+SlurmdPidFile=/var/run/slurmd.pid
+ProctrackType=proctrack/pgid
+CacheGroups=0
+ReturnToService=2
+TaskPlugin=task/affinity
+#
+# TIMERS
+SlurmctldTimeout=300
+SlurmdTimeout=300
+InactiveLimit=0
+MinJobAge=300
+KillWait=30
+Waittime=0
+#
+# SCHEDULING
+SchedulerType=sched/backfill
+SchedulerPort=7321
+SelectType=select/linear
+FastSchedule=0
+#
+# LOGGING
+SlurmctldDebug=3
+#SlurmctldLogFile=
+SlurmdDebug=3
+#SlurmdLogFile=
+JobCompType=jobcomp/none
+#JobCompLoc=
+JobAcctGatherType=jobacct_gather/none
+#
+# COMPUTE NODES
+NodeName=DEFAULT
+PartitionName=DEFAULT MaxTime=INFINITE State=UP
+
+NodeName=compute[0-255]
+PartitionName=compute Nodes=compute[0-255] Default=YES Shared=YES
+</pre>
+</notextile>
+
+h3. SLURM configuration essentials
+
+Whenever you change this file, you will need to update the copy _on every compute node_ as well as the controller node, and then run @sudo scontrol reconfigure@.
+
+*@ControlMachine@* should be a DNS name that resolves to the SLURM controller (dispatch/API server). This must resolve correctly on all SLURM worker nodes as well as the controller itself. In general SLURM is very sensitive about all of the nodes being able to communicate with the controller _and one another_, all using the same DNS names.
+
+*@NodeName=compute[0-255]@* establishes that the hostnames of the worker nodes will be compute0, compute1, etc. through compute255.
+* There are several ways to compress sequences of names, like @compute[0-9,80,100-110]@. See the "hostlist" discussion in the @slurm.conf(5)@ and @scontrol(1)@ man pages for more information.
+* It is not necessary for all of the nodes listed here to be alive in order for SLURM to work, although you should make sure the DNS entries exist. It is easiest to define lots of hostnames up front, assigning them to real nodes and updating your DNS records as the nodes appear. This minimizes the frequency of @slurm.conf@ updates and use of @scontrol reconfigure@.
+
+Each hostname in @slurm.conf@ must also resolve correctly on all SLURM worker nodes as well as the controller itself. Furthermore, the hostnames used in the configuration file must match the hostnames reported by @hostname@ or @hostname -s@ on the nodes themselves. This applies to the ControlMachine as well as the worker nodes.
+
+For example:
+* In @slurm.conf@ on control and worker nodes: @ControlMachine=uuid_prefix.your.domain@
+* In @slurm.conf@ on control and worker nodes: @NodeName=compute[0-255]@
+* In @/etc/resolv.conf@ on control and worker nodes: @search uuid_prefix.your.domain@
+* On the control node: @hostname@ reports @uuid_prefix.your.domain@
+* On worker node 123: @hostname@ reports @compute123.uuid_prefix.your.domain@
+
+h3. Automatic hostname assignment
+
+The API server will choose an unused hostname from the set given in @application.yml@, which defaults to @compute[0-255]@.
+
+If it is not feasible to give your compute nodes hostnames like compute0, compute1, etc., you can accommodate other naming schemes with a bit of extra configuration.
+
+If you want Arvados to assign names to your nodes with a different consecutive numeric series like @{worker1-0000, worker1-0001, worker1-0002}@, add an entry to @application.yml@; see @/var/www/arvados-api/current/config/application.default.yml@ for details. Example:
+* In @application.yml@: <code>assign_node_hostname: worker1-%<slot_number>04d</code>
+* In @slurm.conf@: <code>NodeName=worker1-[0000-0255]</code>
+
+If your worker hostnames are already assigned by other means, and the full set of names is known in advance, have your worker node bootstrapping script (see "Installing a compute node":install-compute-node.html) send its current hostname, rather than expect Arvados to assign one.
+* In @application.yml@: <code>assign_node_hostname: false</code>
+* In @slurm.conf@: <code>NodeName=alice,bob,clay,darlene</code>
+
+If your worker hostnames are already assigned by other means, but the full set of names is _not_ known in advance, you can use the @slurm.conf@ and @application.yml@ settings in the previous example, but you must also update @slurm.conf@ (both on the controller and on all worker nodes) and run @sudo scontrol reconfigure@ whenever a new node comes online.
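+
+Once the configuration is in place on the controller and compute nodes, a quick sanity check (a sketch; exact output varies with SLURM version) is to confirm that munge credentials validate and that the controller sees the configured nodes:
+
+<notextile>
+<pre><code>~$ <span class="userinput">munge -n | unmunge | grep STATUS</span>
+~$ <span class="userinput">sudo scontrol ping</span>
+~$ <span class="userinput">sinfo</span>
+</code></pre>
+</notextile>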
diff --git a/doc/install/install-compute-ping.html.textile.liquid b/doc/install/install-compute-ping.html.textile.liquid
new file mode 100644 (file)
index 0000000..ce96268
--- /dev/null
@@ -0,0 +1,9 @@
+---
+layout: default
+navsection: installguide
+title: Sample compute node ping script
+...
+
+When a new elastic compute node is booted, it needs to contact Arvados to register itself.  Here is an example ping script to run on boot.
+
+<notextile> {% code 'compute_ping_rb' as ruby %} </notextile>
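+
+Besides running at boot, the script is safe to run repeatedly: it exits early if another copy holds its lock directory, and it skips the run until the ping URL file appears.  A minimal cron sketch (the install path @/usr/local/bin/arvados-compute-ping.rb@ is an assumption):
+
+<notextile>
+<pre><code># /etc/cron.d/arvados-compute-ping -- hypothetical install path for the script above
+*/5 * * * * root /usr/local/bin/arvados-compute-ping.rb quiet
+</code></pre>
+</notextile>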
diff --git a/doc/install/install-nodemanager.html.textile.liquid b/doc/install/install-nodemanager.html.textile.liquid
new file mode 100644 (file)
index 0000000..baf7c2f
--- /dev/null
@@ -0,0 +1,562 @@
+---
+layout: default
+navsection: installguide
+title: Install Node Manager
+...
+
+Arvados Node Manager provides elastic computing for Arvados and SLURM by creating and destroying virtual machines on demand.  Node Manager currently supports Amazon Web Services (AWS), Google Cloud Platform (GCP) and Microsoft Azure.
+
+Note: Node Manager is only required for elastic cloud computing environments.  Fixed-size clusters do not require Node Manager.
+
+h2. Install
+
+Node Manager may run anywhere, but it must be able to communicate with the cloud provider's APIs and use the command line tools @sinfo@, @squeue@ and @scontrol@ to communicate with the cluster's SLURM controller.
+
+On Debian-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo apt-get install arvados-node-manager</span>
+</code></pre>
+</notextile>
+
+On Red Hat-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo yum install arvados-node-manager</span>
+</code></pre>
+</notextile>
+
+h2. Create compute image
+
+Configure a virtual machine following the "instructions to set up a compute node.":{{site.baseurl}}/install/crunch2-slurm/install-compute-node.html and set it up to run a "ping script":{{site.baseurl}}/install/install-compute-ping.html at boot.
+
+Create a virtual machine image using the commands provided by your cloud provider.  We recommend using a tool such as "Packer":https://www.packer.io/ to automate this process.
+
+Configure node manager to use the image with the @image@ or @image_id@ parameter.
+
+h2. Configure node manager
+
+The configuration file is at @/etc/arvados-node-manager/config.ini@.  Some configuration details are specific to the cloud provider you are using:
+
+* "Amazon Web Services":#aws
+* "Google Cloud Platform":#gcp
+* "Microsoft Azure":#azure
+
+h3(#aws). Amazon Web Services
+
+<pre>
+# EC2 configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes.  For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times.  If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type.  However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on the rate of spending (in $/hr).  Node Manager will not boot
+# additional nodes if the total price of already running nodes meets or
+# exceeds this threshold.  The default of 0 means no limit.
+max_total_price = 0
+
+# Poll EC2 nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down.  Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 1800
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = ec2
+
+# It's usually most cost-effective to shut down compute nodes during narrow
+# windows of time.  For example, EC2 bills each node by the hour, so the best
+# time to shut down a node is right before a new hour of uptime starts.
+# Shutdown windows define these periods of time.  These are windows in
+# full minutes, separated by commas.  Counting from the time the node is
+# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
+# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
+# For example, "54, 5, 1" means the node may shut down from the 54th to the
+# 59th minute of each hour of uptime.
+# Specify at least two windows.  You can add as many as you need beyond that.
+shutdown_windows = 54, 5, 1
+
+[Cloud Credentials]
+key = KEY
+secret = SECRET_KEY
+region = us-east-1
+timeout = 60
+
+[Cloud List]
+# This section defines filters that find compute nodes.
+# Tags that you specify here will automatically be added to nodes you create.
+# Replace colons in Amazon filters with underscores
+# (e.g., write "tag:mytag" as "tag_mytag").
+instance-state-name = running
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+[Cloud Create]
+# New compute nodes will send pings to Arvados at this host.
+# You may specify a port, and use brackets to disambiguate IPv6 addresses.
+ping_host = hostname:port
+
+# Give the name of an SSH key on AWS...
+ex_keyname = string
+
+# ... or a file path for an SSH key that can log in to the compute node.
+# (One or the other, not both.)
+# ssh_key = path
+
+# The EC2 IDs of the image and subnet compute nodes should use.
+image_id = idstring
+subnet_id = idstring
+
+# Comma-separated EC2 IDs for the security group(s) assigned to each
+# compute node.
+security_groups = idstring1, idstring2
+
+
+# You can define any number of Size sections to list EC2 sizes you're
+# willing to use.  The Node Manager should boot the cheapest size(s) that
+# can run jobs in the queue.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs.  You can also override Amazon's provided
+# data fields (such as price per hour) by setting them here.
+
+[Size m4.large]
+cores = 2
+price = 0.126
+scratch = 100
+
+[Size m4.xlarge]
+cores = 4
+price = 0.252
+scratch = 100
+</pre>
+
+h3(#gcp). Google Cloud Platform
+
+<pre>
+# Google Compute Engine configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# Node Manager will ensure that there are at least this many nodes running at
+# all times.  If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type.  However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.  By default, these will be the cheapest node size.
+max_nodes = 8
+
+# Poll compute nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Upper limit on the rate of spending (in $/hr).  Node Manager will not boot
+# additional nodes if the total price of already running nodes meets or
+# exceeds this threshold.  The default of 0 means no limit.
+max_total_price = 0
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = gce
+
+# Shutdown windows define periods of time when a node may and may not
+# be shut down.  These are windows in full minutes, separated by
+# commas.  Counting from the time the node is booted, the node WILL
+# NOT shut down for N1 minutes; then it MAY shut down for N2 minutes;
+# then it WILL NOT shut down for N3 minutes; and so on.  For example,
+# "54, 5, 1" means the node may shut down from the 54th to the 59th
+# minute of each hour of uptime.
+# GCE bills by the minute, and does not provide information about when
+# a node booted.  Node Manager will store this information in metadata
+# when it boots a node; if that information is not available, it will
+# assume the node booted at the epoch.  These shutdown settings are
+# very aggressive.  You may want to adjust this if you want more
+# continuity of service from a single node.
+shutdown_windows = 20, 999999
+
+[Cloud Credentials]
+user_id = client_email_address@developer.gserviceaccount.com
+key = path_to_certificate.pem
+project = project-id-from-google-cloud-dashboard
+timeout = 60
+
+# Valid location (zone) names: https://cloud.google.com/compute/docs/zones
+datacenter = us-central1-a
+
+# Optional settings. For full documentation see
+# http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#libcloud.compute.drivers.gce.GCENodeDriver
+#
+# auth_type = SA               # SA, IA or GCE
+# scopes = https://www.googleapis.com/auth/compute
+# credential_file =
+
+[Cloud List]
+# A comma-separated list of tags that must be applied to a node for it to
+# be considered a compute node.
+# The driver will automatically apply these tags to nodes it creates.
+tags = zyxwv, compute
+
+[Cloud Create]
+# New compute nodes will send pings to Arvados at this host.
+# You may specify a port, and use brackets to disambiguate IPv6 addresses.
+ping_host = hostname:port
+
+# A file path for an SSH key that can log in to the compute node.
+# ssh_key = path
+
+# The GCE image name and network zone name to use when creating new nodes.
+image = debian-7
+# network = your_network_name
+
+# JSON string of service account authorizations for this cluster.
+# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
+# service_accounts = [{'email':'account@example.com', 'scopes':['storage-ro']}]
+
+
+# You can define any number of Size sections to list node sizes you're
+# willing to use.  The Node Manager should boot the cheapest size(s) that
+# can run jobs in the queue.
+#
+# The Size fields are interpreted the same way as with a libcloud NodeSize:
+# http://libcloud.readthedocs.org/en/latest/compute/api.html#libcloud.compute.base.NodeSize
+#
+# See https://cloud.google.com/compute/docs/machine-types for a list
+# of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs.
+# You can also override Google's provided data fields (such as price per hour)
+# by setting them here.
+
+[Size n1-standard-2]
+cores = 2
+price = 0.076
+scratch = 100
+
+[Size n1-standard-4]
+cores = 4
+price = 0.152
+scratch = 200
+</pre>
+
+h3(#azure). Microsoft Azure
+
+<pre>
+# Azure configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes.  For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times.  If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type.  However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on the rate of spending (in $/hr).  Node Manager will not boot
+# additional nodes if the total price of already running nodes meets or
+# exceeds this threshold.  The default of 0 means no limit.
+max_total_price = 0
+
+# Poll Azure nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down.  Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 1800
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = azure
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down.  These are windows in full minutes, separated by commas.  Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on.  For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# Azure bills by the minute, so it makes sense to aggressively shut down idle
+# nodes.  Specify at least two windows.  You can add as many as you need beyond
+# that.
+shutdown_windows = 20, 999999
+
+[Cloud Credentials]
+# Use "azure account list" with the azure CLI to get these values.
+tenant_id = 00000000-0000-0000-0000-000000000000
+subscription_id = 00000000-0000-0000-0000-000000000000
+
+# The following directions are based on
+# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
+#
+# azure config mode arm
+# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
+# azure ad sp create "<Application_Id>"
+# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/{subscriptionId}/
+#
+# Use <Application_Id> for "key" and the <Your_Password> for "secret"
+#
+key = 00000000-0000-0000-0000-000000000000
+secret = PASSWORD
+timeout = 60
+region = East US
+
+[Cloud List]
+# The resource group in which the compute node virtual machines will be created
+# and listed.
+ex_resource_group = ArvadosResourceGroup
+
+[Cloud Create]
+# The compute node image, as a link to a VHD in Azure blob store.
+image = https://example.blob.core.windows.net/system/Microsoft.Compute/Images/images/zyxwv-compute-osDisk.vhd
+
+# Path to a local ssh key file that will be used to provision new nodes.
+ssh_key = /home/arvadosuser/.ssh/id_rsa.pub
+
+# The account name for the admin user that will be provisioned on new nodes.
+ex_user_name = arvadosuser
+
+# The Azure storage account that will be used to store the node OS disk images.
+ex_storage_account = arvadosstorage
+
+# The virtual network the VMs will be associated with.
+ex_network = ArvadosNetwork
+
+# Optional subnet of the virtual network.
+#ex_subnet = default
+
+# Node tags
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+# the API server to ping
+ping_host = hostname:port
+
+# You can define any number of Size sections to list Azure sizes you're willing
+# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
+# in the queue.  You must also provide the price per hour, as the Azure
+# compute driver does not currently report prices.
+#
+# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
+# for a list of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs.  You can also override Microsoft's provided
+# data fields by setting them here.
+
+[Size Standard_D3]
+cores = 4
+price = 0.56
+
+[Size Standard_D4]
+cores = 8
+price = 1.12
+</pre>
+
+h2. Running
+
+<pre>
+$ arvados-node-manager --config /etc/arvados-node-manager/config.ini
+</pre>
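+
+Node Manager is a long-running daemon, so in production you will usually run it under a process supervisor rather than directly from a shell.  A minimal systemd unit sketch (the unit name and @ExecStart@ path are assumptions; your package may already provide one):
+
+<pre>
+[Unit]
+Description=Arvados Node Manager
+After=network.target
+
+[Service]
+ExecStart=/usr/bin/arvados-node-manager --config /etc/arvados-node-manager/config.ini
+Restart=always
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target
+</pre>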
diff --git a/doc/install/migrate-docker19.html.textile.liquid b/doc/install/migrate-docker19.html.textile.liquid
new file mode 100644 (file)
index 0000000..2dca039
--- /dev/null
@@ -0,0 +1,31 @@
+---
+layout: default
+navsection: installguide
+title: Migrating Docker images
+...
+
+If you have an existing Arvados installation using Docker 1.9 and wish to update to Docker 1.10+, you must migrate the Docker images stored in Keep.
+
+The @arv-migrate-docker19@ tool converts Docker images stored in Arvados from image format v1 (Docker <= 1.9) to image format v2 (Docker >= 1.10).
+
+The tool requires Docker to be running on the local host (either 1.9 or 1.10+).
+
+Usage:
+
+# Run @arvados/docker/migrate-docker19/build.sh@ to create the @arvados/migrate-docker19@ Docker image.
+# Set ARVADOS_API_HOST and ARVADOS_API_TOKEN to the cluster you want to migrate.
+# Run @arv-migrate-docker19@ from the Arvados Python SDK on the host (not in a container).
+
+This will query Arvados for v1 format Docker images.  For each image that does not already have a corresponding v2 format image (as indicated by a @docker_image_migration@ tag), it will perform the following steps:
+
+# Download the image from Arvados.
+# Load it into Docker.
+# Update the Docker version, which updates the image.
+# Save the v2 format image and upload it to Arvados.
+# Create a migration link.
+
+Once the Docker images in Keep have been migrated, upgrade the version of Docker used across the cluster.  Finally, update the API server configuration from "v1" to "v2" to reflect the supported Docker image version:
+
+<pre>
+docker_image_formats: ["v2"]
+</pre>
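+
+Putting the usage steps above together, a typical migration session looks roughly like this (a sketch; the host, token, and checkout path are placeholders):
+
+<pre>
+$ export ARVADOS_API_HOST=zzzzz.arvadosapi.com
+$ export ARVADOS_API_TOKEN=zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+$ cd arvados/docker/migrate-docker19
+$ ./build.sh                 # builds the arvados/migrate-docker19 image
+$ arv-migrate-docker19       # migrates each v1 image that lacks a v2 counterpart
+</pre>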
diff --git a/docker/migrate-docker19/Dockerfile b/docker/migrate-docker19/Dockerfile
new file mode 100644 (file)
index 0000000..7e921c8
--- /dev/null
@@ -0,0 +1,31 @@
+FROM debian:8
+
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt-key adv --keyserver pool.sks-keyservers.net --recv 1078ECD7 && \
+    gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
+    apt-key adv --keyserver hkp://pool.sks-keyservers.net:80 --recv-keys 58118E89F3A912897C070ADBF76221572C52609D || \
+    apt-key adv --keyserver hkp://pgp.mit.edu:80 --recv-keys 58118E89F3A912897C070ADBF76221572C52609D
+
+VOLUME /var/lib/docker
+
+RUN mkdir -p /etc/apt/sources.list.d && \
+    echo deb http://apt.arvados.org/ jessie main > /etc/apt/sources.list.d/apt.arvados.org.list && \
+    apt-get clean && \
+    apt-get update && \
+    apt-get install -yq --no-install-recommends -o Acquire::Retries=6 \
+        git curl python-arvados-python-client apt-transport-https ca-certificates && \
+    apt-get clean
+
+RUN echo deb https://apt.dockerproject.org/repo debian-jessie main > /etc/apt/sources.list.d/docker.list && \
+    apt-get update && \
+    apt-get install -yq --no-install-recommends -o Acquire::Retries=6 \
+        docker-engine=1.9.1-0~jessie && \
+    apt-get clean
+
+RUN mkdir /root/pkgs && \
+    cd /root/pkgs && \
+    curl -L -O https://apt.dockerproject.org/repo/pool/main/d/docker-engine/docker-engine_1.13.1-0~debian-jessie_amd64.deb && \
+    curl -L -O http://httpredir.debian.org/debian/pool/main/libt/libtool/libltdl7_2.4.2-1.11+b1_amd64.deb
+
+ADD migrate.sh dnd.sh /root/
diff --git a/docker/migrate-docker19/build.sh b/docker/migrate-docker19/build.sh
new file mode 100755 (executable)
index 0000000..3a36cd4
--- /dev/null
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec docker build -t arvados/migrate-docker19 .
diff --git a/docker/migrate-docker19/dnd.sh b/docker/migrate-docker19/dnd.sh
new file mode 100755 (executable)
index 0000000..ce72601
--- /dev/null
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# Taken from https://github.com/jpetazzo/dind
+
+exec 2>&1
+
+# Ensure that all nodes in /dev/mapper correspond to mapped devices currently loaded by the device-mapper kernel driver
+dmsetup mknodes
+
+: ${LOG:=stdio}
+
+# First, make sure that cgroups are mounted correctly.
+CGROUP=/sys/fs/cgroup
+[ -d $CGROUP ] || mkdir $CGROUP
+
+# Mount a tmpfs for the cgroup hierarchies if one is not already mounted here.
+if ! mountpoint -q $CGROUP ; then
+    mount -n -t tmpfs -o uid=0,gid=0,mode=0755 cgroup $CGROUP
+fi
+
+if ! mountpoint -q $CGROUP ; then
+    echo "Could not find or mount cgroups. Tried /sys/fs/cgroup and /cgroup.  Did you use --privileged?"
+    exit 1
+fi
+
+if [ -d /sys/kernel/security ] && ! mountpoint -q /sys/kernel/security
+then
+    mount -t securityfs none /sys/kernel/security || {
+        echo "Could not mount /sys/kernel/security."
+        echo "AppArmor detection and --privileged mode might break."
+    }
+fi
+
+# Mount the cgroup hierarchies exactly as they are in the parent system.
+for SUBSYS in $(cut -d: -f2 /proc/1/cgroup)
+do
+        [ -d $CGROUP/$SUBSYS ] || mkdir $CGROUP/$SUBSYS
+        mountpoint -q $CGROUP/$SUBSYS ||
+                mount -n -t cgroup -o $SUBSYS cgroup $CGROUP/$SUBSYS
+
+        # The two following sections address a bug which manifests itself
+        # by a cryptic "lxc-start: no ns_cgroup option specified" when
+        # trying to start containers within a container.
+        # The bug seems to appear when the cgroup hierarchies are not
+        # mounted on the exact same directories in the host, and in the
+        # container.
+
+        # Named, control-less cgroups are mounted with "-o name=foo"
+        # (and appear as such under /proc/<pid>/cgroup) but are usually
+        # mounted on a directory named "foo" (without the "name=" prefix).
+        # Systemd and OpenRC (and possibly others) both create such a
+        # cgroup. To avoid the aforementioned bug, we symlink "foo" to
+        # "name=foo". This shouldn't have any adverse effect.
+        echo $SUBSYS | grep -q ^name= && {
+                NAME=$(echo $SUBSYS | sed s/^name=//)
+                ln -s $SUBSYS $CGROUP/$NAME
+        }
+
+        # Likewise, on at least one system, it has been reported that
+        # systemd would mount the CPU and CPU accounting controllers
+        # (respectively "cpu" and "cpuacct") with "-o cpuacct,cpu"
+        # but on a directory called "cpu,cpuacct" (note the inversion
+        # in the order of the groups). This tries to work around it.
+        [ $SUBSYS = cpuacct,cpu ] && ln -s $SUBSYS $CGROUP/cpu,cpuacct
+done
+
+# Note: as I write those lines, the LXC userland tools cannot setup
+# a "sub-container" properly if the "devices" cgroup is not in its
+# own hierarchy. Let's detect this and issue a warning.
+grep -q :devices: /proc/1/cgroup ||
+       echo "WARNING: the 'devices' cgroup should be in its own hierarchy."
+grep -qw devices /proc/1/cgroup ||
+       echo "WARNING: it looks like the 'devices' cgroup is not mounted."
+
+# Now, close extraneous file descriptors.
+pushd /proc/self/fd >/dev/null
+for FD in *
+do
+       case "$FD" in
+       # Keep stdin/stdout/stderr
+       [012])
+               ;;
+       # Nuke everything else
+       *)
+               eval exec "$FD>&-"
+               ;;
+       esac
+done
+popd >/dev/null
+
+
+# If a pidfile is still around (for example after a container restart),
+# delete it so that docker can start.
+rm -rf /var/run/docker.pid
+
+read pid cmd state ppid pgrp session tty_nr tpgid rest < /proc/self/stat
+
+if ! docker daemon --storage-driver=overlay $DOCKER_DAEMON_ARGS ; then
+    docker daemon $DOCKER_DAEMON_ARGS
+fi
diff --git a/docker/migrate-docker19/migrate.sh b/docker/migrate-docker19/migrate.sh
new file mode 100755 (executable)
index 0000000..58d6665
--- /dev/null
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -e
+
+function cleanup {
+    kill $(cat /var/run/docker.pid)
+    sleep 1
+    rm -rf /var/lib/docker/*
+}
+
+trap cleanup EXIT
+
+/root/dnd.sh &
+sleep 2
+
+image_tar_keepref=$1
+image_id=$2
+image_repo=$3
+image_tag=$4
+project_uuid=$5
+
+arv-get $image_tar_keepref | docker load
+
+docker tag $image_id $image_repo:$image_tag
+
+docker images -a
+
+kill $(cat /var/run/docker.pid)
+sleep 1
+
+cd /root/pkgs
+dpkg -i libltdl7_2.4.2-1.11+b1_amd64.deb  docker-engine_1.13.1-0~debian-jessie_amd64.deb
+
+/root/dnd.sh &
+sleep 2
+
+docker images -a
+
+UUID=$(arv-keepdocker --force-image-format --project-uuid=$project_uuid $image_repo $image_tag)
+
+echo "Migrated uuid is $UUID"
index 0eeee57e7ffb36a4881b6029942b644297e5502e..651ebf20b0e96f13b4b32bb1b55855f0ac1076a9 100644 (file)
@@ -28,7 +28,7 @@ Gem::Specification.new do |s|
   # Our google-api-client dependency used to be < 0.9, but that could be
   # satisfied by the buggy 0.9.pre*.  https://dev.arvados.org/issues/9213
   s.add_runtime_dependency 'google-api-client', '~> 0.6', '>= 0.6.3', '<0.8.9'
-  s.add_runtime_dependency 'activesupport', '~> 3.2', '>= 3.2.13'
+  s.add_runtime_dependency 'activesupport', '>= 3.2.13', '< 5'
   s.add_runtime_dependency 'json', '~> 1.7', '>= 1.7.7'
   s.add_runtime_dependency 'trollop', '~> 2.0'
   s.add_runtime_dependency 'andand', '~> 1.3', '>= 1.3.3'
index f7a9dbe41a853412d84a1752ccae0637ef2250d4..a46d1e6ac0fba92d38acc16ec5b09c9893cf9b29 100644 (file)
@@ -1,5 +1,6 @@
 require 'minitest/autorun'
 require 'digest/md5'
+require 'active_support'
 require 'active_support/core_ext'
 require 'tempfile'
 
index f83add9379c176d5cb9ad472c0778c68174058b5..4fc8b8e7229783117b747c468919a786010d599a 100644 (file)
@@ -27,7 +27,7 @@ def arv_docker_get_image(api_client, dockerRequirement, pull_image, project_uuid
     global cached_lookups_lock
     with cached_lookups_lock:
         if dockerRequirement["dockerImageId"] in cached_lookups:
-            return cached_lookups[dockerRequirement["dockerImageId"]]
+            return dockerRequirement["dockerImageId"]
 
     with SourceLine(dockerRequirement, "dockerImageId", WorkflowException):
         sp = dockerRequirement["dockerImageId"].split(":")
@@ -63,12 +63,10 @@ def arv_docker_get_image(api_client, dockerRequirement, pull_image, project_uuid
         if not images:
             raise WorkflowException("Could not find Docker image %s:%s" % (image_name, image_tag))
 
-        pdh = api_client.collections().get(uuid=images[0][0]).execute()["portable_data_hash"]
-
         with cached_lookups_lock:
-            cached_lookups[dockerRequirement["dockerImageId"]] = pdh
+            cached_lookups[dockerRequirement["dockerImageId"]] = True
 
-        return pdh
+    return dockerRequirement["dockerImageId"]
 
 def arv_docker_clear_cache():
     global cached_lookups
index 68cb3273fd147d526d30801ba4955e26bfe08529..ad69371605a9ffaec4bfd1abb99296cfe565f489 100644 (file)
@@ -67,7 +67,7 @@ class TestContainer(unittest.TestCase):
                         'state': 'Committed',
                         'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
                         'output_path': '/var/spool/cwl',
-                        'container_image': '99999999999999999999999999999993+99',
+                        'container_image': 'arvados/jobs',
                         'command': ['ls', '/var/spool/cwl'],
                         'cwd': '/var/spool/cwl',
                         'scheduling_parameters': {},
@@ -138,7 +138,7 @@ class TestContainer(unittest.TestCase):
                 'state': 'Committed',
                 'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
                 'output_path': '/var/spool/cwl',
-                'container_image': '99999999999999999999999999999993+99',
+                'container_image': 'arvados/jobs',
                 'command': ['ls'],
                 'cwd': '/var/spool/cwl',
                 'scheduling_parameters': {
@@ -262,7 +262,7 @@ class TestContainer(unittest.TestCase):
                 'state': 'Committed',
                 'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
                 'output_path': '/var/spool/cwl',
-                'container_image': '99999999999999999999999999999993+99',
+                'container_image': 'arvados/jobs',
                 'command': ['ls'],
                 'cwd': '/var/spool/cwl',
                 'scheduling_parameters': {
index f24b1ed8142d3e3678ff819bfe4823ebe2add874..d1263e24f27b2e89b3ef386d3d35a28bb9341811 100644 (file)
@@ -15,6 +15,7 @@ from apiclient import errors as apiclient_errors
 import config
 import errors
 import util
+import cache
 
 _logger = logging.getLogger('arvados.api')
 
@@ -136,7 +137,7 @@ def http_cache(data_type):
         util.mkdir_dash_p(path)
     except OSError:
         path = None
-    return path
+    return cache.SafeHTTPCache(path, max_age=60*60*24*2)
 
 def api(version=None, cache=True, host=None, token=None, insecure=False, **kwargs):
     """Return an apiclient Resources object for an Arvados instance.
@@ -205,7 +206,7 @@ def api(version=None, cache=True, host=None, token=None, insecure=False, **kwarg
 
     kwargs['http'] = _patch_http_request(kwargs['http'], token)
 
-    svc = apiclient_discovery.build('arvados', version, **kwargs)
+    svc = apiclient_discovery.build('arvados', version, cache_discovery=False, **kwargs)
     svc.api_token = token
     svc.insecure = insecure
     kwargs['http'].max_request_size = svc._rootDesc.get('maxRequestSize', 0)
diff --git a/sdk/python/arvados/cache.py b/sdk/python/arvados/cache.py
new file mode 100644 (file)
index 0000000..08c19e4
--- /dev/null
@@ -0,0 +1,71 @@
+import errno
+import md5
+import os
+import tempfile
+import time
+
+class SafeHTTPCache(object):
+    """Thread-safe replacement for httplib2.FileCache"""
+
+    def __init__(self, path=None, max_age=None):
+        self._dir = path
+        if max_age is not None:
+            try:
+                self._clean(threshold=time.time() - max_age)
+            except:
+                pass
+
+    def _clean(self, threshold=0):
+        for ent in os.listdir(self._dir):
+            fnm = os.path.join(self._dir, ent)
+            if os.path.isdir(fnm) or not fnm.endswith('.tmp'):
+                continue
+            stat = os.lstat(fnm)
+            if stat.st_mtime < threshold:
+                try:
+                    os.unlink(fnm)
+                except OSError as err:
+                    if err.errno != errno.ENOENT:
+                        raise
+
+    def __str__(self):
+        return self._dir
+
+    def _filename(self, url):
+        return os.path.join(self._dir, md5.new(url).hexdigest()+'.tmp')
+
+    def get(self, url):
+        filename = self._filename(url)
+        try:
+            with open(filename, 'rb') as f:
+                return f.read()
+        except (IOError, OSError):
+            return None
+
+    def set(self, url, content):
+        try:
+            fd, tempname = tempfile.mkstemp(dir=self._dir)
+        except:
+            return None
+        try:
+            try:
+                f = os.fdopen(fd, 'w')
+            except:
+                os.close(fd)
+                raise
+            try:
+                f.write(content)
+            finally:
+                f.close()
+            os.rename(tempname, self._filename(url))
+            tempname = None
+        finally:
+            if tempname:
+                os.unlink(tempname)
+
+    def delete(self, url):
+        try:
+            os.unlink(self._filename(url))
+        except OSError as err:
+            if err.errno != errno.ENOENT:
+                raise
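httplib2 accepts any cache object that exposes get/set/delete, which is the interface SafeHTTPCache implements and the object that http_cache() in sdk/python/arvados/api.py now returns. A minimal usage sketch, not part of this change set, with an illustrative cache directory:

    import httplib2
    import arvados.cache

    # Hypothetical cache directory; http_cache() derives its own path.
    cache = arvados.cache.SafeHTTPCache('/tmp/arvados-http-cache',
                                        max_age=60*60*24*2)  # two days, as in http_cache()
    http = httplib2.Http(cache=cache)
    response, content = http.request('https://example.com/some/resource')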
index 098a11e1bb12141c9206edd492d1aa46aa4d0991..f26d3a3d27c0b221d269d3d4a1beb8775268f7e6 100644 (file)
@@ -1316,6 +1316,7 @@ class Collection(RichCollectionBase):
                 uuid=self._manifest_locator).execute(
                     num_retries=self.num_retries))
             self._manifest_text = self._api_response['manifest_text']
+            self._portable_data_hash = self._api_response['portable_data_hash']
             # If not overriden via kwargs, we should try to load the
             # replication_desired from the API server
             if self.replication_desired is None:
index 3ffb7f36b6b8f7962c711cc19e1f59460346a6ff..57832483236fe5e404e9c64b7d3158336e88cd2d 100644 (file)
@@ -11,6 +11,7 @@ import subprocess
 import sys
 import tarfile
 import tempfile
+import shutil
 import _strptime
 
 from operator import itemgetter
@@ -20,10 +21,17 @@ import arvados
 import arvados.util
 import arvados.commands._util as arv_cmd
 import arvados.commands.put as arv_put
+from arvados.collection import CollectionReader
 import ciso8601
+import logging
+import arvados.config
 
 from arvados._version import __version__
 
+logger = logging.getLogger('arvados.keepdocker')
+logger.setLevel(logging.DEBUG if arvados.config.get('ARVADOS_DEBUG')
+                else logging.INFO)
+
 EARLIEST_DATETIME = datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)
 STAT_CACHE_ERRORS = (IOError, OSError, ValueError)
 
@@ -103,15 +111,15 @@ def docker_image_format(image_hash):
 def docker_image_compatible(api, image_hash):
     supported = api._rootDesc.get('dockerImageFormats', [])
     if not supported:
-        print >>sys.stderr, "arv-keepdocker: warning: server does not specify supported image formats (see docker_image_formats in server config). Continuing."
-        return True
+        logger.warn("server does not specify supported image formats (see docker_image_formats in server config).")
+        return False
 
     fmt = docker_image_format(image_hash)
     if fmt in supported:
         return True
     else:
-        print >>sys.stderr, "arv-keepdocker: image format is {!r} " \
-            "but server supports only {!r}".format(fmt, supported)
+        logger.error("image format is {!r} " \
+            "but server supports only {!r}".format(fmt, supported))
         return False
 
 def docker_images():
@@ -330,62 +338,6 @@ def _uuid2pdh(api, uuid):
         select=['portable_data_hash'],
     ).execute()['items'][0]['portable_data_hash']
 
-_migration_link_class = 'docker_image_migration'
-_migration_link_name = 'migrate_1.9_1.10'
-def _migrate19_link(api, root_uuid, old_uuid, new_uuid):
-    old_pdh = _uuid2pdh(api, old_uuid)
-    new_pdh = _uuid2pdh(api, new_uuid)
-    if not api.links().list(filters=[
-            ['owner_uuid', '=', root_uuid],
-            ['link_class', '=', _migration_link_class],
-            ['name', '=', _migration_link_name],
-            ['tail_uuid', '=', old_pdh],
-            ['head_uuid', '=', new_pdh]]).execute()['items']:
-        print >>sys.stderr, 'Creating migration link {} -> {}: '.format(
-            old_pdh, new_pdh),
-        link = api.links().create(body={
-            'owner_uuid': root_uuid,
-            'link_class': _migration_link_class,
-            'name': _migration_link_name,
-            'tail_uuid': old_pdh,
-            'head_uuid': new_pdh,
-        }).execute()
-        print >>sys.stderr, '{}'.format(link['uuid'])
-        return link
-
-def migrate19():
-    api = arvados.api('v1')
-    user = api.users().current().execute()
-    if not user['is_admin']:
-        raise Exception("This command requires an admin token")
-    root_uuid = user['uuid'][:12] + '000000000000000'
-    new_image_uuids = {}
-    images = list_images_in_arv(api, 2)
-    is_new = lambda img: img['dockerhash'].startswith('sha256:')
-
-    count_new = 0
-    for uuid, img in images:
-        if not re.match(r'^[0-9a-f]{64}$', img["tag"]):
-            continue
-        key = (img["repo"], img["tag"])
-        if is_new(img) and key not in new_image_uuids:
-            count_new += 1
-            new_image_uuids[key] = uuid
-
-    count_migrations = 0
-    new_links = []
-    for uuid, img in images:
-        key = (img['repo'], img['tag'])
-        if not is_new(img) and key in new_image_uuids:
-            count_migrations += 1
-            link = _migrate19_link(api, root_uuid, uuid, new_image_uuids[key])
-            if link:
-                new_links.append(link)
-
-    print >>sys.stderr, "=== {} new-format images, {} migrations detected, " \
-        "{} links added.".format(count_new, count_migrations, len(new_links))
-    return new_links
-
 def main(arguments=None, stdout=sys.stdout):
     args = arg_parser.parse_args(arguments)
     api = arvados.api('v1')
@@ -393,8 +345,14 @@ def main(arguments=None, stdout=sys.stdout):
     if args.image is None or args.image == 'images':
         fmt = "{:30}  {:10}  {:12}  {:29}  {:20}\n"
         stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
-        for i, j in list_images_in_arv(api, args.retries):
-            stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
+        try:
+            for i, j in list_images_in_arv(api, args.retries):
+                stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
+        except IOError as e:
+            if e.errno == errno.EPIPE:
+                pass
+            else:
+                raise
         sys.exit(0)
 
     # Pull the image if requested, unless the image is specified as a hash
@@ -405,15 +363,15 @@ def main(arguments=None, stdout=sys.stdout):
     try:
         image_hash = find_one_image_hash(args.image, args.tag)
     except DockerError as error:
-        print >>sys.stderr, "arv-keepdocker:", error.message
+        logger.error(error.message)
         sys.exit(1)
 
     if not docker_image_compatible(api, image_hash):
         if args.force_image_format:
-            print >>sys.stderr, "arv-keepdocker: forcing incompatible image"
+            logger.warn("forcing incompatible image")
         else:
-            print >>sys.stderr, "arv-keepdocker: refusing to store " \
-                "incompatible format (use --force-image-format to override)"
+            logger.error("refusing to store " \
+                "incompatible format (use --force-image-format to override)")
             sys.exit(1)
 
     image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None
@@ -455,7 +413,7 @@ def main(arguments=None, stdout=sys.stdout):
                         api, args.retries,
                         filters=[['link_class', '=', 'docker_image_repo+tag'],
                                  ['name', '=', image_repo_tag],
-                                 ['head_uuid', 'in', collections]])
+                                 ['head_uuid', 'in', [c["uuid"] for c in collections]]])
                 else:
                     existing_repo_tag = []
 
diff --git a/sdk/python/arvados/commands/migrate19.py b/sdk/python/arvados/commands/migrate19.py
new file mode 100644 (file)
index 0000000..e75095b
--- /dev/null
@@ -0,0 +1,204 @@
+import argparse
+import time
+import sys
+import logging
+import shutil
+import tempfile
+import os
+import subprocess
+import re
+
+import arvados
+import arvados.commands.keepdocker
+from arvados._version import __version__
+from arvados.collection import CollectionReader
+
+logger = logging.getLogger('arvados.migrate-docker19')
+logger.setLevel(logging.DEBUG if arvados.config.get('ARVADOS_DEBUG')
+                else logging.INFO)
+
+_migration_link_class = 'docker_image_migration'
+_migration_link_name = 'migrate_1.9_1.10'
+
+class MigrationFailed(Exception):
+    pass
+
+def main(arguments=None):
+    """Docker image format migration tool for Arvados.
+
+    This converts Docker images stored in Arvados from image format v1
+    (Docker <= 1.9) to image format v2 (Docker >= 1.10).
+
+    Requires Docker running on the local host.
+
+    Usage:
+
+    1) Run arvados/docker/migrate-docker19/build.sh to create the
+    arvados/migrate-docker19 Docker image.
+
+    2) Set ARVADOS_API_HOST and ARVADOS_API_TOKEN to the cluster you want to migrate.
+
+    3) Run arv-migrate-docker19 from the Arvados Python SDK on the host (not in a container).
+
+    This will query Arvados for v1 format Docker images.  For each image that
+    does not already have a corresponding v2 format image (as indicated by a
+    docker_image_migration tag) it will perform the following process:
+
+    i) download the image from Arvados
+    ii) load it into Docker
+    iii) update the Docker version, which updates the image
+    iv) save the v2 format image and upload to Arvados
+    v) create a migration link
+
+    """
+
+    migrate19_parser = argparse.ArgumentParser()
+    migrate19_parser.add_argument(
+        '--version', action='version', version="%s %s" % (sys.argv[0], __version__),
+        help='Print version and exit.')
+
+    exgroup = migrate19_parser.add_mutually_exclusive_group()
+    exgroup.add_argument(
+        '--dry-run', action='store_true', help="Print number of pending migrations.")
+    exgroup.add_argument(
+        '--print-unmigrated', action='store_true',
+        default=False, help="Print list of images needing migration.")
+
+    migrate19_parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
+                                  default=None, help="List of images to be migrated")
+
+    args = migrate19_parser.parse_args(arguments)
+
+    only_migrate = None
+    if args.infile:
+        only_migrate = set()
+        for l in args.infile:
+            only_migrate.add(l.strip())
+
+    api_client  = arvados.api()
+
+    user = api_client.users().current().execute()
+    if not user['is_admin']:
+        raise Exception("This command requires an admin token")
+    sys_uuid = user['uuid'][:12] + '000000000000000'
+
+    images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3)
+
+    is_new = lambda img: img['dockerhash'].startswith('sha256:')
+
+    count_new = 0
+    old_images = []
+    for uuid, img in images:
+        if img["dockerhash"].startswith("sha256:"):
+            continue
+        key = (img["repo"], img["tag"], img["timestamp"])
+        old_images.append(img)
+
+    migration_links = arvados.util.list_all(api_client.links().list, filters=[
+        ['link_class', '=', _migration_link_class],
+        ['name', '=', _migration_link_name],
+    ])
+
+    already_migrated = set()
+    for m in migration_links:
+        already_migrated.add(m["tail_uuid"])
+
+    items = arvados.util.list_all(api_client.collections().list,
+                                  filters=[["uuid", "in", [img["collection"] for img in old_images]]],
+                                  select=["uuid", "portable_data_hash"])
+    uuid_to_pdh = {i["uuid"]: i["portable_data_hash"] for i in items}
+
+    need_migrate = {}
+    for img in old_images:
+        pdh = uuid_to_pdh[img["collection"]]
+        if pdh not in already_migrated and (only_migrate is None or pdh in only_migrate):
+            need_migrate[pdh] = img
+
+    if args.print_unmigrated:
+        only_migrate = set()
+        for pdh in need_migrate:
+            print pdh
+        return
+
+    logger.info("Already migrated %i images", len(already_migrated))
+    logger.info("Need to migrate %i images", len(need_migrate))
+
+    if args.dry_run:
+        return
+
+    success = []
+    failures = []
+    count = 1
+    for old_image in need_migrate.values():
+        if uuid_to_pdh[old_image["collection"]] in already_migrated:
+            continue
+
+        logger.info("[%i/%i] Migrating %s:%s (%s)", count, len(need_migrate), old_image["repo"], old_image["tag"], old_image["collection"])
+        count += 1
+        start = time.time()
+
+        oldcol = CollectionReader(old_image["collection"])
+        tarfile = oldcol.keys()[0]
+
+        varlibdocker = tempfile.mkdtemp()
+        try:
+            with tempfile.NamedTemporaryFile() as envfile:
+                envfile.write("ARVADOS_API_HOST=%s\n" % (os.environ["ARVADOS_API_HOST"]))
+                envfile.write("ARVADOS_API_TOKEN=%s\n" % (os.environ["ARVADOS_API_TOKEN"]))
+                if "ARVADOS_API_HOST_INSECURE" in os.environ:
+                    envfile.write("ARVADOS_API_HOST_INSECURE=%s\n" % (os.environ["ARVADOS_API_HOST_INSECURE"]))
+                envfile.flush()
+
+                dockercmd = ["docker", "run",
+                             "--privileged",
+                             "--rm",
+                             "--env-file", envfile.name,
+                             "--volume", "%s:/var/lib/docker" % varlibdocker,
+                             "arvados/migrate-docker19",
+                             "/root/migrate.sh",
+                             "%s/%s" % (old_image["collection"], tarfile),
+                             tarfile[0:40],
+                             old_image["repo"],
+                             old_image["tag"],
+                             oldcol.api_response()["owner_uuid"]]
+
+                proc = subprocess.Popen(dockercmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                out, err = proc.communicate()
+
+                if proc.returncode != 0:
+                    logger.error("Failed with return code %i", proc.returncode)
+                    logger.error("--- Stdout ---\n%s", out)
+                    logger.error("--- Stderr ---\n%s", err)
+                    raise MigrationFailed()
+
+            migrated = re.search(r"Migrated uuid is ([a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15})", out)
+            if migrated:
+                newcol = CollectionReader(migrated.group(1))
+
+                api_client.links().create(body={"link": {
+                    'owner_uuid': sys_uuid,
+                    'link_class': _migration_link_class,
+                    'name': _migration_link_name,
+                    'tail_uuid': oldcol.portable_data_hash(),
+                    'head_uuid': newcol.portable_data_hash()
+                    }}).execute(num_retries=3)
+
+                logger.info("Migrated '%s' (%s) to '%s' (%s) in %is",
+                            oldcol.portable_data_hash(), old_image["collection"],
+                            newcol.portable_data_hash(), migrated.group(1),
+                            time.time() - start)
+                already_migrated.add(oldcol.portable_data_hash())
+                success.append(old_image["collection"])
+            else:
+                logger.error("Error migrating '%s'", old_image["collection"])
+                failures.append(old_image["collection"])
+        except Exception as e:
+            logger.error("Failed to migrate %s in %is", old_image["collection"], time.time() - start,
+                         exc_info=(not isinstance(e, MigrationFailed)))
+            failures.append(old_image["collection"])
+        finally:
+            shutil.rmtree(varlibdocker)
+
+    logger.info("Successfully migrated %i images", len(success))
+    if failures:
+        logger.error("Failed to migrate %i images", len(failures))
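The bin/arv-migrate-docker19 entry point (next hunk) simply calls main(). As a rough sketch, the options defined by the argparse parser above can also be exercised programmatically; this assumes admin credentials in ARVADOS_API_HOST/ARVADOS_API_TOKEN:

    from arvados.commands.migrate19 import main

    main(['--dry-run'])           # log how many images are migrated / still pending
    main(['--print-unmigrated'])  # print the portable data hashes that still need migration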
index fcef8d744d813be3c9b5b285635d01b1bf97ff1f..8d928a1c69ab41f350a2dfab1a8c34c2c83cf880 100755 (executable)
@@ -1,4 +1,4 @@
 #!/usr/bin/env python
 
-from arvados.commands.keepdocker import migrate19
-migrate19()
+from arvados.commands.migrate19 import main
+main()
index 776ff728cb164113c056362bc28ed58e09d34320..d10e60c22fef1009179c90da126d098a2fdc9c56 100644 (file)
@@ -34,9 +34,19 @@ import arvados.config
 ARVADOS_DIR = os.path.realpath(os.path.join(MY_DIRNAME, '../../..'))
 SERVICES_SRC_DIR = os.path.join(ARVADOS_DIR, 'services')
 if 'GOPATH' in os.environ:
+    # Add all GOPATH bin dirs to PATH -- but insert them after the
+    # ruby gems bin dir, to ensure "bundle" runs the Ruby bundler
+    # command, not the golang.org/x/tools/cmd/bundle command.
     gopaths = os.environ['GOPATH'].split(':')
-    gobins = [os.path.join(path, 'bin') for path in gopaths]
-    os.environ['PATH'] = ':'.join(gobins) + ':' + os.environ['PATH']
+    addbins = [os.path.join(path, 'bin') for path in gopaths]
+    newbins = []
+    for path in os.environ['PATH'].split(':'):
+        newbins.append(path)
+        if os.path.exists(os.path.join(path, 'bundle')):
+            newbins += addbins
+            addbins = []
+    newbins += addbins
+    os.environ['PATH'] = ':'.join(newbins)
 
 TEST_TMPDIR = os.path.join(ARVADOS_DIR, 'tmp')
 if not os.path.exists(TEST_TMPDIR):
@@ -229,8 +239,9 @@ def run(leave_running_atexit=False):
     # This will clear cached docs that belong to other processes (like
     # concurrent test suites) even if they're still running. They should
     # be able to tolerate that.
-    for fn in glob.glob(os.path.join(arvados.http_cache('discovery'),
-                                     '*,arvados,v1,rest,*')):
+    for fn in glob.glob(os.path.join(
+            str(arvados.http_cache('discovery')),
+            '*,arvados,v1,rest,*')):
         os.unlink(fn)
 
     pid_file = _pidfile('api')
index 9fcf2f1bab80513fb57e38eeed4703793d03a35f..a25ad1660912b08ebc825d795ba7c23626be3d89 100644 (file)
@@ -10,6 +10,7 @@ import subprocess
 import sys
 import tempfile
 import unittest
+import logging
 
 import arvados.commands.keepdocker as arv_keepdocker
 import arvados_testutil as tutil
@@ -21,39 +22,28 @@ class StopTest(Exception):
 
 
 class ArvKeepdockerTestCase(unittest.TestCase):
-    def run_arv_keepdocker(self, args):
+    def run_arv_keepdocker(self, args, err):
         sys.argv = ['arv-keepdocker'] + args
-        return arv_keepdocker.main()
+        log_handler = logging.StreamHandler(err)
+        arv_keepdocker.logger.addHandler(log_handler)
+        try:
+            return arv_keepdocker.main()
+        finally:
+            arv_keepdocker.logger.removeHandler(log_handler)
 
     def test_unsupported_arg(self):
         with self.assertRaises(SystemExit):
-            self.run_arv_keepdocker(['-x=unknown'])
+            self.run_arv_keepdocker(['-x=unknown'], sys.stderr)
 
     def test_version_argument(self):
         err = io.BytesIO()
         out = io.BytesIO()
         with tutil.redirected_streams(stdout=out, stderr=err):
             with self.assertRaises(SystemExit):
-                self.run_arv_keepdocker(['--version'])
+                self.run_arv_keepdocker(['--version'], sys.stderr)
         self.assertEqual(out.getvalue(), '')
         self.assertRegexpMatches(err.getvalue(), "[0-9]+\.[0-9]+\.[0-9]+")
 
-    def test_migrate19(self):
-        try:
-            sys.argv = ['arv-migrate-docker19']
-
-            added = arv_keepdocker.migrate19()
-            self.assertEqual(len(added), 1)
-            self.assertEqual(added[0]['link_class'], 'docker_image_migration')
-            self.assertEqual(added[0]['name'], 'migrate_1.9_1.10')
-            self.assertEqual(added[0]['tail_uuid'], 'fa3c1a9cb6783f85f2ecda037e07b8c3+167')
-            self.assertEqual(added[0]['head_uuid'], 'd740a57097711e08eb9b2a93518f20ab+174')
-
-            added = arv_keepdocker.migrate19()
-            self.assertEqual(added, [])
-        finally:
-            run_test_server.reset()
-
     @mock.patch('arvados.commands.keepdocker.find_image_hashes',
                 return_value=['abc123'])
     @mock.patch('arvados.commands.keepdocker.find_one_image_hash',
@@ -64,9 +54,9 @@ class ArvKeepdockerTestCase(unittest.TestCase):
         for supported, img_id, expect_ok in [
                 (['v1'], old_id, True),
                 (['v1'], new_id, False),
-                (None, old_id, True),
-                ([], old_id, True),
-                ([], new_id, True),
+                (None, old_id, False),
+                ([], old_id, False),
+                ([], new_id, False),
                 (['v1', 'v2'], new_id, True),
                 (['v1'], new_id, False),
                 (['v2'], new_id, True)]:
@@ -80,7 +70,7 @@ class ArvKeepdockerTestCase(unittest.TestCase):
             err = io.BytesIO()
             out = io.BytesIO()
 
-            with tutil.redirected_streams(stdout=out, stderr=err), \
+            with tutil.redirected_streams(stdout=out), \
                  mock.patch('arvados.api') as api, \
                  mock.patch('arvados.commands.keepdocker.popen_docker',
                             return_value=subprocess.Popen(
@@ -91,7 +81,7 @@ class ArvKeepdockerTestCase(unittest.TestCase):
                  self.assertRaises(StopTest if expect_ok else SystemExit):
 
                 api()._rootDesc = fakeDD
-                self.run_arv_keepdocker(['--force', 'testimage'])
+                self.run_arv_keepdocker(['--force', 'testimage'], err)
 
             self.assertEqual(out.getvalue(), '')
             if expect_ok:
@@ -112,7 +102,7 @@ class ArvKeepdockerTestCase(unittest.TestCase):
         fakeDD['dockerImageFormats'] = ['v1']
         err = io.BytesIO()
         out = io.BytesIO()
-        with tutil.redirected_streams(stdout=out, stderr=err), \
+        with tutil.redirected_streams(stdout=out), \
              mock.patch('arvados.api') as api, \
              mock.patch('arvados.commands.keepdocker.popen_docker',
                         return_value=subprocess.Popen(
@@ -123,5 +113,5 @@ class ArvKeepdockerTestCase(unittest.TestCase):
              self.assertRaises(StopTest):
             api()._rootDesc = fakeDD
             self.run_arv_keepdocker(
-                ['--force', '--force-image-format', 'testimage'])
+                ['--force', '--force-image-format', 'testimage'], err)
         self.assertRegexpMatches(err.getvalue(), "forcing incompatible image")
diff --git a/sdk/python/tests/test_cache.py b/sdk/python/tests/test_cache.py
new file mode 100644 (file)
index 0000000..baa60bf
--- /dev/null
@@ -0,0 +1,82 @@
+from __future__ import print_function
+
+import md5
+import mock
+import shutil
+import random
+import sys
+import tempfile
+import threading
+import unittest
+
+import arvados.cache
+import run_test_server
+
+
+def _random(n):
+    return bytearray(random.getrandbits(8) for _ in xrange(n))
+
+
+class CacheTestThread(threading.Thread):
+    def __init__(self, dir):
+        super(CacheTestThread, self).__init__()
+        self._dir = dir
+
+    def run(self):
+        c = arvados.cache.SafeHTTPCache(self._dir)
+        url = 'http://example.com/foo'
+        self.ok = True
+        for x in range(16):
+            try:
+                data_in = _random(128)
+                data_in = md5.new(data_in).hexdigest() + "\n" + str(data_in)
+                c.set(url, data_in)
+                data_out = c.get(url)
+                digest, content = data_out.split("\n", 1)
+                if digest != md5.new(content).hexdigest():
+                    self.ok = False
+            except Exception as err:
+                self.ok = False
+                print("cache failed: {}".format(err), file=sys.stderr)
+
+
+class CacheTest(unittest.TestCase):
+    def setUp(self):
+        self._dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self._dir)
+
+    def test_cache_crud(self):
+        c = arvados.cache.SafeHTTPCache(self._dir, max_age=0)
+        url = 'https://example.com/foo?bar=baz'
+        data1 = _random(256)
+        data2 = _random(128)
+        self.assertEqual(None, c.get(url))
+        c.delete(url)
+        c.set(url, data1)
+        self.assertEqual(data1, c.get(url))
+        c.delete(url)
+        self.assertEqual(None, c.get(url))
+        c.set(url, data1)
+        c.set(url, data2)
+        self.assertEqual(data2, c.get(url))
+
+    def test_cache_threads(self):
+        threads = []
+        for _ in range(64):
+            t = CacheTestThread(dir=self._dir)
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()
+            self.assertTrue(t.ok)
+
+
+class CacheIntegrationTest(run_test_server.TestCaseWithServers):
+    MAIN_SERVER = {}
+
+    def test_cache_used_by_default_client(self):
+        with mock.patch('arvados.cache.SafeHTTPCache.get') as getter:
+            arvados.api('v1')._rootDesc.get('foobar')
+            getter.assert_called()
index 5d91a81074cdfe9e75df182132af4b17f1ff85e3..ded28c7109673b4244d46b3ae0ce76ffcfa27e79 100644 (file)
@@ -91,9 +91,25 @@ class Arvados::V1::GroupsController < ApplicationController
       end
     end
 
+    seen_last_class = false
     klasses.each do |klass|
+      @offset = 0 if seen_last_class  # reset the offset for the next klass being processed
+
+      # when params['last_object_class'] is blank or matches the current klass, note that we have reached it
+      seen_last_class = true if((params['count'].andand.==('none')) and
+                                (params['last_object_class'].nil? or
+                                 params['last_object_class'].empty? or
+                                 params['last_object_class'] == klass.to_s))
+
+      # if klasses are specified, skip all other klass types
       next if wanted_klasses.any? and !wanted_klasses.include?(klass.to_s)
 
+      # don't reprocess klass types that were already seen
+      next if params['count'] == 'none' and !seen_last_class
+
+      # don't process the remaining object types if we already have the needed number of objects
+      break if params['count'] == 'none' and all_objects.size >= limit_all
+
       # If the currently requested orders specifically match the
       # table_name for the current klass, apply that order.
       # Otherwise, order by recency.
index 9b081dbd2e6586a0e3822f796c2eebfc7bfb5e6d..6d1a0d5b55f595fcaa30baf9937432eb88bc923c 100644 (file)
@@ -316,8 +316,53 @@ class Collection < ArvadosModel
     [hash_part, size_part].compact.join '+'
   end
 
-  # Return array of Collection objects
-  def self.find_all_for_docker_image(search_term, search_tag=nil, readers=nil)
+  def self.get_compatible_images(readers, pattern, collections)
+    if collections.empty?
+      return []
+    end
+
+    migrations = Hash[
+      Link.where('tail_uuid in (?) AND link_class=? AND links.owner_uuid=?',
+                 collections.map(&:portable_data_hash),
+                 'docker_image_migration',
+                 system_user_uuid).
+      order('links.created_at asc').
+      map { |l|
+        [l.tail_uuid, l.head_uuid]
+      }]
+
+    migrated_collections = Hash[
+      Collection.readable_by(*readers).
+      where('portable_data_hash in (?)', migrations.values).
+      map { |c|
+        [c.portable_data_hash, c]
+      }]
+
+    collections.map { |c|
+      # Check if the listed image is compatible first, if not, then try the
+      # migration link.
+      manifest = Keep::Manifest.new(c.manifest_text)
+      if manifest.exact_file_count?(1) and manifest.files[0][1] =~ pattern
+        c
+      elsif m = migrated_collections[migrations[c.portable_data_hash]]
+        manifest = Keep::Manifest.new(m.manifest_text)
+        if manifest.exact_file_count?(1) and manifest.files[0][1] =~ pattern
+          m
+        end
+      end
+    }.compact
+  end
+
+  # Resolve a Docker repo+tag, hash, or collection PDH to an array of
+  # Collection objects, sorted by timestamp starting with the most recent
+  # match.
+  #
+  # If filter_compatible_format is true (the default), only return image
+  # collections which are supported by the installation, as indicated by
+  # Rails.configuration.docker_image_formats.  Follows
+  # 'docker_image_migration' links if search_term resolves to an incompatible
+  # image but an equivalent compatible image is available.
+  def self.find_all_for_docker_image(search_term, search_tag=nil, readers=nil, filter_compatible_format: true)
     readers ||= [Thread.current[:user]]
     base_search = Link.
       readable_by(*readers).
@@ -325,20 +370,23 @@ class Collection < ArvadosModel
       joins("JOIN collections ON links.head_uuid = collections.uuid").
       order("links.created_at DESC")
 
+    if (Rails.configuration.docker_image_formats.include? 'v1' and
+        Rails.configuration.docker_image_formats.include? 'v2') or filter_compatible_format == false
+      pattern = /^(sha256:)?[0-9A-Fa-f]{64}\.tar$/
+    elsif Rails.configuration.docker_image_formats.include? 'v2'
+      pattern = /^(sha256:)[0-9A-Fa-f]{64}\.tar$/
+    elsif Rails.configuration.docker_image_formats.include? 'v1'
+      pattern = /^[0-9A-Fa-f]{64}\.tar$/
+    else
+      raise "Unrecognized configuration for docker_image_formats #{Rails.configuration.docker_image_formats}"
+    end
+
     # If the search term is a Collection locator that contains one file
     # that looks like a Docker image, return it.
     if loc = Keep::Locator.parse(search_term)
       loc.strip_hints!
-      coll_match = readable_by(*readers).where(portable_data_hash: loc.to_s).limit(1).first
-      if coll_match
-        # Check if the Collection contains exactly one file whose name
-        # looks like a saved Docker image.
-        manifest = Keep::Manifest.new(coll_match.manifest_text)
-        if manifest.exact_file_count?(1) and
-            (manifest.files[0][1] =~ /^(sha256:)?[0-9A-Fa-f]{64}\.tar$/)
-          return [coll_match]
-        end
-      end
+      coll_match = readable_by(*readers).where(portable_data_hash: loc.to_s).limit(1)
+      return get_compatible_images(readers, pattern, coll_match)
     end
 
     if search_tag.nil? and (n = search_term.index(":"))
@@ -362,34 +410,25 @@ class Collection < ArvadosModel
     # so that anything with an image timestamp is considered more recent than
     # anything without; then we use the link's created_at as a tiebreaker.
     uuid_timestamps = {}
-    matches.all.map do |link|
+    matches.each do |link|
       uuid_timestamps[link.head_uuid] = [(-link.properties["image_timestamp"].to_datetime.to_i rescue 0),
        -link.created_at.to_i]
+    end
+
+    sorted = Collection.where('uuid in (?)', uuid_timestamps.keys).sort_by { |c|
+      uuid_timestamps[c.uuid]
+    }
+    compatible = get_compatible_images(readers, pattern, sorted)
+    if sorted.length > 0 and compatible.empty?
+      raise ArvadosModel::UnresolvableContainerError.new "Matching Docker image is incompatible with 'docker_image_formats' configuration."
     end
-    Collection.where('uuid in (?)', uuid_timestamps.keys).sort_by { |c| uuid_timestamps[c.uuid] }
+    compatible
   end
 
   def self.for_latest_docker_image(search_term, search_tag=nil, readers=nil)
     find_all_for_docker_image(search_term, search_tag, readers).first
   end
 
-  # If the given pdh is an old-format docker image, old-format images
-  # aren't supported by the compute nodes according to site config,
-  # and a migrated new-format image is available, return the migrated
-  # image's pdh. Otherwise, just return pdh.
-  def self.docker_migration_pdh(read_users, pdh)
-    if Rails.configuration.docker_image_formats.include?('v1')
-      return pdh
-    end
-    Collection.readable_by(*read_users).
-      joins('INNER JOIN links ON head_uuid=portable_data_hash').
-      where('tail_uuid=? AND link_class=? AND links.owner_uuid=?',
-            pdh, 'docker_image_migration', system_user_uuid).
-      order('links.created_at desc').
-      select('portable_data_hash').
-      first.andand.portable_data_hash || pdh
-  end
-
   def self.searchable_columns operator
     super - ["manifest_text"]
   end
index 6bb8fb003515881747ce8327d69e3873c8c8f236..87c3ebed30184f822839b35443e1d7808e62fa10 100644 (file)
@@ -235,7 +235,7 @@ class ContainerRequest < ArvadosModel
     if !coll
       raise ArvadosModel::UnresolvableContainerError.new "docker image #{container_image.inspect} not found"
     end
-    return Collection.docker_migration_pdh([current_user], coll.portable_data_hash)
+    coll.portable_data_hash
   end
 
   def set_container
index 80a03e21ab37e95582aa64b14e3ca15989f77bcf..2fdfbb1ecf8d15a0e2fad39e86158ae063a67c21 100644 (file)
@@ -149,7 +149,7 @@ class Job < ArvadosModel
         image_hashes = Array.wrap(operand).flat_map do |search_term|
           image_search, image_tag = search_term.split(':', 2)
           Collection.
-            find_all_for_docker_image(image_search, image_tag, read_users).
+            find_all_for_docker_image(image_search, image_tag, read_users, filter_compatible_format: false).
             map(&:portable_data_hash)
         end
         filters << [attr, operator.sub(/ docker$/, ""), image_hashes]
@@ -217,8 +217,7 @@ class Job < ArvadosModel
       else
         image_locator = nil
       end
-      filters << ["docker_image_locator", "=",
-                  Collection.docker_migration_pdh(read_users, image_locator)]
+      filters << ["docker_image_locator", "=", image_locator]
       if sdk_version = attrs[:runtime_constraints].andand["arvados_sdk_version"]
         filters += default_git_filters("arvados_sdk_version", "arvados", sdk_version)
       end
@@ -440,12 +439,6 @@ class Job < ArvadosModel
         [false, "not found for #{image_search}"]
       end
     end
-    Rails.logger.info("docker_image_locator is #{docker_image_locator}")
-    if docker_image_locator && docker_image_locator_changed?
-      self.docker_image_locator =
-        Collection.docker_migration_pdh([current_user], docker_image_locator)
-    end
-    Rails.logger.info("docker_image_locator is #{docker_image_locator}")
   end
 
   def permission_to_update
index 077c23762a111937ad4a70299cc061080d8afdf1..cae6bbdf174468d719d2a0b2dc61d757a78f10bd 100644 (file)
@@ -463,3 +463,4 @@ test:
   git_internal_dir: <%= Rails.root.join 'tmp', 'internal.git' %>
   websocket_address: <% if ENV['ARVADOS_TEST_EXPERIMENTAL_WS'] %>"wss://0.0.0.0:<%= ENV['ARVADOS_TEST_WSS_PORT'] %>/websocket"<% else %>false<% end %>
   trash_sweep_interval: -1
+  docker_image_formats: ["v1"]
index 21843de67e64cc045c71d73e8085e4ca063f800e..bea1657de22b72c0d5296a4c571afcee3ffc0993 100644 (file)
@@ -963,8 +963,11 @@ class CrunchDispatch
   # An array of job_uuids in squeue
   def squeue_jobs
     if Rails.configuration.crunch_job_wrapper == :slurm_immediate
-      File.popen(['squeue', '-a', '-h', '-o', '%j']).readlines.map do |line|
-        line.strip
+      p = IO.popen(['squeue', '-a', '-h', '-o', '%j'])
+      begin
+        p.readlines.map {|line| line.strip}
+      ensure
+        p.close
       end
     else
       []
@@ -973,7 +976,9 @@ class CrunchDispatch
 
   def scancel slurm_name
     cmd = sudo_preface + ['scancel', '-n', slurm_name]
-    puts File.popen(cmd).read
+    IO.popen(cmd) do |scancel_pipe|
+      puts scancel_pipe.read
+    end
     if not $?.success?
       Rails.logger.error "scancel #{slurm_name.shellescape}: $?"
     end
index 2b1b675323fa8a05cdd9682a176e3ec561a099ad..b0660f96fc6a9b3f942c9b66241ab2f82d1cdd1b 100644 (file)
@@ -95,4 +95,56 @@ class Arvados::V1::FiltersTest < ActionController::TestCase
       end
     end
   end
+
+  test "full text search with count='none'" do
+    @controller = Arvados::V1::GroupsController.new
+    authorize_with :admin
+
+    get :contents, {
+      format: :json,
+      count: 'none',
+      limit: 1000,
+      filters: [['any', '@@', Rails.configuration.uuid_prefix]],
+    }
+
+    assert_response :success
+
+    all_objects = Hash.new(0)
+    json_response['items'].map{|o| o['kind']}.each{|t| all_objects[t] += 1}
+
+    assert_equal true, all_objects['arvados#group']>0
+    assert_equal true, all_objects['arvados#job']>0
+    assert_equal true, all_objects['arvados#pipelineInstance']>0
+    assert_equal true, all_objects['arvados#pipelineTemplate']>0
+
+    # Perform the test again, mimicking a second page request with:
+    # last_object_class = PipelineInstance
+    #   hence groups and jobs should not be included in the response
+    # offset = 5, which means the first 5 pipeline instances were already received on page 1
+    #   hence the remaining pipeline instances and all other object types should be included in the response
+
+    @test_counter = 0  # Reset executed action counter
+
+    @controller = Arvados::V1::GroupsController.new
+
+    get :contents, {
+      format: :json,
+      count: 'none',
+      limit: 1000,
+      offset: '5',
+      last_object_class: 'PipelineInstance',
+      filters: [['any', '@@', Rails.configuration.uuid_prefix]],
+    }
+
+    assert_response :success
+
+    second_page = Hash.new(0)
+    json_response['items'].map{|o| o['kind']}.each{|t| second_page[t] += 1}
+
+    assert_equal false, second_page.include?('arvados#group')
+    assert_equal false, second_page.include?('arvados#job')
+    assert_equal true, second_page['arvados#pipelineInstance']>0
+    assert_equal all_objects['arvados#pipelineInstance'], second_page['arvados#pipelineInstance']+5
+    assert_equal true, second_page['arvados#pipelineTemplate']>0
+  end
 end
index 328273bfc25ae1cc2b494b1f43e1e32503691708..af1d4b25fdeed5b26d55b17affeb3c552ab01646 100644 (file)
@@ -395,7 +395,8 @@ class ContainerRequestTest < ActiveSupport::TestCase
 
   test "container_image_for_container(pdh)" do
     set_user_from_auth :active
-    [:docker_image, :docker_image_1_12].each do |coll|
+    [[:docker_image, 'v1'], [:docker_image_1_12, 'v2']].each do |coll, ver|
+      Rails.configuration.docker_image_formats = [ver]
       pdh = collections(coll).portable_data_hash
       cr = ContainerRequest.new(container_image: pdh)
       resolved = cr.send :container_image_for_container
@@ -420,6 +421,8 @@ class ContainerRequestTest < ActiveSupport::TestCase
     Rails.configuration.docker_image_formats = ['v2']
     add_docker19_migration_link
 
+    # Test that it returns only v2 images even though the request is for a v1 image.
+
     set_user_from_auth :active
     cr = create_minimal_req!(command: ["true", "1"],
                              container_image: collections(:docker_image).portable_data_hash)
@@ -432,6 +435,55 @@ class ContainerRequestTest < ActiveSupport::TestCase
                  collections(:docker_image_1_12).portable_data_hash)
   end
 
+  test "use unmigrated docker image" do
+    Rails.configuration.docker_image_formats = ['v1']
+    add_docker19_migration_link
+
+    # Test that it returns only supported v1 images even though there is a
+    # migration link.
+
+    set_user_from_auth :active
+    cr = create_minimal_req!(command: ["true", "1"],
+                             container_image: collections(:docker_image).portable_data_hash)
+    assert_equal(cr.send(:container_image_for_container),
+                 collections(:docker_image).portable_data_hash)
+
+    cr = create_minimal_req!(command: ["true", "2"],
+                             container_image: links(:docker_image_collection_tag).name)
+    assert_equal(cr.send(:container_image_for_container),
+                 collections(:docker_image).portable_data_hash)
+  end
+
+  test "incompatible docker image v1" do
+    Rails.configuration.docker_image_formats = ['v1']
+    add_docker19_migration_link
+
+    # Don't return an unsupported v2 image even if we ask for it directly.
+    set_user_from_auth :active
+    cr = create_minimal_req!(command: ["true", "1"],
+                             container_image: collections(:docker_image_1_12).portable_data_hash)
+    assert_raises(ArvadosModel::UnresolvableContainerError) do
+      cr.send(:container_image_for_container)
+    end
+  end
+
+  test "incompatible docker image v2" do
+    Rails.configuration.docker_image_formats = ['v2']
+    # With no migration link present, don't return the unsupported v1 image.
+
+    set_user_from_auth :active
+    cr = create_minimal_req!(command: ["true", "1"],
+                             container_image: collections(:docker_image).portable_data_hash)
+    assert_raises(ArvadosModel::UnresolvableContainerError) do
+      cr.send(:container_image_for_container)
+    end
+    cr = create_minimal_req!(command: ["true", "2"],
+                             container_image: links(:docker_image_collection_tag).name)
+    assert_raises(ArvadosModel::UnresolvableContainerError) do
+      cr.send(:container_image_for_container)
+    end
+  end
+
   test "requestor can retrieve container owned by dispatch" do
     assert_not_empty Container.readable_by(users(:admin)).where(uuid: containers(:running).uuid)
     assert_not_empty Container.readable_by(users(:active)).where(uuid: containers(:running).uuid)
index 4646f7afd151c1b6a0467521f2ff08841791c199..d091847df2a20a19bbf08d55a5a9c8e511b8d7b9 100644 (file)
@@ -208,14 +208,14 @@ class CrunchDispatchTest < ActiveSupport::TestCase
     act_as_system_user do
       dispatch = CrunchDispatch.new
 
-      squeue_resp = File.popen("echo zzzzz-8i9sb-pshmckwoma9plh7\necho thisisnotvalidjobuuid\necho zzzzz-8i9sb-4cf0abc123e809j\n")
-      scancel_resp = File.popen("true")
+      squeue_resp = IO.popen("echo zzzzz-8i9sb-pshmckwoma9plh7\necho thisisnotvalidjobuuid\necho zzzzz-8i9sb-4cf0abc123e809j\n")
+      scancel_resp = IO.popen("true")
 
-      File.expects(:popen).
+      IO.expects(:popen).
         with(['squeue', '-a', '-h', '-o', '%j']).
         returns(squeue_resp)
 
-      File.expects(:popen).
+      IO.expects(:popen).
         with(dispatch.sudo_preface + ['scancel', '-n', 'zzzzz-8i9sb-4cf0abc123e809j']).
         returns(scancel_resp)
 
index 8c6539e8dc62820e53076c108aee5c3dfdf32f52..311976501cda65792c23283b98b21882120b17c2 100644 (file)
@@ -38,12 +38,12 @@ class FailJobsTest < ActiveSupport::TestCase
   test 'cancel slurm jobs' do
     Rails.configuration.crunch_job_wrapper = :slurm_immediate
     Rails.configuration.crunch_job_user = 'foobar'
-    fake_squeue = File.popen("echo #{@job[:before_reboot].uuid}")
-    fake_scancel = File.popen("true")
-    File.expects(:popen).
+    fake_squeue = IO.popen("echo #{@job[:before_reboot].uuid}")
+    fake_scancel = IO.popen("true")
+    IO.expects(:popen).
       with(['squeue', '-a', '-h', '-o', '%j']).
       returns(fake_squeue)
-    File.expects(:popen).
+    IO.expects(:popen).
       with(includes('sudo', '-u', 'foobar', 'scancel', '-n', @job[:before_reboot].uuid)).
       returns(fake_scancel)
     @dispatch.fail_jobs(before: Time.at(BOOT_TIME).to_s)
index 8fe5e5db28c804881fd2ac24c44ede3da7ae4d53..5677776cd45b875c9f85602af95d080e951f4118 100644 (file)
@@ -444,18 +444,31 @@ class JobTest < ActiveSupport::TestCase
   ].each do |existing_image, request_image, expect_image|
     test "if a #{existing_image} job exists, #{request_image} yields #{expect_image} after migration" do
       Rails.configuration.docker_image_formats = ['v1']
-      oldjob = Job.create!(
-        job_attrs(
-          script: 'foobar1',
-          runtime_constraints: {
-            'docker_image' => collections(existing_image).portable_data_hash}))
-      oldjob.reload
-      assert_equal(oldjob.docker_image_locator,
-                   collections(existing_image).portable_data_hash)
+
+      if existing_image == :docker_image
+        oldjob = Job.create!(
+          job_attrs(
+            script: 'foobar1',
+            runtime_constraints: {
+              'docker_image' => collections(existing_image).portable_data_hash}))
+        oldjob.reload
+        assert_equal(oldjob.docker_image_locator,
+                     collections(existing_image).portable_data_hash)
+      elsif existing_image == :docker_image_1_12
+        assert_raises(ActiveRecord::RecordInvalid,
+                      "Should not resolve v2 image when only v1 is supported") do
+          oldjob = Job.create!(
+            job_attrs(
+              script: 'foobar1',
+              runtime_constraints: {
+                'docker_image' => collections(existing_image).portable_data_hash}))
+        end
+      end
 
       Rails.configuration.docker_image_formats = ['v2']
       add_docker19_migration_link
 
+      # Check that both v1 and v2 images get resolved to v2.
       newjob = Job.create!(
         job_attrs(
           script: 'foobar1',
index 3b3cdf1c14a872dacbc76b5679db9970609907dd..062126d6ff42e710523c5779ac2441c109f77f65 100644 (file)
@@ -33,6 +33,7 @@ type IArvadosClient interface {
        Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
        Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
        Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) error
+       CallRaw(method string, resourceType string, uuid string, action string, parameters arvadosclient.Dict) (reader io.ReadCloser, err error)
        Discovery(key string) (interface{}, error)
 }
 
@@ -91,8 +92,6 @@ type ContainerRunner struct {
        CleanupTempDir []string
        Binds          []string
        OutputPDH      *string
-       CancelLock     sync.Mutex
-       Cancelled      bool
        SigChan        chan os.Signal
        ArvMountExit   chan error
        finalState     string
@@ -114,6 +113,13 @@ type ContainerRunner struct {
        // parent to be X" feature even on sites where the "specify
        // cgroup parent" feature breaks.
        setCgroupParent string
+
+       cStateLock sync.Mutex
+       cStarted   bool // StartContainer() succeeded
+       cCancelled bool // StopContainer() invoked
+
+       enableNetwork string // one of "default" or "always"
+       networkMode   string // passed through to HostConfig.NetworkMode
 }
 
 // SetupSignals sets up signal handling to gracefully terminate the underlying
@@ -133,13 +139,13 @@ func (runner *ContainerRunner) SetupSignals() {
 
 // stop the underlying Docker container.
 func (runner *ContainerRunner) stop() {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       if runner.Cancelled {
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
                return
        }
-       runner.Cancelled = true
-       if runner.ContainerID != "" {
+       runner.cCancelled = true
+       if runner.cStarted {
                err := runner.Docker.StopContainer(runner.ContainerID, 10)
                if err != nil {
                        log.Printf("StopContainer failed: %s", err)
@@ -504,6 +510,98 @@ func (runner *ContainerRunner) StartCrunchstat() {
        runner.statReporter.Start()
 }
 
+type infoCommand struct {
+       label string
+       cmd   []string
+}
+
+// LogNodeInfo gathers node information and writes it to the log
+// collection for debugging purposes.
+func (runner *ContainerRunner) LogNodeInfo() (err error) {
+       w := runner.NewLogWriter("node-info")
+       logger := log.New(w, "node-info", 0)
+
+       commands := []infoCommand{
+               infoCommand{
+                       label: "Host Information",
+                       cmd:   []string{"uname", "-a"},
+               },
+               infoCommand{
+                       label: "CPU Information",
+                       cmd:   []string{"cat", "/proc/cpuinfo"},
+               },
+               infoCommand{
+                       label: "Memory Information",
+                       cmd:   []string{"cat", "/proc/meminfo"},
+               },
+               infoCommand{
+                       label: "Disk Space",
+                       cmd:   []string{"df", "-m", "/", os.TempDir()},
+               },
+               infoCommand{
+                       label: "Disk INodes",
+                       cmd:   []string{"df", "-i", "/", os.TempDir()},
+               },
+       }
+
+       // Run commands with informational output to be logged.
+       var out []byte
+       for _, command := range commands {
+               out, err = exec.Command(command.cmd[0], command.cmd[1:]...).CombinedOutput()
+               if err != nil {
+                       return fmt.Errorf("While running command %q: %v",
+                               command.cmd, err)
+               }
+               logger.Println(command.label)
+               for _, line := range strings.Split(string(out), "\n") {
+                       logger.Println(" ", line)
+               }
+       }
+
+       err = w.Close()
+       if err != nil {
+               return fmt.Errorf("While closing node-info logs: %v", err)
+       }
+       return nil
+}
+
+// LogContainerRecord fetches the raw JSON container record from the API server and saves it to the log collection.
+func (runner *ContainerRunner) LogContainerRecord() (err error) {
+       w := &ArvLogWriter{
+               runner.ArvClient,
+               runner.Container.UUID,
+               "container",
+               runner.LogCollection.Open("container.json"),
+       }
+       // Get Container record JSON from the API Server
+       reader, err := runner.ArvClient.CallRaw("GET", "containers", runner.Container.UUID, "", nil)
+       if err != nil {
+               return fmt.Errorf("While retrieving container record from the API server: %v", err)
+       }
+       defer reader.Close()
+       // Read the API server response as []byte
+       json_bytes, err := ioutil.ReadAll(reader)
+       if err != nil {
+               return fmt.Errorf("While reading container record API server response: %v", err)
+       }
+       // Decode the JSON []byte
+       var cr map[string]interface{}
+       if err = json.Unmarshal(json_bytes, &cr); err != nil {
+               return fmt.Errorf("While decoding the container record JSON response: %v", err)
+       }
+       // Re-encode it using indentation to improve readability
+       enc := json.NewEncoder(w)
+       enc.SetIndent("", "    ")
+       if err = enc.Encode(cr); err != nil {
+               return fmt.Errorf("While logging the JSON container record: %v", err)
+       }
+       err = w.Close()
+       if err != nil {
+               return fmt.Errorf("While closing container.json log: %v", err)
+       }
+       return nil
+}
+
 // AttachLogs connects the docker container stdout and stderr logs to the
 // Arvados logger which logs to Keep and the API server logs table.
 func (runner *ContainerRunner) AttachStreams() (err error) {
@@ -563,6 +661,15 @@ func (runner *ContainerRunner) CreateContainer() error {
        for k, v := range runner.Container.Environment {
                runner.ContainerConfig.Env = append(runner.ContainerConfig.Env, k+"="+v)
        }
+
+       runner.HostConfig = dockerclient.HostConfig{
+               Binds:        runner.Binds,
+               CgroupParent: runner.setCgroupParent,
+               LogConfig: dockerclient.LogConfig{
+                       Type: "none",
+               },
+       }
+
        if wantAPI := runner.Container.RuntimeConstraints.API; wantAPI != nil && *wantAPI {
                tok, err := runner.ContainerToken()
                if err != nil {
@@ -573,9 +680,13 @@ func (runner *ContainerRunner) CreateContainer() error {
                        "ARVADOS_API_HOST="+os.Getenv("ARVADOS_API_HOST"),
                        "ARVADOS_API_HOST_INSECURE="+os.Getenv("ARVADOS_API_HOST_INSECURE"),
                )
-               runner.ContainerConfig.NetworkDisabled = false
+               runner.HostConfig.NetworkMode = runner.networkMode
        } else {
-               runner.ContainerConfig.NetworkDisabled = true
+               if runner.enableNetwork == "always" {
+                       runner.HostConfig.NetworkMode = runner.networkMode
+               } else {
+                       runner.HostConfig.NetworkMode = "none"
+               }
        }
 
        var err error
@@ -584,24 +695,22 @@ func (runner *ContainerRunner) CreateContainer() error {
                return fmt.Errorf("While creating container: %v", err)
        }
 
-       runner.HostConfig = dockerclient.HostConfig{
-               Binds:        runner.Binds,
-               CgroupParent: runner.setCgroupParent,
-               LogConfig: dockerclient.LogConfig{
-                       Type: "none",
-               },
-       }
-
        return runner.AttachStreams()
 }
 
 // StartContainer starts the docker container created by CreateContainer.
 func (runner *ContainerRunner) StartContainer() error {
        runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
+               return ErrCancelled
+       }
        err := runner.Docker.StartContainer(runner.ContainerID, &runner.HostConfig)
        if err != nil {
                return fmt.Errorf("could not start container: %v", err)
        }
+       runner.cStarted = true
        return nil
 }
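
StartContainer now follows the same cancellation protocol as the update methods below: take cStateLock, bail out if cCancelled is set, otherwise act and record progress. A toy, self-contained version of that pattern (field and method names are abridged; this is a sketch, not crunch-run code):

    package main

    import (
        "errors"
        "fmt"
        "sync"
    )

    // runnerState is a stand-in for the cStateLock/cCancelled/cStarted fields:
    // every transition takes the same mutex and re-checks the cancel flag, so a
    // cancellation arriving between container creation and start is never lost.
    type runnerState struct {
        mu        sync.Mutex
        cancelled bool
        started   bool
    }

    func (s *runnerState) start(doStart func() error) error {
        s.mu.Lock()
        defer s.mu.Unlock()
        if s.cancelled {
            return errors.New("cancelled")
        }
        if err := doStart(); err != nil {
            return err
        }
        s.started = true
        return nil
    }

    func (s *runnerState) cancel() {
        s.mu.Lock()
        defer s.mu.Unlock()
        s.cancelled = true
    }

    func main() {
        var s runnerState
        s.cancel()
        // A start attempt after cancellation is refused.
        fmt.Println(s.start(func() error { return nil }))
    }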
 
@@ -846,9 +955,9 @@ func (runner *ContainerRunner) CommitLogs() error {
 
 // UpdateContainerRunning updates the container state to "Running"
 func (runner *ContainerRunner) UpdateContainerRunning() error {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       if runner.Cancelled {
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
                return ErrCancelled
        }
        return runner.ArvClient.Update("containers", runner.Container.UUID,
@@ -892,9 +1001,9 @@ func (runner *ContainerRunner) UpdateContainerFinal() error {
 
 // IsCancelled returns the value of Cancelled, with goroutine safety.
 func (runner *ContainerRunner) IsCancelled() bool {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       return runner.Cancelled
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       return runner.cCancelled
 }
 
 // NewArvLogWriter creates an ArvLogWriter
@@ -990,6 +1099,17 @@ func (runner *ContainerRunner) Run() (err error) {
                return
        }
 
+       // Gather and record node information
+       err = runner.LogNodeInfo()
+       if err != nil {
+               return
+       }
+       // Save container.json record on log collection
+       err = runner.LogContainerRecord()
+       if err != nil {
+               return
+       }
+
        runner.StartCrunchstat()
 
        if runner.IsCancelled() {
@@ -1037,6 +1157,14 @@ func main() {
        cgroupParent := flag.String("cgroup-parent", "docker", "name of container's parent cgroup (ignored if -cgroup-parent-subsystem is used)")
        cgroupParentSubsystem := flag.String("cgroup-parent-subsystem", "", "use current cgroup for given subsystem as parent cgroup for container")
        caCertsPath := flag.String("ca-certs", "", "Path to TLS root certificates")
+       enableNetwork := flag.String("container-enable-networking", "default",
+               `Specify whether networking should be enabled for the container.  One of 'default', 'always':
+       default: only enable networking if the container requests it.
+       always:  containers always have networking enabled.
+       `)
+       networkMode := flag.String("container-network-mode", "default",
+               `Set the networking mode for the container.  Corresponds to Docker's network mode (--net).
+       `)
        flag.Parse()
 
        containerId := flag.Arg(0)
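
One caveat: the usage text restricts -container-enable-networking to 'default' or 'always', but nothing rejects other values, which then silently behave like 'default'. A possible guard after flag.Parse(), shown only as a sketch (it is not part of this patch):

    if *enableNetwork != "default" && *enableNetwork != "always" {
        // Hypothetical check; the standard log package is assumed here.
        log.Printf("warning: unrecognized -container-enable-networking=%q; treating it as 'default'", *enableNetwork)
    }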
@@ -1068,6 +1196,8 @@ func main() {
        cr.statInterval = *statInterval
        cr.cgroupRoot = *cgroupRoot
        cr.expectCgroupParent = *cgroupParent
+       cr.enableNetwork = *enableNetwork
+       cr.networkMode = *networkMode
        if *cgroupParentSubsystem != "" {
                p := findCgroup(*cgroupParentSubsystem)
                cr.setCgroupParent = p
diff --git a/services/crunch-run/crunchrun_test.go b/services/crunch-run/crunchrun_test.go
index 36ddd36d31525533e604d3d9d521de404206c9d8..7224c4f1b3d622051e1506da7bc1116f443ac003 100644 (file)
@@ -11,6 +11,7 @@ import (
        "os"
        "os/exec"
        "path/filepath"
+       "runtime/pprof"
        "sort"
        "strings"
        "sync"
@@ -183,6 +184,21 @@ func (client *ArvTestClient) Call(method, resourceType, uuid, action string, par
        }
 }
 
+func (client *ArvTestClient) CallRaw(method, resourceType, uuid, action string,
+       parameters arvadosclient.Dict) (reader io.ReadCloser, err error) {
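+       // Test stub: ignore the request and return a canned container record.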
+       j := []byte(`{
+               "command": ["sleep", "1"],
+               "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+               "cwd": ".",
+               "environment": {},
+               "mounts": {"/tmp": {"kind": "tmp"} },
+               "output_path": "/tmp",
+               "priority": 1,
+               "runtime_constraints": {}
+       }`)
+       return ioutil.NopCloser(bytes.NewReader(j)), nil
+}
+
 func (client *ArvTestClient) Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error {
        if resourceType == "collections" {
                if uuid == hwPDH {
@@ -321,6 +337,11 @@ func (ArvErrorTestClient) Call(method, resourceType, uuid, action string, parame
        return errors.New("ArvError")
 }
 
+func (ArvErrorTestClient) CallRaw(method, resourceType, uuid, action string,
+       parameters arvadosclient.Dict) (reader io.ReadCloser, err error) {
+       return nil, errors.New("ArvError")
+}
+
 func (ArvErrorTestClient) Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error {
        return errors.New("ArvError")
 }
@@ -525,7 +546,7 @@ func (s *TestSuite) TestUpdateContainerCancelled(c *C) {
        api := &ArvTestClient{}
        kc := &KeepTestClient{}
        cr := NewContainerRunner(api, kc, nil, "zzzzz-zzzzz-zzzzzzzzzzzzzzz")
-       cr.Cancelled = true
+       cr.cCancelled = true
        cr.finalState = "Cancelled"
 
        err := cr.UpdateContainerFinal()
@@ -650,6 +671,56 @@ func (s *TestSuite) TestCrunchstat(c *C) {
        c.Check(api.Logs["crunchstat"].String(), Matches, `(?ms).*cgroup stats files never appeared for abcde\n`)
 }
 
+func (s *TestSuite) TestNodeInfoLog(c *C) {
+       api, _, _ := FullRunHelper(c, `{
+               "command": ["sleep", "1"],
+               "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+               "cwd": ".",
+               "environment": {},
+               "mounts": {"/tmp": {"kind": "tmp"} },
+               "output_path": "/tmp",
+               "priority": 1,
+               "runtime_constraints": {}
+       }`, nil, func(t *TestDockerClient) {
+               time.Sleep(time.Second)
+               t.logWriter.Close()
+               t.finish <- dockerclient.WaitResult{}
+       })
+
+       c.Check(api.CalledWith("container.exit_code", 0), NotNil)
+       c.Check(api.CalledWith("container.state", "Complete"), NotNil)
+
+       c.Assert(api.Logs["node-info"], NotNil)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Host Information.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*CPU Information.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Memory Information.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Disk Space.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Disk INodes.*`)
+}
+
+func (s *TestSuite) TestContainerRecordLog(c *C) {
+       api, _, _ := FullRunHelper(c, `{
+               "command": ["sleep", "1"],
+               "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+               "cwd": ".",
+               "environment": {},
+               "mounts": {"/tmp": {"kind": "tmp"} },
+               "output_path": "/tmp",
+               "priority": 1,
+               "runtime_constraints": {}
+       }`, nil, func(t *TestDockerClient) {
+               time.Sleep(time.Second)
+               t.logWriter.Close()
+               t.finish <- dockerclient.WaitResult{}
+       })
+
+       c.Check(api.CalledWith("container.exit_code", 0), NotNil)
+       c.Check(api.CalledWith("container.state", "Complete"), NotNil)
+
+       c.Assert(api.Logs["container"], NotNil)
+       c.Check(api.Logs["container"].String(), Matches, `(?ms).*container_image.*`)
+}
+
 func (s *TestSuite) TestFullRunStderr(c *C) {
        api, _, _ := FullRunHelper(c, `{
     "command": ["/bin/sh", "-c", "echo hello ; echo world 1>&2 ; exit 1"],
@@ -722,7 +793,7 @@ func (s *TestSuite) TestFullRunSetCwd(c *C) {
 func (s *TestSuite) TestStopOnSignal(c *C) {
        s.testStopContainer(c, func(cr *ContainerRunner) {
                go func() {
-                       for cr.ContainerID == "" {
+                       for !cr.cStarted {
                                time.Sleep(time.Millisecond)
                        }
                        cr.SigChan <- syscall.SIGINT
@@ -776,6 +847,7 @@ func (s *TestSuite) testStopContainer(c *C, setup func(cr *ContainerRunner)) {
        }()
        select {
        case <-time.After(20 * time.Second):
+               pprof.Lookup("goroutine").WriteTo(os.Stderr, 1)
                c.Fatal("timed out")
        case err = <-done:
                c.Check(err, IsNil)
diff --git a/services/fuse/arvados_fuse/command.py b/services/fuse/arvados_fuse/command.py
index ca77cfcd7bd6bc3caafb55957f41e2b8d776c26b..66f8a4d39319fb9bdc8f864962bc9c02a0d57ac3 100644 (file)
@@ -31,6 +31,8 @@ class ArgumentParser(argparse.ArgumentParser):
         self.add_argument('mountpoint', type=str, help="""Mount point.""")
         self.add_argument('--allow-other', action='store_true',
                             help="""Let other users read the mount""")
+        self.add_argument('--subtype', type=str, metavar='STRING',
+                            help="""Report mounted filesystem type as "fuse.STRING", instead of just "fuse".""")
 
         mode = self.add_mutually_exclusive_group()
 
@@ -148,6 +150,8 @@ class Mount(object):
                 if getattr(self.args, optname)]
         # Increase default read/write size from 4KiB to 128KiB
         opts += ["big_writes", "max_read=131072"]
+        if self.args.subtype:
+            opts += ["subtype="+self.args.subtype]
         return opts
 
     def _setup_logging(self):
diff --git a/services/fuse/tests/test_mount_type.py b/services/fuse/tests/test_mount_type.py
new file mode 100644 (file)
index 0000000..f6a3e89
--- /dev/null
@@ -0,0 +1,23 @@
+import logging
+import subprocess
+
+from .integration_test import IntegrationTest
+
+logger = logging.getLogger('arvados.arv-mount')
+
+
+class MountTypeTest(IntegrationTest):
+    @IntegrationTest.mount(argv=["--subtype=arv-mount-test"])
+    def test_mount_type(self):
+        self.pool_test(self.mnt)
+
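+    # IntegrationTest.pool_test runs this helper in a separate worker process
+    # and passes the test object in explicitly, hence "self" on a @staticmethod.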
+    @staticmethod
+    def _test_mount_type(self, mnt):
+        self.assertEqual(["fuse.arv-mount-test"], [
+            toks[4]
+            for toks in [
+                line.split(' ')
+                for line in subprocess.check_output("mount").split("\n")
+            ]
+            if len(toks) > 4 and toks[2] == mnt
+        ])
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index be24e0fba069db5e66227d2fb448aecf12b93d36..0c8ddc29eb6ca7b37ebc947addfffbf08dc7ab46 100644 (file)
@@ -79,7 +79,7 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
 
     def _destroy_node(self):
         if self._nodename:
-            self._set_node_state(self._nodename, 'DOWN')
+            self._set_node_state(self._nodename, 'DOWN', 'Reason=Node Manager shutdown')
         super(ComputeNodeShutdownActor, self)._destroy_node()
 
 
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index dcbc87ccc855d88193840565aec5342271c492e6..b4f17849f1e77b296ee08ed632b2dab47b1e173b 100644 (file)
@@ -243,14 +243,15 @@ class NodeManagerDaemonActor(actor_class):
         return s
 
     def _node_states(self, size):
-        states = pykka.get_all(rec.actor.get_state()
-                               for rec in self.cloud_nodes.nodes.itervalues()
-                               if ((size is None or rec.cloud_node.size.id == size.id) and
-                                   rec.shutdown_actor is None))
-        states += ['shutdown' for rec in self.cloud_nodes.nodes.itervalues()
-                   if ((size is None or rec.cloud_node.size.id == size.id) and
-                       rec.shutdown_actor is not None)]
-        return states
+        proxy_states = []
+        states = []
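+        # Collect get_state() futures from live node actors and resolve them
+        # together below; a node that has a shutdown actor, or whose actor has
+        # gone away, is counted as "shutdown" instead of being queried.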
+        for rec in self.cloud_nodes.nodes.itervalues():
+            if size is None or rec.cloud_node.size.id == size.id:
+                if rec.shutdown_actor is None and rec.actor is not None:
+                    proxy_states.append(rec.actor.get_state())
+                else:
+                    states.append("shutdown")
+        return states + pykka.get_all(proxy_states)
 
     def _state_counts(self, size):
         states = self._node_states(size)
diff --git a/tools/arvbox/bin/arvbox b/tools/arvbox/bin/arvbox
index 474913170a624a618ab82bad57dd74c4ca790cf3..395129d21b76e6e7aea027e79feb402afd09d2fc 100755 (executable)
@@ -106,14 +106,22 @@ run() {
 
     shift
 
+    need_setup=1
+
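+    # When the "test" config is already running, attach to the existing
+    # container and skip the one-time container setup below.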
     if docker ps -a --filter "status=running" | grep -E "$ARVBOX_CONTAINER$" -q ; then
-        echo "Container $ARVBOX_CONTAINER is already running"
-        exit 0
+        if test "$CONFIG" = test ; then
+            need_setup=0
+        else
+            echo "Container $ARVBOX_CONTAINER is already running"
+            exit 0
+        fi
     fi
 
-    if docker ps -a | grep -E "$ARVBOX_CONTAINER$" -q ; then
-        echo "Container $ARVBOX_CONTAINER already exists but is not running; use restart or rebuild"
-        exit 1
+    if test $need_setup = 1 ; then
+        if docker ps -a | grep -E "$ARVBOX_CONTAINER$" -q ; then
+            echo "Container $ARVBOX_CONTAINER already exists but is not running; use restart or reboot"
+            exit 1
+        fi
     fi
 
     if test ! -z "$TAG"
@@ -186,35 +194,37 @@ run() {
 
             mkdir -p $VAR_DATA/test
 
-            docker run \
-                   --detach \
-                   --name=$ARVBOX_CONTAINER \
-                   --privileged \
-                   "--volume=$ARVADOS_ROOT:/usr/src/arvados:rw" \
-                   "--volume=$SSO_ROOT:/usr/src/sso:rw" \
-                   "--volume=$PG_DATA:/var/lib/postgresql:rw" \
-                   "--volume=$VAR_DATA:/var/lib/arvados:rw" \
-                   "--volume=$PASSENGER:/var/lib/passenger:rw" \
-                   "--volume=$GEMS:/var/lib/gems:rw" \
-                   "--volume=$PIPCACHE:/var/lib/pip:rw" \
-                   "--volume=$GOSTUFF:/var/lib/gopath:rw" \
-                   arvados/arvbox-dev$TAG \
-                   /usr/local/bin/runsvinit -svdir=/etc/test-service
-
-            docker exec -ti \
-                    $ARVBOX_CONTAINER \
-                    /usr/local/lib/arvbox/runsu.sh \
-                    /usr/local/lib/arvbox/waitforpostgres.sh
-
-            docker exec -ti \
-                   $ARVBOX_CONTAINER \
-                   /usr/local/lib/arvbox/runsu.sh \
-                   /var/lib/arvbox/service/sso/run-service --only-setup
-
-            docker exec -ti \
-                   $ARVBOX_CONTAINER \
-                   /usr/local/lib/arvbox/runsu.sh \
-                   /var/lib/arvbox/service/api/run-service --only-setup
+            if test "$need_setup" = 1 ; then
+                docker run \
+                       --detach \
+                       --name=$ARVBOX_CONTAINER \
+                       --privileged \
+                       "--volume=$ARVADOS_ROOT:/usr/src/arvados:rw" \
+                       "--volume=$SSO_ROOT:/usr/src/sso:rw" \
+                       "--volume=$PG_DATA:/var/lib/postgresql:rw" \
+                       "--volume=$VAR_DATA:/var/lib/arvados:rw" \
+                       "--volume=$PASSENGER:/var/lib/passenger:rw" \
+                       "--volume=$GEMS:/var/lib/gems:rw" \
+                       "--volume=$PIPCACHE:/var/lib/pip:rw" \
+                       "--volume=$GOSTUFF:/var/lib/gopath:rw" \
+                       arvados/arvbox-dev$TAG \
+                       /usr/local/bin/runsvinit -svdir=/etc/test-service
+
+                docker exec -ti \
+                       $ARVBOX_CONTAINER \
+                       /usr/local/lib/arvbox/runsu.sh \
+                       /usr/local/lib/arvbox/waitforpostgres.sh
+
+                docker exec -ti \
+                       $ARVBOX_CONTAINER \
+                       /usr/local/lib/arvbox/runsu.sh \
+                       /var/lib/arvbox/service/sso/run-service --only-setup
+
+                docker exec -ti \
+                       $ARVBOX_CONTAINER \
+                       /usr/local/lib/arvbox/runsu.sh \
+                       /var/lib/arvbox/service/api/run-service --only-setup
+            fi
 
             docker exec -ti \
                    $ARVBOX_CONTAINER \
diff --git a/tools/arvbox/lib/arvbox/docker/service/websockets/run-service b/tools/arvbox/lib/arvbox/docker/service/websockets/run-service
index d0c0b5dbd7c482a5be4f90e829dd49a906ab3361..d2f010b474d3a100759fce047d10389adbf382f7 100755 (executable)
@@ -5,26 +5,31 @@ set -ex -o pipefail
 
 . /usr/local/lib/arvbox/common.sh
 
-cd /usr/src/arvados/services/api
-export RAILS_ENV=development
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
 
-run_bundler --without=development
+export GOPATH=$PWD
+mkdir -p "$GOPATH/src/git.curoverse.com"
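+# Expose the mounted arvados source tree under its canonical import path so
+# "go get" builds the local checkout of services/ws.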
+ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/ws"
+install bin/ws /usr/local/bin/arvados-ws
 
 if test "$1" = "--only-deps" ; then
     exit
 fi
 
-flock /var/lib/arvados/api.lock /usr/local/lib/arvbox/api-setup.sh
-
-set +u
-if test "$1" = "--only-setup" ; then
-    exit
-fi
-
-export ARVADOS_WEBSOCKETS=ws-only
-
-# serving ssl directly doesn't work, gets
-# Rack app error: #<TypeError: no implicit conversion of Puma::MiniSSL::Socket into Integer>
-#exec bundle exec puma -b "ssl://0.0.0.0:${services[websockets]}?cert=/var/lib/arvados/self-signed.pem&key=/var/lib/arvados/self-signed.key"
-
-exec bundle exec puma -p${services[websockets]}
+database_pw=$(cat /var/lib/arvados/api_database_pw)
+
+cat >/var/lib/arvados/arvados-ws.yml <<EOF
+Client:
+  APIHost: $localip:${services[api]}
+  Insecure: true
+Postgres:
+  dbname: arvados_development
+  user: arvados
+  password: $database_pw
+  host: localhost
+Listen: :8002
+EOF
+
+exec /usr/local/bin/arvados-ws -config /var/lib/arvados/arvados-ws.yml