objects = @objects
end
if objects.respond_to?(:result_offset) and
- objects.respond_to?(:result_limit) and
- objects.respond_to?(:items_available)
+ objects.respond_to?(:result_limit)
next_offset = objects.result_offset + objects.result_limit
- if next_offset < objects.items_available
+ if objects.respond_to?(:items_available) and (next_offset < objects.items_available)
next_offset
+ elsif @objects.results.size > 0 and (params[:count] == 'none' or
+ (params[:controller] == 'search' and params[:action] == 'choose'))
+ last_object_class = @objects.last.class
+ if params['last_object_class'].nil? or params['last_object_class'] == last_object_class.to_s
+ next_offset
+ else
+ @objects.select{|obj| obj.class == last_object_class}.size
+ end
else
nil
end
end
@objects = search_what.contents(limit: @limit,
offset: @offset,
+ count: 'none',
+ last_object_class: params["last_object_class"],
filters: @filters)
super
end
def next_page_href with_params={}
super with_params.merge(last_object_class: @objects.last.class.to_s,
project_uuid: params[:project_uuid],
+ count: 'none',
filters: @filters.to_json)
end
end
%>
<%
+ recent_procs_panel_width = 6
if !PipelineInstance.api_exists?(:index)
recent_procs_title = 'Recent processes'
run_proc_title = 'Choose a workflow to run:'
+ show_node_status = false
+      # Recent processes panel should take the entire width when it is the
+      # only one being rendered.
+ if !Rails.configuration.show_recent_collections_on_dashboard
+ recent_procs_panel_width = 12
+ end
else
recent_procs_title = 'Recent pipelines and processes'
run_proc_title = 'Choose a pipeline or workflow to run:'
+ show_node_status = true
end
%>
<div class="row">
- <div class="col-md-6">
+ <div class="col-md-<%= recent_procs_panel_width %>">
<div class="panel panel-default" style="min-height: 10.5em">
<div class="panel-heading">
<span class="panel-title"><%=recent_procs_title%></span>
</div>
<div class="col-md-6">
+ <% if show_node_status %>
<% nodes = Node.filter([["last_ping_at", ">", Time.now - 3600]]).results %>
<div class="panel panel-default" style="min-height: 10.5em">
<div class="panel-heading"><span class="panel-title">Compute node status</span>
</div>
</div>
</div>
+ <% end %>
<% if Rails.configuration.show_recent_collections_on_dashboard %>
<div class="panel panel-default">
<div class="panel-heading"><span class="panel-title">Recent collections</span>
assert_includes @response.body, "Run a process"
end
+ test "dashboard compute node status not shown when pipeline_instance index API is disabled" do
+ @controller = ProjectsController.new
+
+ dd = ArvadosApiClient.new_or_current.discovery.deep_dup
+ dd[:resources][:pipeline_instances][:methods].delete(:index)
+ ArvadosApiClient.any_instance.stubs(:discovery).returns(dd)
+
+ get :index, {}, session_for(:active)
+ assert_not_includes @response.body, "compute-node-summary-pane"
+ end
+
[
[:jobs, JobsController.new],
[:job_tasks, JobTasksController.new],
- install/install-keep-balance.html.textile.liquid
- Containers API support on SLURM:
- install/crunch2-slurm/install-prerequisites.html.textile.liquid
+ - install/crunch2-slurm/install-slurm.html.textile.liquid
- install/crunch2-slurm/install-compute-node.html.textile.liquid
- install/crunch2-slurm/install-dispatch.html.textile.liquid
- install/crunch2-slurm/install-test.html.textile.liquid
+ - install/install-nodemanager.html.textile.liquid
+ - install/install-compute-ping.html.textile.liquid
- Jobs API support (deprecated):
- install/install-crunch-dispatch.html.textile.liquid
- install/install-compute-node.html.textile.liquid
--- /dev/null
+#!/usr/bin/env ruby
+
+require 'rubygems'
+
+require 'cgi'
+require 'fileutils'
+require 'json'
+require 'net/https'
+require 'socket'
+require 'syslog'
+
+class ComputeNodePing
+ @@NODEDATA_DIR = "/var/tmp/arv-node-data"
+ @@PUPPET_CONFFILE = "/etc/puppet/puppet.conf"
+ @@HOST_STATEFILE = "/var/run/arvados-compute-ping-hoststate.json"
+
+ def initialize(args, stdout, stderr)
+ @stdout = stdout
+ @stderr = stderr
+ @stderr_loglevel = ((args.first == "quiet") ?
+ Syslog::LOG_ERR : Syslog::LOG_DEBUG)
+ @puppet_disabled = false
+ @syslog = Syslog.open("arvados-compute-ping",
+ Syslog::LOG_CONS | Syslog::LOG_PID,
+ Syslog::LOG_DAEMON)
+ @puppetless = File.exist?('/compute-node.puppetless')
+
+ begin
+ prepare_ping
+ load_puppet_conf unless @puppetless
+ begin
+ @host_state = JSON.parse(IO.read(@@HOST_STATEFILE))
+ rescue Errno::ENOENT
+ @host_state = nil
+ end
+ rescue
+ @syslog.close
+ raise
+ end
+ end
+
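+  # Send the ping and act on the response: record host state after the
+  # first successful ping, rename this host if the API server has assigned
+  # a new hostname, and resume the node in SLURM (running the puppet agent
+  # first on puppet-managed nodes).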
+ def send
+ pong = send_raw_ping
+
+ if pong["hostname"] and pong["domain"] and pong["first_ping_at"]
+ if @host_state.nil?
+ @host_state = {
+ "fqdn" => (Socket.gethostbyname(Socket.gethostname).first rescue nil),
+ "resumed_slurm" =>
+ ["busy", "idle"].include?(pong["crunch_worker_state"]),
+ }
+ update_host_state({})
+ end
+
+ if hostname_changed?(pong)
+ disable_puppet unless @puppetless
+ rename_host(pong)
+ update_host_state("fqdn" => fqdn_from_pong(pong),
+ "resumed_slurm" => false)
+ end
+
+ unless @host_state["resumed_slurm"]
+ run_puppet_agent unless @puppetless
+ resume_slurm_node(pong["hostname"])
+ update_host_state("resumed_slurm" => true)
+ end
+ end
+
+ log("Last ping at #{pong['last_ping_at']}")
+ end
+
+ def cleanup
+ enable_puppet if @puppet_disabled and not @puppetless
+ @syslog.close
+ end
+
+ private
+
+ def log(message, level=Syslog::LOG_INFO)
+ @syslog.log(level, message)
+ if level <= @stderr_loglevel
+ @stderr.write("#{Time.now.strftime("%Y-%m-%d %H:%M:%S")} #{message}\n")
+ end
+ end
+
+ def abort(message, code=1)
+ log(message, Syslog::LOG_ERR)
+ exit(code)
+ end
+
+ def run_and_check(cmd_a, accept_codes, io_opts, &block)
+ result = IO.popen(cmd_a, "r", io_opts, &block)
+ unless accept_codes.include?($?.exitstatus)
+ abort("#{cmd_a} exited #{$?.exitstatus}")
+ end
+ result
+ end
+
+ DEFAULT_ACCEPT_CODES=[0]
+ def check_output(cmd_a, accept_codes=DEFAULT_ACCEPT_CODES, io_opts={})
+ # Run a command, check the exit status, and return its stdout as a string.
+ run_and_check(cmd_a, accept_codes, io_opts) do |pipe|
+ pipe.read
+ end
+ end
+
+ def check_command(cmd_a, accept_codes=DEFAULT_ACCEPT_CODES, io_opts={})
+ # Run a command, send stdout to syslog, and check the exit status.
+ run_and_check(cmd_a, accept_codes, io_opts) do |pipe|
+ pipe.each_line do |line|
+ line.chomp!
+ log("#{cmd_a.first}: #{line}") unless line.empty?
+ end
+ end
+ end
+
+ def replace_file(path, body)
+ open(path, "w") { |f| f.write(body) }
+ end
+
+ def update_host_state(updates_h)
+ @host_state.merge!(updates_h)
+ replace_file(@@HOST_STATEFILE, @host_state.to_json)
+ end
+
+ def disable_puppet
+ check_command(["puppet", "agent", "--disable"])
+ @puppet_disabled = true
+ loop do
+ # Wait for any running puppet agents to finish.
+ check_output(["pgrep", "puppet"], 0..1)
+ break if $?.exitstatus == 1
+ sleep(1)
+ end
+ end
+
+ def enable_puppet
+ check_command(["puppet", "agent", "--enable"])
+ @puppet_disabled = false
+ end
+
+ def prepare_ping
+ begin
+ ping_uri_s = File.read(File.join(@@NODEDATA_DIR, "arv-ping-url"))
+ rescue Errno::ENOENT
+ abort("ping URL file is not present yet, skipping run")
+ end
+
+ ping_uri = URI.parse(ping_uri_s)
+ payload_h = CGI.parse(ping_uri.query)
+
+ # Collect all extra data to be sent
+ dirname = File.join(@@NODEDATA_DIR, "meta-data")
+ Dir.open(dirname).each do |basename|
+ filename = File.join(dirname, basename)
+ if File.file?(filename)
+ payload_h[basename.gsub('-', '_')] = File.read(filename).chomp
+ end
+ end
+
+ ping_uri.query = nil
+ @ping_req = Net::HTTP::Post.new(ping_uri.to_s)
+ @ping_req.set_form_data(payload_h)
+ @ping_client = Net::HTTP.new(ping_uri.host, ping_uri.port)
+ @ping_client.use_ssl = ping_uri.scheme == 'https'
+ end
+
+ def send_raw_ping
+ begin
+ response = @ping_client.start do |http|
+ http.request(@ping_req)
+ end
+ if response.is_a? Net::HTTPSuccess
+ pong = JSON.parse(response.body)
+ else
+ raise "response was a #{response}"
+ end
+ rescue JSON::ParserError => error
+ abort("Error sending ping: could not parse JSON response: #{error}")
+ rescue => error
+ abort("Error sending ping: #{error}")
+ end
+
+ replace_file(File.join(@@NODEDATA_DIR, "pong.json"), response.body)
+ if pong["errors"] then
+ log(pong["errors"].join("; "), Syslog::LOG_ERR)
+ if pong["errors"].grep(/Incorrect ping_secret/).any?
+ system("halt")
+ end
+ exit(1)
+ end
+ pong
+ end
+
+ def load_puppet_conf
+ # Parse Puppet configuration suitable for rewriting.
+ # Save certnames in @puppet_certnames.
+ # Save other functional configuration lines in @puppet_conf.
+ @puppet_conf = []
+ @puppet_certnames = []
+ open(@@PUPPET_CONFFILE, "r") do |conffile|
+ conffile.each_line do |line|
+ key, value = line.strip.split(/\s*=\s*/, 2)
+ if key == "certname"
+ @puppet_certnames << value
+ elsif not (key.nil? or key.empty? or key.start_with?("#"))
+ @puppet_conf << line
+ end
+ end
+ end
+ end
+
+ def fqdn_from_pong(pong)
+ "#{pong['hostname']}.#{pong['domain']}"
+ end
+
+ def certname_from_pong(pong)
+ fqdn = fqdn_from_pong(pong).sub(".", ".compute.")
+ "#{pong['first_ping_at'].gsub(':', '-').downcase}.#{fqdn}"
+ end
+
+ def hostname_changed?(pong)
+ if @puppetless
+ (@host_state["fqdn"] != fqdn_from_pong(pong))
+ else
+ (@host_state["fqdn"] != fqdn_from_pong(pong)) or
+ (@puppet_certnames != [certname_from_pong(pong)])
+ end
+ end
+
+ def rename_host(pong)
+ new_fqdn = fqdn_from_pong(pong)
+ log("Renaming host from #{@host_state["fqdn"]} to #{new_fqdn}")
+
+ replace_file("/etc/hostname", "#{new_fqdn.split('.', 2).first}\n")
+ check_output(["hostname", new_fqdn])
+
+ ip_address = check_output(["facter", "ipaddress"]).chomp
+ esc_address = Regexp.escape(ip_address)
+ check_command(["sed", "-i", "/etc/hosts",
+ "-e", "s/^#{esc_address}.*$/#{ip_address}\t#{new_fqdn}/"])
+
+ unless @puppetless
+ new_conflines = @puppet_conf + ["\n[agent]\n",
+ "certname=#{certname_from_pong(pong)}\n"]
+ replace_file(@@PUPPET_CONFFILE, new_conflines.join(""))
+ FileUtils.remove_entry_secure("/var/lib/puppet/ssl")
+ end
+ end
+
+ def run_puppet_agent
+ log("Running puppet agent")
+ enable_puppet
+ check_command(["puppet", "agent", "--onetime", "--no-daemonize",
+ "--no-splay", "--detailed-exitcodes",
+ "--ignorecache", "--no-usecacheonfailure"],
+ [0, 2], {err: [:child, :out]})
+ end
+
+ def resume_slurm_node(node_name)
+ current_state = check_output(["sinfo", "--noheader", "-o", "%t",
+ "-n", node_name]).chomp
+ if %w(down drain drng).include?(current_state)
+ log("Resuming node in SLURM")
+ check_command(["scontrol", "update", "NodeName=#{node_name}",
+ "State=RESUME"], [0], {err: [:child, :out]})
+ end
+ end
+end
+
+LOCK_DIRNAME = "/var/lock/arvados-compute-node.lock"
+begin
+ Dir.mkdir(LOCK_DIRNAME)
+rescue Errno::EEXIST
+ exit(0)
+end
+
+ping_sender = nil
+begin
+ ping_sender = ComputeNodePing.new(ARGV, $stdout, $stderr)
+ ping_sender.send
+ensure
+ Dir.rmdir(LOCK_DIRNAME)
+ ping_sender.cleanup unless ping_sender.nil?
+end
{% include 'install_compute_fuse' %}
{% include 'install_docker_cleaner' %}
+
+h2. Set up SLURM
+
+Install SLURM following "the same process you used to install the Crunch dispatcher":install-crunch-dispatch.html#slurm.
+
+h2. Copy configuration files from the dispatcher (API server)
+
+The @slurm.conf@ and @/etc/munge/munge.key@ files need to be identical across the dispatcher and all compute nodes. Copy the files you created in the "Install the Crunch dispatcher":install-crunch-dispatch.html step to this compute node.
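+
+One way to do this, assuming root SSH access from the dispatcher to this node (the @compute0@ hostname and Debian file paths are illustrative):
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo scp /etc/slurm-llnl/slurm.conf root@compute0:/etc/slurm-llnl/slurm.conf</span>
+~$ <span class="userinput">sudo scp /etc/munge/munge.key root@compute0:/etc/munge/munge.key</span>
+</code></pre>
+</notextile>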
--- /dev/null
+---
+layout: default
+navsection: installguide
+title: Set up SLURM
+...
+
+h2(#slurm). Set up SLURM
+
+On the API server, install SLURM and munge, and generate a munge key.
+
+On Debian-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo /usr/bin/apt-get install slurm-llnl munge</span>
+~$ <span class="userinput">sudo /usr/sbin/create-munge-key</span>
+</code></pre>
+</notextile>
+
+On Red Hat-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo yum install slurm munge slurm-munge</span>
+</code></pre>
+</notextile>
+
+Now we need to give SLURM a configuration file. On Debian-based systems, this is installed at @/etc/slurm-llnl/slurm.conf@. On Red Hat-based systems, this is installed at @/etc/slurm/slurm.conf@. Here's an example @slurm.conf@:
+
+<notextile>
+<pre>
+ControlMachine=uuid_prefix.your.domain
+SlurmctldPort=6817
+SlurmdPort=6818
+AuthType=auth/munge
+StateSaveLocation=/tmp
+SlurmdSpoolDir=/tmp/slurmd
+SwitchType=switch/none
+MpiDefault=none
+SlurmctldPidFile=/var/run/slurmctld.pid
+SlurmdPidFile=/var/run/slurmd.pid
+ProctrackType=proctrack/pgid
+CacheGroups=0
+ReturnToService=2
+TaskPlugin=task/affinity
+#
+# TIMERS
+SlurmctldTimeout=300
+SlurmdTimeout=300
+InactiveLimit=0
+MinJobAge=300
+KillWait=30
+Waittime=0
+#
+# SCHEDULING
+SchedulerType=sched/backfill
+SchedulerPort=7321
+SelectType=select/linear
+FastSchedule=0
+#
+# LOGGING
+SlurmctldDebug=3
+#SlurmctldLogFile=
+SlurmdDebug=3
+#SlurmdLogFile=
+JobCompType=jobcomp/none
+#JobCompLoc=
+JobAcctGatherType=jobacct_gather/none
+#
+# COMPUTE NODES
+NodeName=DEFAULT
+PartitionName=DEFAULT MaxTime=INFINITE State=UP
+
+NodeName=compute[0-255]
+PartitionName=compute Nodes=compute[0-255] Default=YES Shared=YES
+</pre>
+</notextile>
+
+h3. SLURM configuration essentials
+
+Whenever you change this file, you will need to update the copy _on every compute node_ as well as the controller node, and then run @sudo scontrol reconfigure@.
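+
+For example, a minimal way to push an updated configuration from the controller, assuming root SSH access and Debian file paths (the hostnames are illustrative):
+
+<notextile>
+<pre><code>~$ <span class="userinput">for n in compute0 compute1 compute2; do sudo scp /etc/slurm-llnl/slurm.conf root@$n:/etc/slurm-llnl/slurm.conf; done</span>
+~$ <span class="userinput">sudo scontrol reconfigure</span>
+</code></pre>
+</notextile>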
+
+*@ControlMachine@* should be a DNS name that resolves to the SLURM controller (dispatch/API server). This must resolve correctly on all SLURM worker nodes as well as the controller itself. In general SLURM is very sensitive about all of the nodes being able to communicate with the controller _and one another_, all using the same DNS names.
+
+*@NodeName=compute[0-255]@* establishes that the hostnames of the worker nodes will be compute0, compute1, etc. through compute255.
+* There are several ways to compress sequences of names, like @compute[0-9,80,100-110]@. See the "hostlist" discussion in the @slurm.conf(5)@ and @scontrol(1)@ man pages for more information.
+* It is not necessary for all of the nodes listed here to be alive in order for SLURM to work, although you should make sure the DNS entries exist. It is easiest to define lots of hostnames up front, assigning them to real nodes and updating your DNS records as the nodes appear. This minimizes the frequency of @slurm.conf@ updates and use of @scontrol reconfigure@.
+
+Each hostname in @slurm.conf@ must also resolve correctly on all SLURM worker nodes as well as the controller itself. Furthermore, the hostnames used in the configuration file must match the hostnames reported by @hostname@ or @hostname -s@ on the nodes themselves. This applies to the ControlMachine as well as the worker nodes.
+
+For example:
+* In @slurm.conf@ on control and worker nodes: @ControlMachine=uuid_prefix.your.domain@
+* In @slurm.conf@ on control and worker nodes: @NodeName=compute[0-255]@
+* In @/etc/resolv.conf@ on control and worker nodes: @search uuid_prefix.your.domain@
+* On the control node: @hostname@ reports @uuid_prefix.your.domain@
+* On worker node 123: @hostname@ reports @compute123.uuid_prefix.your.domain@
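+
+As a quick sanity check, @sinfo@ on the controller should list the worker nodes under the names configured above:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sinfo --Node --long</span>
+</code></pre>
+</notextile>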
+
+h3. Automatic hostname assignment
+
+The API server will choose an unused hostname from the set given in @application.yml@, which defaults to @compute[0-255]@.
+
+If it is not feasible to give your compute nodes hostnames like compute0, compute1, etc., you can accommodate other naming schemes with a bit of extra configuration.
+
+If you want Arvados to assign names to your nodes with a different consecutive numeric series like @{worker1-0000, worker1-0001, worker1-0002}@, add an entry to @application.yml@; see @/var/www/arvados-api/current/config/application.default.yml@ for details. Example:
+* In @application.yml@: <code>assign_node_hostname: worker1-%<slot_number>04d</code>
+* In @slurm.conf@: <code>NodeName=worker1-[0000-0255]</code>
+
+If your worker hostnames are already assigned by other means, and the full set of names is known in advance, have your worker node bootstrapping script (see "Installing a compute node":install-compute-node.html) send its current hostname, rather than expect Arvados to assign one.
+* In @application.yml@: <code>assign_node_hostname: false</code>
+* In @slurm.conf@: <code>NodeName=alice,bob,clay,darlene</code>
+
+If your worker hostnames are already assigned by other means, but the full set of names is _not_ known in advance, you can use the @slurm.conf@ and @application.yml@ settings in the previous example, but you must also update @slurm.conf@ (both on the controller and on all worker nodes) and run @sudo scontrol reconfigure@ whenever a new node comes online.
--- /dev/null
+---
+layout: default
+navsection: installguide
+title: Sample compute node ping script
+...
+
+When a new elastic compute node is booted, it needs to contact Arvados to register itself. Here is an example ping script to run on boot.
+
+<notextile> {% code 'compute_ping_rb' as ruby %} </notextile>
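+
+The script uses a lock directory to exit immediately if another copy is already running, so it is safe to invoke repeatedly. A minimal sketch of an @/etc/cron.d@ entry that runs it every five minutes, assuming the script has been installed as @/usr/local/bin/arvados-compute-ping@ (the path is illustrative; the @quiet@ argument reduces stderr output, as implemented in the script):
+
+<notextile>
+<pre><code>*/5 * * * * root /usr/local/bin/arvados-compute-ping quiet
+</code></pre>
+</notextile>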
--- /dev/null
+---
+layout: default
+navsection: installguide
+title: Install Node Manager
+...
+
+Arvados Node Manager provides elastic computing for Arvados and SLURM by creating and destroying virtual machines on demand. Node Manager currently supports Amazon Web Services (AWS), Google Cloud Platform (GCP) and Microsoft Azure.
+
+Note: Node Manager is only required for elastic cloud computing environments. Fixed-size clusters do not require Node Manager.
+
+h2. Install
+
+Node Manager may run anywhere, but it must be able to reach the cloud provider's APIs and use the command line tools @sinfo@, @squeue@ and @scontrol@ to communicate with the cluster's SLURM controller.
+
+On Debian-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo apt-get install arvados-node-manager</span>
+</code></pre>
+</notextile>
+
+On Red Hat-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo yum install arvados-node-manager</span>
+</code></pre>
+</notextile>
+
+h2. Create compute image
+
+Configure a virtual machine following the "instructions to set up a compute node":{{site.baseurl}}/install/crunch2-slurm/install-compute-node.html, and set it up to run a "ping script":{{site.baseurl}}/install/install-compute-ping.html at boot.
+
+Create a virtual machine image using the commands provided by your cloud provider. We recommend using a tool such as "Packer":https://www.packer.io/ to automate this process.
+
+Configure node manager to use the image with the @image@ or @image_id@ parameter.
+
+h2. Configure node manager
+
+The configuration file is at @/etc/arvados-node-manager/config.ini@. Some of the configuration details are specific to the cloud provider you are using:
+
+* "Amazon Web Services":#aws
+* "Google Cloud Platform":#gcp
+* "Microsoft Azure":#azure
+
+h3(#aws). Amazon Web Services
+
+<pre>
+# EC2 configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes. For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times. If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type. However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on the rate of spending (in $/hr): Node Manager will not boot
+# additional nodes if the total price of the already running nodes meets or
+# exceeds this threshold. The default of 0 means no limit.
+max_total_price = 0
+
+# Poll EC2 nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down. Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 1800
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = ec2
+
+# It's usually most cost-effective to shut down compute nodes during narrow
+# windows of time. For example, EC2 bills each node by the hour, so the best
+# time to shut down a node is right before a new hour of uptime starts.
+# Shutdown windows define these periods of time. These are windows in
+# full minutes, separated by commas. Counting from the time the node is
+# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
+# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
+# For example, "54, 5, 1" means the node may shut down from the 54th to the
+# 59th minute of each hour of uptime.
+# Specify at least two windows. You can add as many as you need beyond that.
+shutdown_windows = 54, 5, 1
+
+[Cloud Credentials]
+key = KEY
+secret = SECRET_KEY
+region = us-east-1
+timeout = 60
+
+[Cloud List]
+# This section defines filters that find compute nodes.
+# Tags that you specify here will automatically be added to nodes you create.
+# Replace colons in Amazon filters with underscores
+# (e.g., write "tag:mytag" as "tag_mytag").
+instance-state-name = running
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+[Cloud Create]
+# New compute nodes will send pings to Arvados at this host.
+# You may specify a port, and use brackets to disambiguate IPv6 addresses.
+ping_host = hostname:port
+
+# Give the name of an SSH key on AWS...
+ex_keyname = string
+
+# ... or a file path for an SSH key that can log in to the compute node.
+# (One or the other, not both.)
+# ssh_key = path
+
+# The EC2 IDs of the image and subnet that compute nodes should use.
+image_id = idstring
+subnet_id = idstring
+
+# Comma-separated EC2 IDs for the security group(s) assigned to each
+# compute node.
+security_groups = idstring1, idstring2
+
+
+# You can define any number of Size sections to list EC2 sizes you're
+# willing to use. The Node Manager should boot the cheapest size(s) that
+# can run jobs in the queue.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override Amazon's provided
+# data fields (such as price per hour) by setting them here.
+
+[Size m4.large]
+cores = 2
+price = 0.126
+scratch = 100
+
+[Size m4.xlarge]
+cores = 4
+price = 0.252
+scratch = 100
+</pre>
+
+h3(#gcp). Google Cloud Platform
+
+<pre>
+# Google Compute Engine configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# Node Manager will ensure that there are at least this many nodes running at
+# all times. If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type. However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Poll compute nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Upper limit on the rate of spending (in $/hr): Node Manager will not boot
+# additional nodes if the total price of the already running nodes meets or
+# exceeds this threshold. The default of 0 means no limit.
+max_total_price = 0
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = gce
+
+# Shutdown windows define periods of time when a node may and may not
+# be shut down. These are windows in full minutes, separated by
+# commas. Counting from the time the node is booted, the node WILL
+# NOT shut down for N1 minutes; then it MAY shut down for N2 minutes;
+# then it WILL NOT shut down for N3 minutes; and so on. For example,
+# "54, 5, 1" means the node may shut down from the 54th to the 59th
+# minute of each hour of uptime.
+# GCE bills by the minute, and does not provide information about when
+# a node booted. Node Manager will store this information in metadata
+# when it boots a node; if that information is not available, it will
+# assume the node booted at the epoch. These shutdown settings are
+# very aggressive. You may want to adjust this if you want more
+# continuity of service from a single node.
+shutdown_windows = 20, 999999
+
+[Cloud Credentials]
+user_id = client_email_address@developer.gserviceaccount.com
+key = path_to_certificate.pem
+project = project-id-from-google-cloud-dashboard
+timeout = 60
+
+# Valid location (zone) names: https://cloud.google.com/compute/docs/zones
+datacenter = us-central1-a
+
+# Optional settings. For full documentation see
+# http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#libcloud.compute.drivers.gce.GCENodeDriver
+#
+# auth_type = SA # SA, IA or GCE
+# scopes = https://www.googleapis.com/auth/compute
+# credential_file =
+
+[Cloud List]
+# A comma-separated list of tags that must be applied to a node for it to
+# be considered a compute node.
+# The driver will automatically apply these tags to nodes it creates.
+tags = zyxwv, compute
+
+[Cloud Create]
+# New compute nodes will send pings to Arvados at this host.
+# You may specify a port, and use brackets to disambiguate IPv6 addresses.
+ping_host = hostname:port
+
+# A file path for an SSH key that can log in to the compute node.
+# ssh_key = path
+
+# The GCE image name and network zone name to use when creating new nodes.
+image = debian-7
+# network = your_network_name
+
+# JSON string of service account authorizations for this cluster.
+# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
+# service_accounts = [{'email':'account@example.com', 'scopes':['storage-ro']}]
+
+
+# You can define any number of Size sections to list node sizes you're
+# willing to use. The Node Manager should boot the cheapest size(s) that
+# can run jobs in the queue.
+#
+# The Size fields are interpreted the same way as with a libcloud NodeSize:
+# http://libcloud.readthedocs.org/en/latest/compute/api.html#libcloud.compute.base.NodeSize
+#
+# See https://cloud.google.com/compute/docs/machine-types for a list
+# of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs.
+# You can also override Google's provided data fields (such as price per hour)
+# by setting them here.
+
+[Size n1-standard-2]
+cores = 2
+price = 0.076
+scratch = 100
+
+[Size n1-standard-4]
+cores = 4
+price = 0.152
+scratch = 200
+</pre>
+
+h3(#azure). Microsoft Azure
+
+<pre>
+# Azure configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes. For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times. If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type. However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on the rate of spending (in $/hr): Node Manager will not boot
+# additional nodes if the total price of the already running nodes meets or
+# exceeds this threshold. The default of 0 means no limit.
+max_total_price = 0
+
+# Poll Azure nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down. Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 1800
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = azure
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down. These are windows in full minutes, separated by commas. Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on. For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# Azure bills by the minute, so it makes sense to aggressively shut down idle
+# nodes. Specify at least two windows. You can add as many as you need beyond
+# that.
+shutdown_windows = 20, 999999
+
+[Cloud Credentials]
+# Use "azure account list" with the azure CLI to get these values.
+tenant_id = 00000000-0000-0000-0000-000000000000
+subscription_id = 00000000-0000-0000-0000-000000000000
+
+# The following directions are based on
+# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
+#
+# azure config mode arm
+# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
+# azure ad sp create "<Application_Id>"
+# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/{subscriptionId}/
+#
+# Use <Application_Id> for "key" and the <Your_Password> for "secret"
+#
+key = 00000000-0000-0000-0000-000000000000
+secret = PASSWORD
+timeout = 60
+region = East US
+
+[Cloud List]
+# The resource group in which the compute node virtual machines will be created
+# and listed.
+ex_resource_group = ArvadosResourceGroup
+
+[Cloud Create]
+# The compute node image, as a link to a VHD in Azure blob store.
+image = https://example.blob.core.windows.net/system/Microsoft.Compute/Images/images/zyxwv-compute-osDisk.vhd
+
+# Path to a local ssh key file that will be used to provision new nodes.
+ssh_key = /home/arvadosuser/.ssh/id_rsa.pub
+
+# The account name for the admin user that will be provisioned on new nodes.
+ex_user_name = arvadosuser
+
+# The Azure storage account that will be used to store the node OS disk images.
+ex_storage_account = arvadosstorage
+
+# The virtual network the VMs will be associated with.
+ex_network = ArvadosNetwork
+
+# Optional subnet of the virtual network.
+#ex_subnet = default
+
+# Node tags
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+# The API server to ping.
+ping_host = hostname:port
+
+# You can define any number of Size sections to list Azure sizes you're willing
+# to use. The Node Manager should boot the cheapest size(s) that can run jobs
+# in the queue. You must also provide the price per hour, as the Azure
+# compute driver currently does not report prices.
+#
+# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
+# for a list of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override Microsoft's provided
+# data fields by setting them here.
+
+[Size Standard_D3]
+cores = 4
+price = 0.56
+
+[Size Standard_D4]
+cores = 8
+price = 1.12
+</pre>
+
+h2. Running
+
+<pre>
+$ arvados-node-manager --config /etc/arvados-node-manager/config.ini
+</pre>
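+
+To confirm it is working, watch the log file configured in the @[Logging]@ section above:
+
+<pre>
+$ tail -f /var/log/arvados/node-manager.log
+</pre>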
global cached_lookups_lock
with cached_lookups_lock:
if dockerRequirement["dockerImageId"] in cached_lookups:
- return cached_lookups[dockerRequirement["dockerImageId"]]
+ return dockerRequirement["dockerImageId"]
with SourceLine(dockerRequirement, "dockerImageId", WorkflowException):
sp = dockerRequirement["dockerImageId"].split(":")
if not images:
raise WorkflowException("Could not find Docker image %s:%s" % (image_name, image_tag))
- pdh = api_client.collections().get(uuid=images[0][0]).execute()["portable_data_hash"]
-
with cached_lookups_lock:
- cached_lookups[dockerRequirement["dockerImageId"]] = pdh
+ cached_lookups[dockerRequirement["dockerImageId"]] = True
- return pdh
+ return dockerRequirement["dockerImageId"]
def arv_docker_clear_cache():
global cached_lookups
'state': 'Committed',
'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
'output_path': '/var/spool/cwl',
- 'container_image': '99999999999999999999999999999993+99',
+ 'container_image': 'arvados/jobs',
'command': ['ls', '/var/spool/cwl'],
'cwd': '/var/spool/cwl',
'scheduling_parameters': {},
'state': 'Committed',
'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
'output_path': '/var/spool/cwl',
- 'container_image': '99999999999999999999999999999993+99',
+ 'container_image': 'arvados/jobs',
'command': ['ls'],
'cwd': '/var/spool/cwl',
'scheduling_parameters': {
'state': 'Committed',
'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
'output_path': '/var/spool/cwl',
- 'container_image': '99999999999999999999999999999993+99',
+ 'container_image': 'arvados/jobs',
'command': ['ls'],
'cwd': '/var/spool/cwl',
'scheduling_parameters': {
end
end
+ seen_last_class = false
klasses.each do |klass|
+    @offset = 0 if seen_last_class # reset offset for the next object type being processed
+
+ # if current klass is same as params['last_object_class'], mark that fact
+ seen_last_class = true if((params['count'].andand.==('none')) and
+ (params['last_object_class'].nil? or
+ params['last_object_class'].empty? or
+ params['last_object_class'] == klass.to_s))
+
+ # if klasses are specified, skip all other klass types
next if wanted_klasses.any? and !wanted_klasses.include?(klass.to_s)
+ # don't reprocess klass types that were already seen
+ next if params['count'] == 'none' and !seen_last_class
+
+    # don't process the rest of the object types if we already have the needed number of objects
+ break if params['count'] == 'none' and all_objects.size >= limit_all
+
# If the currently requested orders specifically match the
# table_name for the current klass, apply that order.
# Otherwise, order by recency.
[hash_part, size_part].compact.join '+'
end
- # Return array of Collection objects
- def self.find_all_for_docker_image(search_term, search_tag=nil, readers=nil)
+ def self.get_compatible_images(readers, pattern, collections)
+ if collections.empty?
+ return []
+ end
+
+ migrations = Hash[
+ Link.where('tail_uuid in (?) AND link_class=? AND links.owner_uuid=?',
+ collections.map(&:portable_data_hash),
+ 'docker_image_migration',
+ system_user_uuid).
+ order('links.created_at asc').
+ map { |l|
+ [l.tail_uuid, l.head_uuid]
+ }]
+
+ migrated_collections = Hash[
+ Collection.readable_by(*readers).
+ where('portable_data_hash in (?)', migrations.values).
+ map { |c|
+ [c.portable_data_hash, c]
+ }]
+
+ collections.map { |c|
+ # Check if the listed image is compatible first, if not, then try the
+ # migration link.
+ manifest = Keep::Manifest.new(c.manifest_text)
+ if manifest.exact_file_count?(1) and manifest.files[0][1] =~ pattern
+ c
+ elsif m = migrated_collections[migrations[c.portable_data_hash]]
+ manifest = Keep::Manifest.new(m.manifest_text)
+ if manifest.exact_file_count?(1) and manifest.files[0][1] =~ pattern
+ m
+ end
+ end
+ }.compact
+ end
+
+ # Resolve a Docker repo+tag, hash, or collection PDH to an array of
+ # Collection objects, sorted by timestamp starting with the most recent
+ # match.
+ #
+ # If filter_compatible_format is true (the default), only return image
+  # collections which are supported by the installation, as indicated by
+  # Rails.configuration.docker_image_formats. Will follow
+  # 'docker_image_migration' links if search_term resolves to an incompatible
+  # image but an equivalent compatible image is available.
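+  # Example (repository name and tag are illustrative):
+  #   Collection.find_all_for_docker_image('arvados/jobs', 'latest')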
+ def self.find_all_for_docker_image(search_term, search_tag=nil, readers=nil, filter_compatible_format: true)
readers ||= [Thread.current[:user]]
base_search = Link.
readable_by(*readers).
joins("JOIN collections ON links.head_uuid = collections.uuid").
order("links.created_at DESC")
+ if (Rails.configuration.docker_image_formats.include? 'v1' and
+ Rails.configuration.docker_image_formats.include? 'v2') or filter_compatible_format == false
+ pattern = /^(sha256:)?[0-9A-Fa-f]{64}\.tar$/
+ elsif Rails.configuration.docker_image_formats.include? 'v2'
+ pattern = /^(sha256:)[0-9A-Fa-f]{64}\.tar$/
+ elsif Rails.configuration.docker_image_formats.include? 'v1'
+ pattern = /^[0-9A-Fa-f]{64}\.tar$/
+ else
+ raise "Unrecognized configuration for docker_image_formats #{Rails.configuration.docker_image_formats}"
+ end
+
# If the search term is a Collection locator that contains one file
# that looks like a Docker image, return it.
if loc = Keep::Locator.parse(search_term)
loc.strip_hints!
- coll_match = readable_by(*readers).where(portable_data_hash: loc.to_s).limit(1).first
- if coll_match
- # Check if the Collection contains exactly one file whose name
- # looks like a saved Docker image.
- manifest = Keep::Manifest.new(coll_match.manifest_text)
- if manifest.exact_file_count?(1) and
- (manifest.files[0][1] =~ /^(sha256:)?[0-9A-Fa-f]{64}\.tar$/)
- return [coll_match]
- end
- end
+ coll_match = readable_by(*readers).where(portable_data_hash: loc.to_s).limit(1)
+ return get_compatible_images(readers, pattern, coll_match)
end
if search_tag.nil? and (n = search_term.index(":"))
# so that anything with an image timestamp is considered more recent than
# anything without; then we use the link's created_at as a tiebreaker.
uuid_timestamps = {}
- matches.all.map do |link|
+ matches.each do |link|
uuid_timestamps[link.head_uuid] = [(-link.properties["image_timestamp"].to_datetime.to_i rescue 0),
-link.created_at.to_i]
+ end
+
+ sorted = Collection.where('uuid in (?)', uuid_timestamps.keys).sort_by { |c|
+ uuid_timestamps[c.uuid]
+ }
+ compatible = get_compatible_images(readers, pattern, sorted)
+ if sorted.length > 0 and compatible.empty?
+ raise ArvadosModel::UnresolvableContainerError.new "Matching Docker image is incompatible with 'docker_image_formats' configuration."
end
- Collection.where('uuid in (?)', uuid_timestamps.keys).sort_by { |c| uuid_timestamps[c.uuid] }
+ compatible
end
def self.for_latest_docker_image(search_term, search_tag=nil, readers=nil)
find_all_for_docker_image(search_term, search_tag, readers).first
end
- # If the given pdh is an old-format docker image, old-format images
- # aren't supported by the compute nodes according to site config,
- # and a migrated new-format image is available, return the migrated
- # image's pdh. Otherwise, just return pdh.
- def self.docker_migration_pdh(read_users, pdh)
- if Rails.configuration.docker_image_formats.include?('v1')
- return pdh
- end
- Collection.readable_by(*read_users).
- joins('INNER JOIN links ON head_uuid=portable_data_hash').
- where('tail_uuid=? AND link_class=? AND links.owner_uuid=?',
- pdh, 'docker_image_migration', system_user_uuid).
- order('links.created_at desc').
- select('portable_data_hash').
- first.andand.portable_data_hash || pdh
- end
-
def self.searchable_columns operator
super - ["manifest_text"]
end
if !coll
raise ArvadosModel::UnresolvableContainerError.new "docker image #{container_image.inspect} not found"
end
- return Collection.docker_migration_pdh([current_user], coll.portable_data_hash)
+ coll.portable_data_hash
end
def set_container
image_hashes = Array.wrap(operand).flat_map do |search_term|
image_search, image_tag = search_term.split(':', 2)
Collection.
- find_all_for_docker_image(image_search, image_tag, read_users).
+ find_all_for_docker_image(image_search, image_tag, read_users, filter_compatible_format: false).
map(&:portable_data_hash)
end
filters << [attr, operator.sub(/ docker$/, ""), image_hashes]
else
image_locator = nil
end
- filters << ["docker_image_locator", "=",
- Collection.docker_migration_pdh(read_users, image_locator)]
+ filters << ["docker_image_locator", "=", image_locator]
if sdk_version = attrs[:runtime_constraints].andand["arvados_sdk_version"]
filters += default_git_filters("arvados_sdk_version", "arvados", sdk_version)
end
[false, "not found for #{image_search}"]
end
end
- Rails.logger.info("docker_image_locator is #{docker_image_locator}")
- if docker_image_locator && docker_image_locator_changed?
- self.docker_image_locator =
- Collection.docker_migration_pdh([current_user], docker_image_locator)
- end
- Rails.logger.info("docker_image_locator is #{docker_image_locator}")
end
def permission_to_update
git_internal_dir: <%= Rails.root.join 'tmp', 'internal.git' %>
websocket_address: <% if ENV['ARVADOS_TEST_EXPERIMENTAL_WS'] %>"wss://0.0.0.0:<%= ENV['ARVADOS_TEST_WSS_PORT'] %>/websocket"<% else %>false<% end %>
trash_sweep_interval: -1
+ docker_image_formats: ["v1"]
end
end
end
+
+ test "full text search with count='none'" do
+ @controller = Arvados::V1::GroupsController.new
+ authorize_with :admin
+
+ get :contents, {
+ format: :json,
+ count: 'none',
+ limit: 1000,
+ filters: [['any', '@@', Rails.configuration.uuid_prefix]],
+ }
+
+ assert_response :success
+
+ all_objects = Hash.new(0)
+ json_response['items'].map{|o| o['kind']}.each{|t| all_objects[t] += 1}
+
+ assert_equal true, all_objects['arvados#group']>0
+ assert_equal true, all_objects['arvados#job']>0
+ assert_equal true, all_objects['arvados#pipelineInstance']>0
+ assert_equal true, all_objects['arvados#pipelineTemplate']>0
+
+    # Perform the test again, mimicking a second page request with
+    # last_object_class = PipelineInstance and offset = 5. Groups and jobs
+    # should not be included in the response, because they were fully
+    # returned in page 1, along with the first 5 pipeline instances. The
+    # remaining pipeline instances and all other object types should be
+    # included in the response.
+
+ @test_counter = 0 # Reset executed action counter
+
+ @controller = Arvados::V1::GroupsController.new
+
+ get :contents, {
+ format: :json,
+ count: 'none',
+ limit: 1000,
+ offset: '5',
+ last_object_class: 'PipelineInstance',
+ filters: [['any', '@@', Rails.configuration.uuid_prefix]],
+ }
+
+ assert_response :success
+
+ second_page = Hash.new(0)
+ json_response['items'].map{|o| o['kind']}.each{|t| second_page[t] += 1}
+
+ assert_equal false, second_page.include?('arvados#group')
+ assert_equal false, second_page.include?('arvados#job')
+ assert_equal true, second_page['arvados#pipelineInstance']>0
+ assert_equal all_objects['arvados#pipelineInstance'], second_page['arvados#pipelineInstance']+5
+ assert_equal true, second_page['arvados#pipelineTemplate']>0
+ end
end
test "container_image_for_container(pdh)" do
set_user_from_auth :active
- [:docker_image, :docker_image_1_12].each do |coll|
+ [[:docker_image, 'v1'], [:docker_image_1_12, 'v2']].each do |coll, ver|
+ Rails.configuration.docker_image_formats = [ver]
pdh = collections(coll).portable_data_hash
cr = ContainerRequest.new(container_image: pdh)
resolved = cr.send :container_image_for_container
Rails.configuration.docker_image_formats = ['v2']
add_docker19_migration_link
+    # Test that it returns only v2 images even though the request is for a v1 image.
+
set_user_from_auth :active
cr = create_minimal_req!(command: ["true", "1"],
container_image: collections(:docker_image).portable_data_hash)
collections(:docker_image_1_12).portable_data_hash)
end
+ test "use unmigrated docker image" do
+ Rails.configuration.docker_image_formats = ['v1']
+ add_docker19_migration_link
+
+ # Test that it returns only supported v1 images even though there is a
+ # migration link.
+
+ set_user_from_auth :active
+ cr = create_minimal_req!(command: ["true", "1"],
+ container_image: collections(:docker_image).portable_data_hash)
+ assert_equal(cr.send(:container_image_for_container),
+ collections(:docker_image).portable_data_hash)
+
+ cr = create_minimal_req!(command: ["true", "2"],
+ container_image: links(:docker_image_collection_tag).name)
+ assert_equal(cr.send(:container_image_for_container),
+ collections(:docker_image).portable_data_hash)
+ end
+
+ test "incompatible docker image v1" do
+ Rails.configuration.docker_image_formats = ['v1']
+ add_docker19_migration_link
+
+ # Don't return unsupported v2 image even if we ask for it directly.
+ set_user_from_auth :active
+ cr = create_minimal_req!(command: ["true", "1"],
+ container_image: collections(:docker_image_1_12).portable_data_hash)
+ assert_raises(ArvadosModel::UnresolvableContainerError) do
+ cr.send(:container_image_for_container)
+ end
+ end
+
+ test "incompatible docker image v2" do
+ Rails.configuration.docker_image_formats = ['v2']
+    # No migration link; don't return the unsupported v1 image.
+
+ set_user_from_auth :active
+ cr = create_minimal_req!(command: ["true", "1"],
+ container_image: collections(:docker_image).portable_data_hash)
+ assert_raises(ArvadosModel::UnresolvableContainerError) do
+ cr.send(:container_image_for_container)
+ end
+ cr = create_minimal_req!(command: ["true", "2"],
+ container_image: links(:docker_image_collection_tag).name)
+ assert_raises(ArvadosModel::UnresolvableContainerError) do
+ cr.send(:container_image_for_container)
+ end
+ end
+
test "requestor can retrieve container owned by dispatch" do
assert_not_empty Container.readable_by(users(:admin)).where(uuid: containers(:running).uuid)
assert_not_empty Container.readable_by(users(:active)).where(uuid: containers(:running).uuid)
].each do |existing_image, request_image, expect_image|
test "if a #{existing_image} job exists, #{request_image} yields #{expect_image} after migration" do
Rails.configuration.docker_image_formats = ['v1']
- oldjob = Job.create!(
- job_attrs(
- script: 'foobar1',
- runtime_constraints: {
- 'docker_image' => collections(existing_image).portable_data_hash}))
- oldjob.reload
- assert_equal(oldjob.docker_image_locator,
- collections(existing_image).portable_data_hash)
+
+ if existing_image == :docker_image
+ oldjob = Job.create!(
+ job_attrs(
+ script: 'foobar1',
+ runtime_constraints: {
+ 'docker_image' => collections(existing_image).portable_data_hash}))
+ oldjob.reload
+ assert_equal(oldjob.docker_image_locator,
+ collections(existing_image).portable_data_hash)
+ elsif existing_image == :docker_image_1_12
+ assert_raises(ActiveRecord::RecordInvalid,
+ "Should not resolve v2 image when only v1 is supported") do
+ oldjob = Job.create!(
+ job_attrs(
+ script: 'foobar1',
+ runtime_constraints: {
+ 'docker_image' => collections(existing_image).portable_data_hash}))
+ end
+ end
Rails.configuration.docker_image_formats = ['v2']
add_docker19_migration_link
+ # Check that both v1 and v2 images get resolved to v2.
newjob = Job.create!(
job_attrs(
script: 'foobar1',
def _destroy_node(self):
if self._nodename:
- self._set_node_state(self._nodename, 'DOWN')
+ self._set_node_state(self._nodename, 'DOWN', 'Reason=Node Manager shutdown')
super(ComputeNodeShutdownActor, self)._destroy_node()
return s
def _node_states(self, size):
- states = pykka.get_all(rec.actor.get_state()
- for rec in self.cloud_nodes.nodes.itervalues()
- if ((size is None or rec.cloud_node.size.id == size.id) and
- rec.shutdown_actor is None))
- states += ['shutdown' for rec in self.cloud_nodes.nodes.itervalues()
- if ((size is None or rec.cloud_node.size.id == size.id) and
- rec.shutdown_actor is not None)]
- return states
+ proxy_states = []
+ states = []
+ for rec in self.cloud_nodes.nodes.itervalues():
+ if size is None or rec.cloud_node.size.id == size.id:
+ if rec.shutdown_actor is None and rec.actor is not None:
+ proxy_states.append(rec.actor.get_state())
+ else:
+ states.append("shutdown")
+ return states + pykka.get_all(proxy_states)
def _state_counts(self, size):
states = self._node_states(size)