minitest (~> 5.1)
thread_safe (~> 0.1)
tzinfo (~> 1.1)
- addressable (2.3.6)
+ addressable (2.4.0)
andand (1.3.3)
angularjs-rails (1.3.8)
arel (5.0.1.20140414130214)
- arvados (0.1.20150511150219)
- activesupport (>= 3.2.13)
+ arvados (0.1.20160420143004)
+ activesupport (>= 3, < 4.2.6)
andand (~> 1.3, >= 1.3.3)
- google-api-client (~> 0.6.3, >= 0.6.3)
+ google-api-client (>= 0.7, < 0.9)
+ i18n (~> 0)
json (~> 1.7, >= 1.7.7)
- jwt (>= 0.1.5, < 1.0.0)
+ jwt (>= 0.1.5, < 2)
autoparse (0.3.3)
addressable (>= 2.3.1)
extlib (>= 0.9.15)
erubis (2.7.0)
execjs (2.2.2)
extlib (0.9.16)
- faraday (0.8.9)
- multipart-post (~> 1.2.0)
+ faraday (0.9.2)
+ multipart-post (>= 1.2, < 3)
fast_stack (0.1.0)
rake
rake-compiler
ffi (1.9.10)
flamegraph (0.1.0)
fast_stack
- google-api-client (0.6.4)
- addressable (>= 2.3.2)
- autoparse (>= 0.3.3)
- extlib (>= 0.9.15)
- faraday (~> 0.8.4)
- jwt (>= 0.1.5)
- launchy (>= 2.1.1)
- multi_json (>= 1.0.0)
- signet (~> 0.4.5)
- uuidtools (>= 2.1.0)
+ google-api-client (0.8.6)
+ activesupport (>= 3.2)
+ addressable (~> 2.3)
+ autoparse (~> 0.3)
+ extlib (~> 0.9)
+ faraday (~> 0.9)
+ googleauth (~> 0.3)
+ launchy (~> 2.4)
+ multi_json (~> 1.10)
+ retriable (~> 1.4)
+ signet (~> 0.6)
+ googleauth (0.5.1)
+ faraday (~> 0.9)
+ jwt (~> 1.4)
+ logging (~> 2.0)
+ memoist (~> 0.12)
+ multi_json (~> 1.11)
+ os (~> 0.9)
+ signet (~> 0.7)
headless (1.0.2)
highline (1.6.21)
httpclient (2.6.0.1)
railties (>= 3.0, < 5.0)
thor (>= 0.14, < 2.0)
json (1.8.3)
- jwt (0.1.13)
- multi_json (>= 1.5)
+ jwt (1.5.4)
launchy (2.4.3)
addressable (~> 2.3)
less (2.6.0)
actionpack (>= 3.1)
less (~> 2.6.0)
libv8 (3.16.14.7)
+ little-plugger (1.1.4)
+ logging (2.1.0)
+ little-plugger (~> 1.1)
+ multi_json (~> 1.10)
mail (2.6.3)
mime-types (>= 1.16, < 3)
+ memoist (0.14.0)
metaclass (0.0.4)
mime-types (2.99)
mini_portile (0.6.2)
- minitest (5.7.0)
+ minitest (5.8.4)
mocha (1.1.0)
metaclass (~> 0.0.1)
morrisjs-rails (0.5.1)
railties (> 3.1, < 5)
- multi_json (1.11.2)
- multipart-post (1.2.0)
+ multi_json (1.12.0)
+ multipart-post (2.0.0)
net-scp (1.2.1)
net-ssh (>= 2.6.5)
net-sftp (2.1.2)
nokogiri (1.6.6.4)
mini_portile (~> 0.6.0)
oj (2.11.2)
+ os (0.9.6)
passenger (4.0.57)
daemon_controller (>= 1.2.0)
rack
rake
raphael-rails (2.1.2)
ref (1.0.5)
+ retriable (1.4.1)
ruby-debug-passenger (0.2.0)
ruby-prof (0.15.2)
rubyzip (1.1.7)
multi_json (~> 1.0)
rubyzip (~> 1.0)
websocket (~> 1.0)
- signet (0.4.5)
- addressable (>= 2.2.3)
- faraday (~> 0.8.1)
- jwt (>= 0.1.5)
- multi_json (>= 1.0.0)
+ signet (0.7.2)
+ addressable (~> 2.3)
+ faraday (~> 0.9)
+ jwt (~> 1.5)
+ multi_json (~> 1.10)
simplecov (0.9.1)
docile (~> 1.1.0)
multi_json (~> 1.0)
uglifier (2.7.0)
execjs (>= 0.3.0)
json (>= 1.8.0)
- uuidtools (2.1.5)
websocket (1.2.2)
websocket-driver (0.5.1)
websocket-extensions (>= 0.1.0)
therubyracer
uglifier (>= 1.0.3)
wiselinks
+
+BUNDLED WITH
+ 1.12.1
raw("<span class='utc-date' data-utc-date='#{date}' data-utc-date-opts='noseconds'>#{date}</span>")
end
+ def render_time duration, use_words, round_to_min=true
+ render_runtime duration, use_words, round_to_min
+ end
+
private
def is_textile?( object, attr )
is_textile = object.textile_attributes.andand.include?(attr)
def determine_wallclock_runtime jobs
timestamps = []
jobs.each do |j|
- insert_at = 0
- started_at = j[:started_at]
- finished_at = (if j[:finished_at] then j[:finished_at] else Time.now end)
+ started_at = (j.started_at if j.respond_to?(:started_at)) || (j[:started_at] if j.is_a?(Hash))
+ finished_at = (j.finished_at if j.respond_to?(:finished_at)) || (j[:finished_at] if j.is_a?(Hash)) || Time.now
if started_at
timestamps = merge_range timestamps, started_at, finished_at
end
stderr_log_query(limit).results.reverse.
flat_map { |log| log.properties[:text].split("\n") rescue [] }
end
+
+ def work_unit(label=nil)
+ JobWorkUnit.new(self, label)
+ end
end
class JobTask < ArvadosBase
+ def work_unit(label=nil)
+ JobTaskWorkUnit.new(self, label)
+ end
end
--- /dev/null
+class JobTaskWorkUnit < ProxyWorkUnit
+ def title
+ "job task"
+ end
+end
--- /dev/null
+class JobWorkUnit < ProxyWorkUnit
+ def children
+ return @my_children if @my_children
+
+ # Jobs components
+ items = []
+ components = get(:components)
+ uuids = components.andand.collect {|_, v| v}
+ return items if (!uuids or uuids.empty?)
+
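+    # Group the component uuids by resource class so each class can be fetched
+    # with a single where() query instead of one request per uuid.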
+ rcs = {}
+ uuids.each do |u|
+ r = ArvadosBase::resource_class_for_uuid(u)
+ rcs[r] = [] unless rcs[r]
+ rcs[r] << u
+ end
+ rcs.each do |rc, ids|
+ rc.where(uuid: ids).each do |obj|
+ items << obj.work_unit(components.key(obj.uuid))
+ end
+ end
+
+ @my_children = items
+ end
+
+ def child_summary
+ if children.any?
+ super
+ else
+ get(:tasks_summary)
+ end
+ end
+
+ def parameters
+ get(:script_parameters)
+ end
+
+ def repository
+ get(:repository)
+ end
+
+ def script
+ get(:script)
+ end
+
+ def script_version
+ get(:script_version)
+ end
+
+ def supplied_script_version
+ get(:supplied_script_version)
+ end
+
+ def docker_image
+ get(:docker_image_locator)
+ end
+
+ def nondeterministic
+ get(:nondeterministic)
+ end
+
+ def runtime_constraints
+ get(:runtime_constraints)
+ end
+
+ def priority
+ get(:priority)
+ end
+
+ def log_collection
+ get(:log)
+ end
+
+ def output
+ get(:output)
+ end
+
+ def can_cancel?
+ state_label.in? ["Queued", "Running"]
+ end
+
+ def uri
+ uuid = get(:uuid)
+ "/jobs/#{uuid}"
+ end
+
+ def title
+ "job"
+ end
+end
end
end
+ def work_unit(label=nil)
+ PipelineInstanceWorkUnit.new(self, label || self.name)
+ end
+
private
def components_map
--- /dev/null
+class PipelineInstanceWorkUnit < ProxyWorkUnit
+ def children
+ return @my_children if @my_children
+
+ items = []
+
+ jobs = {}
+ results = Job.where(uuid: @proxied.job_ids.values).results
+ results.each do |j|
+ jobs[j.uuid] = j
+ end
+
+ components = get(:components)
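+    # Components whose job has already been created are represented by that
+    # Job's work unit; otherwise the raw component hash is wrapped in a
+    # JobWorkUnit. A non-Hash component is treated as unreadable.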
+ components.each do |name, c|
+ if c.is_a?(Hash)
+ job = c[:job]
+ if job
+ if job[:uuid] and jobs[job[:uuid]]
+ items << jobs[job[:uuid]].work_unit(name)
+ else
+ items << JobWorkUnit.new(job, name)
+ end
+ else
+ items << JobWorkUnit.new(c, name)
+ end
+ else
+ @unreadable_children = true
+ break
+ end
+ end
+
+ @my_children = items
+ end
+
+ def uri
+ uuid = get(:uuid)
+ "/pipeline_instances/#{uuid}"
+ end
+
+ def title
+ "pipeline"
+ end
+end
--- /dev/null
+class ProxyWorkUnit < WorkUnit
+ require 'time'
+
+ attr_accessor :lbl
+ attr_accessor :proxied
+ attr_accessor :my_children
+ attr_accessor :unreadable_children
+
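+  # proxied may be either an ArvadosBase model object or a plain Hash (for
+  # example, a job hash embedded in a pipeline's components); see #get below.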
+ def initialize proxied, label
+ @lbl = label
+ @proxied = proxied
+ end
+
+ def label
+ @lbl
+ end
+
+ def uuid
+ get(:uuid)
+ end
+
+ def modified_by_user_uuid
+ get(:modified_by_user_uuid)
+ end
+
+ def created_at
+ t = get(:created_at)
+ t = Time.parse(t) if (t.andand.class == String)
+ t
+ end
+
+ def started_at
+ t = get(:started_at)
+ t = Time.parse(t) if (t.andand.class == String)
+ t
+ end
+
+ def finished_at
+ t = get(:finished_at)
+ t = Time.parse(t) if (t.andand.class == String)
+ t
+ end
+
+ def state_label
+ state = get(:state)
+ if ["Running", "RunningOnServer", "RunningOnClient"].include? state
+ "Running"
+ else
+ state
+ end
+ end
+
+ def state_bootstrap_class
+ state = get(:state)
+ case state
+ when 'Complete'
+ 'success'
+ when 'Failed', 'Cancelled'
+ 'danger'
+ when 'Running', 'RunningOnServer', 'RunningOnClient'
+ 'info'
+ else
+ 'default'
+ end
+ end
+
+ def success?
+ state = get(:state)
+ if state == 'Complete'
+ true
+ elsif state == 'Failed' or state == 'Cancelled'
+ false
+ else
+ nil
+ end
+ end
+
+ def child_summary
+ done = 0
+ failed = 0
+ todo = 0
+ running = 0
+ children.each do |c|
+ case c.state_label
+ when 'Complete'
+ done = done+1
+ when 'Failed', 'Cancelled'
+ failed = failed+1
+ when 'Running'
+ running = running+1
+ else
+ todo = todo+1
+ end
+ end
+
+ summary = {}
+ summary[:done] = done
+ summary[:failed] = failed
+ summary[:todo] = todo
+ summary[:running] = running
+ summary
+ end
+
+ def child_summary_str
+ summary = child_summary
+ summary_txt = ''
+
+ if state_label == 'Running'
+ done = summary[:done] || 0
+ running = summary[:running] || 0
+ failed = summary[:failed] || 0
+ todo = summary[:todo] || 0
+ total = done + running + failed + todo
+
+ if total > 0
+ summary_txt += "#{summary[:done]} #{'child'.pluralize(summary[:done])} done,"
+ summary_txt += "#{summary[:failed]} failed,"
+ summary_txt += "#{summary[:running]} running,"
+ summary_txt += "#{summary[:todo]} pending"
+ end
+ end
+ summary_txt
+ end
+
+ def progress
+ state = get(:state)
+ if state == 'Complete'
+ return 1.0
+ elsif state == 'Failed' or state == 'Cancelled'
+ return 0.0
+ end
+
+ summary = child_summary
+ return 0.0 if summary.nil?
+
+ done = summary[:done] || 0
+ running = summary[:running] || 0
+ failed = summary[:failed] || 0
+ todo = summary[:todo] || 0
+ total = done + running + failed + todo
+ if total > 0
+ (done+failed).to_f / total
+ else
+ 0.0
+ end
+ end
+
+ def children
+ []
+ end
+
+ def title
+ "process"
+ end
+
+ def has_unreadable_children
+ @unreadable_children
+ end
+
+ def readable?
+ resource_class = ArvadosBase::resource_class_for_uuid(uuid)
+ resource_class.where(uuid: [uuid]).first rescue nil
+ end
+
+ def link_to_log
+ if state_label.in? ["Complete", "Failed", "Cancelled"]
+ lc = log_collection
+ if lc
+ logCollection = Collection.find? lc
+ if logCollection
+ ApplicationController.helpers.link_to("Log", "#{uri}#Log")
+ else
+ "Log unavailable"
+ end
+ end
+ elsif state_label == "Running"
+ if readable?
+ ApplicationController.helpers.link_to("Log", "#{uri}#Log")
+ else
+ "Log unavailable"
+ end
+ end
+ end
+
+ def walltime
+ if state_label != "Queued"
+ if started_at
+ ((if finished_at then finished_at else Time.now() end) - started_at)
+ end
+ end
+ end
+
+ def cputime
+ if state_label != "Queued"
+ if started_at
+ (runtime_constraints.andand[:min_nodes] || 1) * ((finished_at || Time.now()) - started_at)
+ end
+ end
+ end
+
+ def queuedtime
+ if state_label == "Queued"
+ Time.now - Time.parse(created_at.to_s)
+ end
+ end
+
+ def is_running?
+ state_label == 'Running'
+ end
+
+ def is_paused?
+ state_label == 'Paused'
+ end
+
+ def is_finished?
+ state_label.in? ["Complete", "Failed", "Cancelled"]
+ end
+
+ def is_failed?
+ state_label == 'Failed'
+ end
+
+ def show_runtime
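+    # Build an HTML summary of this work unit's wall clock, queue, and node
+    # allocation time, using the children's runtimes when children exist.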
+ runningtime = ApplicationController.helpers.determine_wallclock_runtime(if children.any? then children else [self] end)
+
+ walltime = 0
+ if started_at
+ walltime = if finished_at then (finished_at - started_at) else (Time.now - started_at) end
+ end
+
+ resp = '<p>'
+
+ if started_at
+ resp << "This #{title} started at "
+ resp << ApplicationController.helpers.render_localized_date(started_at)
+ resp << ". It "
+ if state_label == 'Complete'
+ resp << "completed in "
+ elsif state_label == 'Failed'
+ resp << "failed after "
+ else
+ resp << "has been active for "
+ end
+
+ if walltime > runningtime
+ resp << ApplicationController.helpers.render_time(walltime, false)
+ else
+ resp << ApplicationController.helpers.render_time(runningtime, false)
+ end
+
+ if finished_at
+ resp << " at "
+ resp << ApplicationController.helpers.render_localized_date(finished_at)
+ end
+ resp << "."
+ else
+ if state_label
+ resp << "This #{title} is "
+ resp << if state_label == 'Running' then 'active' else state_label.downcase end
+ resp << "."
+ end
+ end
+
+ if is_failed?
+ resp << " Check the Log tab for more detail about why it failed."
+ end
+ resp << "</p>"
+
+ resp << "<p>"
+ if state_label
+ resp << "It "
+ if state_label == 'Running'
+ resp << "has run"
+ else
+ resp << "ran"
+ end
+ resp << " for "
+
+ cpu_time = 0
+ if children.any?
+ cpu_time = children.map { |c|
+ if c.started_at
+ (c.runtime_constraints.andand[:min_nodes] || 1) * ((c.finished_at || Time.now()) - c.started_at)
+ else
+ 0
+ end
+ }.reduce(:+) || 0
+ else
+ if started_at
+ cpu_time = (runtime_constraints.andand[:min_nodes] || 1) * ((finished_at || Time.now()) - started_at)
+ end
+ end
+
+ resp << ApplicationController.helpers.render_time(runningtime, false)
+ if (walltime - runningtime) > 0
+ resp << "("
+ resp << ApplicationController.helpers.render_time(walltime - runningtime, false)
+ resp << "queued)"
+ end
+ if cpu_time == 0
+ resp << "."
+ else
+ resp << " and used "
+ resp << ApplicationController.helpers.render_time(cpu_time, false)
+ resp << " of node allocation time ("
+ resp << (cpu_time/runningtime).round(1).to_s
+ resp << "⨯ scaling)."
+ end
+ end
+ resp << "</p>"
+
+ resp
+ end
+
+ protected
+
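+  # Read an attribute from the proxied object: call the reader method if the
+  # object responds to it (e.g. an ArvadosBase model), otherwise index it as a
+  # Hash (e.g. a job hash embedded in pipeline components).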
+ def get key
+ if @proxied.respond_to? key
+ @proxied.send(key)
+ elsif @proxied.is_a?(Hash)
+ @proxied[key]
+ end
+ end
+end
--- /dev/null
+class WorkUnit
+ # This is an abstract class that documents the WorkUnit interface
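+  #
+  # A hypothetical usage sketch (names are illustrative only): a caller obtains
+  # a work unit from a model and reads its state, e.g.
+  #   wu = obj.work_unit("some label")
+  #   wu.state_label   # => "Running"
+  #   wu.progress      # => a number between 0 and 1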
+
+ def label
+ # returns the label that was assigned when creating the work unit
+ end
+
+ def uuid
+ # returns the arvados UUID of the underlying object
+ end
+
+ def children
+ # returns an array of child work units
+ end
+
+ def modified_by_user_uuid
+ # returns uuid of the user who modified this work unit most recently
+ end
+
+ def created_at
+ # returns created_at timestamp
+ end
+
+ def started_at
+ # returns started_at timestamp for this work unit
+ end
+
+ def finished_at
+ # returns finished_at timestamp
+ end
+
+ def state_label
+ # returns a string representing state of the work unit
+ end
+
+ def state_bootstrap_class
+ # returns a class like "danger", "success", or "warning" that a view can use directly to make a display class
+ end
+
+ def success?
+ # returns true if the work unit finished successfully,
+ # false if it has a permanent failure,
+ # and nil if the final state is not determined.
+ end
+
+ def progress
+ # returns a number between 0 and 1
+ end
+
+ def log_collection
+ # returns uuid or pdh with saved log data, if any
+ end
+
+ def parameters
+ # returns work unit parameters, if any
+ end
+
+ def script
+ # returns script for this work unit, if any
+ end
+
+ def repository
+ # returns this work unit's script repository, if any
+ end
+
+ def script_version
+ # returns this work unit's script_version, if any
+ end
+
+ def supplied_script_version
+ # returns this work unit's supplied_script_version, if any
+ end
+
+ def docker_image
+ # returns this work unit's docker_image, if any
+ end
+
+ def runtime_constraints
+ # returns this work unit's runtime_constraints, if any
+ end
+
+ def priority
+ # returns this work unit's priority, if any
+ end
+
+ def nondeterministic
+ # returns if this is nondeterministic
+ end
+
+ def output
+ # returns uuid or pdh of output data, if any
+ end
+
+ def child_summary
+ # summary status of any children of this work unit
+ end
+
+ def child_summary_str
+ # textual representation of child summary
+ end
+
+ def can_cancel?
+ # returns true if this work unit can be canceled
+ end
+
+ def readable?
+ # is the proxied object readable by current user?
+ end
+
+ def uri
+ # returns the uri for this work unit
+ end
+
+ def title
+ # title for the work unit
+ end
+
+ def has_unreadable_children
+    # returns true if this work unit has children that the current user cannot read
+ end
+
+ # view helper methods
+ def link_to_log
+ # display a link to log if present
+ end
+
+ def walltime
+ # return walltime for a running or completed work unit
+ end
+
+ def cputime
+ # return cputime for a running or completed work unit
+ end
+
+ def queuedtime
+ # return queued time if the work unit is queued
+ end
+
+ def is_running?
+ # is the work unit in running state?
+ end
+
+ def is_paused?
+ # is the work unit in paused state?
+ end
+
+ def is_finished?
+ # is the work unit in finished state?
+ end
+
+ def is_failed?
+ # is this work unit in failed state?
+ end
+end
pj[:progress_bar] = render(partial: "job_progress",
locals: {:j => @object })
tasks = JobTask.filter([['job_uuid', '=', @object.uuid]]).results
- render(partial: 'pipeline_instances/running_component',
- locals: { tasks: tasks, pj: pj, i: 0, expanded: true})
+ render(partial: 'work_unit/show_component', locals: {wu: @object.work_unit(@object[:name] || "this job")})
%>
<div class="panel panel-default">
<div class="col-md-6">
<% queuetime = Time.now - Time.parse(current_job[:created_at].to_s) %>
Queued for <%= render_runtime(queuetime, false) %>.
- <% begin %>
- <% if current_job[:queue_position] == 0 %>
- This job is next in the queue to run.
- <% elsif current_job[:queue_position] == 1 %>
- There is 1 job in the queue ahead of this one.
- <% elsif current_job[:queue_position] %>
- There are <%= current_job[:queue_position] %> jobs in the queue ahead of this one.
- <% end %>
- <% rescue %>
- <% end %>
</div>
<% elsif current_job[:state] == "Running" %>
<%# column offset 8 %>
data-object-uuids="<%= @object.uuid %> <%= job_uuids.join(' ') %>"
></div>
- <%= render_pipeline_components("running", :json) %>
+ <%= render partial: 'work_unit/show_component', locals: {wu: @object.work_unit(@object.name)} %>
<% else %>
<%# state is either New or Ready %>
<pre><%= Oj.dump(@object.components, indent: 2) %></pre>
</div>
</div>
+ <% if backtrace %>
<div class="panel-heading">
<h4 class="panel-title">
<a data-toggle="collapse" data-parent="#components-accordion" href="#components-backtrace">
<pre><%= backtrace %></pre>
</div>
</div>
+ <% end %>
</div>
--- /dev/null
+ <div class="container">
+ <div class="row">
+ <div class="col-md-5">
+ <% if current_obj.uuid.nil? %>
+ No <%= current_obj.title %> has been submitted yet.
+ <% else %>
+ <table>
+ <% [:uuid, :modified_by_user_uuid, :created_at, :started_at, :finished_at, :output, :priority].each do |k| %>
+ <% val = current_obj.send(k) if current_obj.respond_to?(k) %>
+ <% if val %>
+ <tr>
+ <td style="padding-right: 1em">
+ <%= k.to_s %>:
+ </td>
+ <td>
+ <% if k == :uuid %>
+ <%= link_to_arvados_object_if_readable(val, val, link_text: val) %>
+ <% elsif k.to_s.end_with? 'uuid' %>
+ <%= link_to_arvados_object_if_readable(val, val, friendly_name: true) %>
+ <% elsif k.to_s.end_with? '_at' %>
+ <%= render_localized_date(val) %>
+ <% elsif k == :output %>
+ <%= link_to_arvados_object_if_readable(val, 'Output data not available', friendly_name: true) %>
+ <% else %>
+ <%= val %>
+ <% end %>
+ </td>
+ </tr>
+ <% end %>
+ <% end %>
+ </table>
+ <% end %>
+ </div>
+ <div class="col-md-6">
+ <table>
+ <% # link to repo tree/file only if the repo is readable
+ # and the commit is a sha1...
+ repo =
+ (/^[0-9a-f]{40}$/ =~ current_obj.script_version and
+ Repository.where(name: current_obj.repository).first)
+
+ # ...and the api server provides an http:// or https:// url
+ repo = nil unless repo.andand.http_fetch_url
+ %>
+ <% [:script, :repository, :script_version, :supplied_script_version, :nondeterministic].each do |k| %>
+ <% val = current_obj.send(k) if current_obj.respond_to?(k) %>
+ <% if val %>
+ <tr>
+ <td style="padding-right: 1em">
+ <%= k.to_s %>:
+ </td>
+ <td>
+ <% if repo and k == :repository %>
+ <%= link_to val, show_repository_tree_path(id: repo.uuid, commit: current_obj.script_version, path: '/') %>
+ <% elsif repo and k == :script %>
+ <%= link_to val, show_repository_blob_path(id: repo.uuid, commit: current_obj.script_version, path: 'crunch_scripts/'+current_obj.script) %>
+ <% elsif repo and k == :script_version %>
+ <%= link_to val, show_repository_commit_path(id: repo.uuid, commit: current_obj.script_version) %>
+ <% else %>
+ <%= val %>
+ <% end %>
+ </td>
+ </tr>
+ <% end %>
+ <% end %>
+ <% if current_obj.runtime_constraints.andand[:docker_image] and current_obj.docker_image %>
+ <tr>
+ <td style="padding-right: 1em">
+ docker_image:
+ </td>
+ <td>
+ <%= current_obj.runtime_constraints[:docker_image] %>
+ </td>
+ </tr>
+ <tr>
+ <td style="padding-right: 1em">
+ docker_image_locator:
+ </td>
+ <td>
+ <%= link_to_arvados_object_if_readable(current_obj.docker_image,
+ current_obj.docker_image, friendly_name: true) %>
+ </td>
+ </tr>
+ <% end %>
+ </table>
+ </div>
+ </div>
+
+ <% if current_obj.parameters and !current_obj.parameters.empty? %>
+ <div class="row">
+ <div class="col-md-6">
+ <p>script_parameters:</p>
+ <pre><%= JSON.pretty_generate(current_obj.parameters) rescue nil %></pre>
+ </div>
+ </div>
+ <% end %>
+ </div>
--- /dev/null
+<% if wu.is_running? %>
+ <% if @object.uuid == wu.uuid and wu.progress == 0.0 %>
+ <span class="label label-<%= wu.state_bootstrap_class %>"> Active </span>
+ <% else%>
+ <div class="progress" style="margin-bottom: 0px">
+ <span class="progress-bar progress-bar-<%= wu.state_bootstrap_class %>" style="width: <%= wu.progress*100 %>%;">
+ </span>
+ </div>
+ <% end %>
+<% else %>
+ <span class="label label-<%= wu.state_bootstrap_class %>"><%= wu.state_label %></span>
+<% end %>
--- /dev/null
+<div class="panel panel-default">
+ <div class="panel-heading">
+ <div class="container-fluid">
+ <div class="row-fluid">
+ <div class="col-md-2" style="word-break:break-all;">
+ <h4 class="panel-title">
+ <a data-toggle="collapse" href="#collapse<%= i %>">
+ <%= current_obj.label %> <span class="caret"></span>
+ </a>
+ </h4>
+ </div>
+
+ <div class="col-md-2 pipeline-instance-spacing">
+ <%= render partial: 'work_unit/progress', locals: {wu: current_obj} %>
+ </div>
+
+ <% if not current_obj %>
+ <div class="col-md-8"></div>
+ <% else %>
+ <div class="col-md-1">
+ <%= current_obj.link_to_log %>
+ </div>
+
+ <% walltime = current_obj.walltime %>
+ <% cputime = current_obj.cputime %>
+ <div class="col-md-3">
+ <% if walltime and cputime %>
+ <%= render_runtime(walltime, false) %>
+ <% if cputime > 0 %> / <%= render_runtime(cputime, false) %> (<%= (cputime/walltime).round(1) %>⨯)<% end %>
+ <% end %>
+ </div>
+
+ <% queuetime = current_obj.queuedtime %>
+ <% if queuetime %>
+ <div class="col-md-3">
+ Queued for <%= render_runtime(queuetime, false) %>.
+ </div>
+ <% elsif current_obj.is_running? %>
+ <div class="col-md-3">
+ <span class="task-summary-status">
+ <%= current_obj.child_summary_str %>
+ </span>
+ </div>
+ <% elsif current_obj.is_finished? %>
+ <div class="col-md-3 text-overflow-ellipsis">
+ <% if current_obj.output %>
+ <%= link_to_arvados_object_if_readable(current_obj.output, 'Output data not available', link_text: "Output of #{current_obj.label}") %>
+ <% else %>
+ No output.
+ <% end %>
+ </div>
+ <% end %>
+
+ <div class="col-md-1 pipeline-instance-spacing">
+ <% if current_obj.can_cancel? and @object.editable? %>
+ <%= form_tag "#{current_obj.uri}/cancel", remote: true, style: "display:inline; padding-left: 1em" do |f| %>
+ <%= hidden_field_tag :return_to, url_for(@object) %>
+ <%= button_tag "Cancel", {class: 'btn btn-xs btn-danger', id: "cancel-child-button"} %>
+ <% end %>
+ <% end %>
+ </div>
+ <% end %>
+ </div>
+ </div>
+ </div>
+
+ <div id="collapse<%= i %>" class="panel-collapse collapse <%= if expanded then 'in' end %>">
+ <div class="panel-body">
+ <%= render partial: 'work_unit/show_component', locals: {wu: current_obj} %>
+ </div>
+ </div>
+</div>
--- /dev/null
+<%# Work unit status %>
+
+<div class="container-fluid>
+ <div class="row-fluid">
+ <%# Need additional handling for main object display %>
+ <% if @object.uuid == wu.uuid %>
+ <div class="container-fluid">
+ <div class="pull-right">
+ <div class="container-fluid">
+ <div class="row-fulid pipeline-instance-spacing">
+ <div class="col-md-8">
+ <% if wu.is_running? and wu.child_summary_str %>
+ <%= wu.child_summary_str %>
+ <% end %>
+ </div>
+ <div class="col-md-3">
+ <%= render partial: 'work_unit/progress', locals: {wu: wu} %>
+ </div>
+ <div class="col-md-1">
+ <% if wu.can_cancel? and @object.editable? %>
+ <%= form_tag "#{wu.uri}/cancel", remote: true, style: "display:inline; padding-left: 1em" do |f| %>
+ <%= hidden_field_tag :return_to, url_for(@object) %>
+ <%= button_tag "Cancel", {class: 'btn btn-xs btn-danger', id: "cancel-obj-button"} %>
+ <% end %>
+ <% end %>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+ <% end %>
+
+ <div class="col-md-10" >
+ <% if wu.is_paused? %>
+ <p>
+ This <%= wu.title %> is paused. Children that are already running
+ will continue to run, but no new processes will be submitted.
+ </p>
+ <% end %>
+
+ <%= raw(wu.show_runtime) %>
+ </div>
+ </div>
+
+<p>
+ <%= render(partial: 'work_unit/component_detail', locals: {current_obj: wu}) %>
+</p>
+
+<%# Work unit children %>
+
+<%
+ uuids = wu.children.collect {|c| c.uuid}.compact
+ if uuids.any?
+ resource_class = resource_class_for_uuid(uuids.first, friendly_name: true)
+ preload_objects_for_dataclass resource_class, uuids
+ end
+
+ collections = wu.children.collect {|j| j.output}.compact
+ collections.concat wu.children.collect {|j| j.docker_image}.uniq.compact
+ collections_pdhs = collections.select {|x| !(m = CollectionsHelper.match(x)).nil?}.uniq.compact
+ collections_uuids = collections - collections_pdhs
+ preload_collections_for_objects collections_uuids if collections_uuids.any?
+ preload_for_pdhs collections_pdhs if collections_pdhs.any?
+%>
+
+<% if wu.has_unreadable_children %>
+ <%= render(partial: "pipeline_instances/show_components_json",
+ locals: {error_name: "Unreadable components", backtrace: nil, wu: wu}) %>
+<% else %>
+ <% @descendent_count = 0 if !@descendent_count %>
+ <% wu.children.each do |c| %>
+ <% @descendent_count += 1 %>
+ <%= render(partial: 'work_unit/show_child', locals: {current_obj: c, i: @descendent_count, expanded: false}) %>
+ <% end %>
+<% end %>
assert_response :success
assert_not_nil assigns(:object)
assert_not_nil assigns(:object).components[:foo][:job]
- assert assigns(:object).components[:foo][:job][:started_at].is_a? Time
- assert assigns(:object).components[:foo][:job][:finished_at].is_a? Time
+ start_at = assigns(:object).components[:foo][:job][:started_at]
+ start_at = Time.parse(start_at) if (start_at.andand.class == String)
+ assert start_at.is_a? Time
+    finished_at = assigns(:object).components[:foo][:job][:finished_at]
+ finished_at = Time.parse(finished_at) if (finished_at.andand.class == String)
+ assert finished_at.is_a? Time
end
# The next two tests ensure that a pipeline instance can be copied
if expect_options
assert_text 'supplied_script_version: master'
else
- assert_text 'supplied_script_version: (none)'
+ assert_no_text 'supplied_script_version'
end
assert_triggers_dom_event 'shown.bs.modal' do
end
end
end
+
+ [
+ ['active', true],
+ ['job_reader2', false],
+ ].each do |user, readable|
+ test "view job with components as #{user} user" do
+ job = api_fixture('jobs')['running_job_with_components']
+ component1 = api_fixture('jobs')['completed_job_in_publicly_accessible_project']
+ component2 = api_fixture('pipeline_instances')['running_pipeline_with_complete_job']
+ component2_child1 = api_fixture('jobs')['previous_job_run']
+ component2_child2 = api_fixture('jobs')['running']
+
+ visit page_with_token(user, "/jobs/#{job['uuid']}")
+ assert page.has_text? job['script_version']
+ assert page.has_no_text? 'script_parameters'
+
+      # job_reader2 is allowed to read the job, component2, and component2_child1,
+      # and can see component2_child2 only as a component of the pipeline component2.
+ if readable
+ assert page.has_link? 'component1'
+ assert page.has_link? 'component2'
+ else
+ assert page.has_no_link? 'component1'
+ assert page.has_link? 'component2'
+ end
+
+ if readable
+ click_link('component1')
+ within('#collapse1') do
+ assert(has_text? component1['uuid'])
+ assert(has_text? component1['script_version'])
+ assert(has_text? 'script_parameters')
+ end
+ click_link('component1')
+ end
+
+ click_link('component2')
+ within('.panel-collapse') do
+ assert(has_text? component2['uuid'])
+ assert(has_text? component2['script_version'])
+ assert(has_no_text? 'script_parameters')
+ assert(has_link? 'previous')
+ assert(has_link? 'running')
+
+ click_link('previous')
+ within('.panel-collapse') do
+ assert(has_text? component2_child1['uuid'])
+ assert(has_text? component2_child1['script_version'])
+ end
+ click_link('previous')
+
+ click_link('running')
+ within('.panel-collapse') do
+ assert(has_text? component2_child2['uuid'])
+ if readable
+ assert(has_text? component2_child2['script_version'])
+ else
+ assert(has_no_text? component2_child2['script_version'])
+ end
+ end
+ end
+ end
+ end
end
class ActionController::TestCase
setup do
- @counter = 0
+ @test_counter = 0
end
def check_counter action
- @counter += 1
- if @counter == 2
+ @test_counter += 1
+ if @test_counter == 2
assert_equal 1, 2, "Multiple actions in controller test"
end
end
--- /dev/null
+require 'test_helper'
+
+class WorkUnitTest < ActiveSupport::TestCase
+ setup do
+ Rails.configuration.anonymous_user_token = api_fixture('api_client_authorizations')['anonymous']['api_token']
+ end
+
+ [
+ [Job, 'running_job_with_components', "jwu", 2, "Running", nil, 0.5],
+ [PipelineInstance, 'pipeline_in_running_state', nil, 1, "Running", nil, 0.0],
+ [PipelineInstance, 'has_component_with_completed_jobs', nil, 3, "Complete", true, 1.0],
+ [PipelineInstance, 'pipeline_with_tagged_collection_input', "pwu", 1, "Ready", nil, 0.0],
+ ].each do |type, fixture, label, num_children, state, success, progress|
+ test "children of #{fixture}" do
+ use_token 'active'
+ obj = find_fixture(type, fixture)
+ wu = obj.work_unit(label)
+
+ if label != nil
+ assert_equal(label, wu.label)
+ else
+ assert_equal(obj.name, wu.label)
+ end
+ assert_equal(obj['uuid'], wu.uuid)
+ assert_equal(state, wu.state_label)
+ assert_equal(success, wu.success?)
+ assert_equal(progress, wu.progress)
+
+ assert_equal(num_children, wu.children.size)
+ wu.children.each do |child|
+ assert_equal(true, child.respond_to?(:script))
+ end
+ end
+ end
+
+ [
+ [Job, 'running_job_with_components', 1, 1, nil],
+ [Job, 'queued', nil, nil, 1],
+ [PipelineInstance, 'pipeline_in_running_state', 1, 1, nil],
+ [PipelineInstance, 'has_component_with_completed_jobs', 60, 60, nil],
+ ].each do |type, fixture, walltime, cputime, queuedtime|
+ test "times for #{fixture}" do
+ use_token 'active'
+ obj = find_fixture(type, fixture)
+ wu = obj.work_unit
+
+ if walltime
+ assert_equal true, (wu.walltime >= walltime)
+ else
+ assert_equal walltime, wu.walltime
+ end
+
+ if cputime
+ assert_equal true, (wu.cputime >= cputime)
+ else
+ assert_equal cputime, wu.cputime
+ end
+
+ if queuedtime
+ assert_equal true, (wu.queuedtime >= queuedtime)
+ else
+ assert_equal queuedtime, wu.queuedtime
+ end
+ end
+ end
+
+ [
+ [Job, 'active', 'running_job_with_components', true],
+ [Job, 'active', 'queued', false],
+ [Job, nil, 'completed_job_in_publicly_accessible_project', true],
+ [Job, 'active', 'completed_job_in_publicly_accessible_project', true],
+ [PipelineInstance, 'active', 'pipeline_in_running_state', true], # no log, but while running the log link points to pi Log tab
+ [PipelineInstance, nil, 'pipeline_in_publicly_accessible_project_but_other_objects_elsewhere', false],
+    [PipelineInstance, 'active', 'pipeline_in_publicly_accessible_project_but_other_objects_elsewhere', false], # no log for completed pi
+ [Job, nil, 'job_in_publicly_accessible_project_but_other_objects_elsewhere', false, "Log unavailable"],
+ ].each do |type, token, fixture, has_log, log_link|
+ test "link_to_log for #{fixture} for #{token}" do
+ use_token token if token
+ obj = find_fixture(type, fixture)
+ wu = obj.work_unit
+
+ link = "#{wu.uri}#Log" if has_log
+ link_to_log = wu.link_to_log
+
+ if has_log
+ assert_includes link_to_log, link
+ else
+ assert_equal log_link, link_to_log
+ end
+ end
+ end
+end
--- /dev/null
+#!/bin/bash
+
+case $TARGET in
+ centos7)
+ # fpm incorrectly transforms the dependency name in this case.
+ fpm_depends+=(python-backports-ssl_match_hostname)
+ fpm_args+=(--python-disable-dependency backports.ssl-match-hostname)
+ ;;
+esac
esac
# FIXME: Remove this line after #6885 is done.
-fpm_args+=(--iteration 2)
+fpm_args+=(--iteration 3)
+++ /dev/null
-fpm_args+=(-v 2.0)
esac
# FIXME: Remove this line after #6885 is done.
-fpm_args+=(--iteration 2)
+fpm_args+=(--iteration 3)
esac
# FIXME: Remove this line after #6885 is done.
-fpm_args+=(--iteration 2)
+fpm_args+=(--iteration 3)
# FIXME: Remove this line after #6885 is done.
-fpm_args+=(--iteration 2)
+fpm_args+=(--iteration 3)
case "$TARGET" in
centos6)
-all: centos6/generated debian7/generated debian8/generated ubuntu1204/generated ubuntu1404/generated
+all: centos6/generated centos7/generated debian7/generated debian8/generated ubuntu1204/generated ubuntu1404/generated
centos6/generated: common-generated-all
test -d centos6/generated || mkdir centos6/generated
cp -rlt centos6/generated common-generated/*
+centos7/generated: common-generated-all
+ test -d centos7/generated || mkdir centos7/generated
+ cp -rlt centos7/generated common-generated/*
+
debian7/generated: common-generated-all
test -d debian7/generated || mkdir debian7/generated
cp -rlt debian7/generated common-generated/*
test -d ubuntu1404/generated || mkdir ubuntu1404/generated
cp -rlt ubuntu1404/generated common-generated/*
-common-generated-all: common-generated/golang-amd64.tar.gz
+GOTARBALL=go1.6.2.linux-amd64.tar.gz
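+# Go toolchain tarball, fetched once into common-generated/ and linked into
+# each image's generated/ directory.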
+
+common-generated-all: common-generated/$(GOTARBALL)
-common-generated/golang-amd64.tar.gz: common-generated
- wget -cqO common-generated/golang-amd64.tar.gz http://storage.googleapis.com/golang/go1.4.2.linux-amd64.tar.gz
+common-generated/$(GOTARBALL): common-generated
+ wget -cqO common-generated/$(GOTARBALL) http://storage.googleapis.com/golang/$(GOTARBALL)
common-generated:
mkdir common-generated
MAINTAINER Brett Smith <brett@curoverse.com>
# Install build dependencies provided in base distribution
-RUN yum -q -y install make automake gcc gcc-c++ libyaml-devel patch readline-devel zlib-devel libffi-devel openssl-devel bzip2 libtool bison sqlite-devel rpm-build git perl-ExtUtils-MakeMaker libattr-devel nss-devel libcurl-devel which tar scl-utils centos-release-SCL postgresql-devel
+RUN yum -q -y install make automake gcc gcc-c++ libyaml-devel patch readline-devel zlib-devel libffi-devel openssl-devel bzip2 libtool bison sqlite-devel rpm-build git perl-ExtUtils-MakeMaker libattr-devel nss-devel libcurl-devel which tar unzip scl-utils centos-release-scl postgresql-devel
# Install golang binary
-ADD generated/golang-amd64.tar.gz /usr/local/
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
RUN ln -s /usr/local/go/bin/go /usr/local/bin/
# Install RVM
curl -L https://get.rvm.io | bash -s stable && \
/usr/local/rvm/bin/rvm install 2.1 && \
/usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
- /usr/local/rvm/bin/rvm-exec default gem install bundler fpm
+ /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+ /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
# Need to "touch" RPM database to workaround bug in interaction between
# overlayfs and yum (https://bugzilla.redhat.com/show_bug.cgi?id=1213602)
RUN touch /var/lib/rpm/* && yum -q -y install python27 python33
RUN scl enable python33 "easy_install-3.3 pip" && scl enable python27 "easy_install-2.7 pip"
+# fpm requires ffi which now wants xz-libs-5 which isn't packaged for centos6
+# but the library from xz-libs-4.999 appears to be good enough.
+RUN ln -s /usr/lib64/liblzma.so.0 /usr/lib64/liblzma.so.5
+
RUN cd /tmp && \
curl -OL 'http://pkgs.repoforge.org/rpmforge-release/rpmforge-release-0.5.3-1.el6.rf.x86_64.rpm' && \
rpm -ivh rpmforge-release-0.5.3-1.el6.rf.x86_64.rpm && \
--- /dev/null
+FROM centos:7
+MAINTAINER Brett Smith <brett@curoverse.com>
+
+# Install build dependencies provided in base distribution
+RUN yum -q -y install make automake gcc gcc-c++ libyaml-devel patch readline-devel zlib-devel libffi-devel openssl-devel bzip2 libtool bison sqlite-devel rpm-build git perl-ExtUtils-MakeMaker libattr-devel nss-devel libcurl-devel which tar unzip scl-utils centos-release-scl postgresql-devel python-devel python-setuptools fuse-devel xz-libs git
+
+# Install golang binary
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
+RUN ln -s /usr/local/go/bin/go /usr/local/bin/
+
+# Install RVM
+RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
+ curl -L https://get.rvm.io | bash -s stable && \
+ /usr/local/rvm/bin/rvm install 2.1 && \
+ /usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
+ /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+ /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
+
+# Need to "touch" RPM database to workaround bug in interaction between
+# overlayfs and yum (https://bugzilla.redhat.com/show_bug.cgi?id=1213602)
+RUN touch /var/lib/rpm/* && yum -q -y install python33
+RUN scl enable python33 "easy_install-3.3 pip" && easy_install-2.7 pip
+
+ENV WORKSPACE /arvados
+CMD ["scl", "enable", "python33", "/usr/local/rvm/bin/rvm-exec default bash /jenkins/run-build-packages.sh --target centos7"]
MAINTAINER Ward Vandewege <ward@curoverse.com>
# Install dependencies and set up system.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libpq-dev python-pip
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libpq-dev python-pip unzip
# Install RVM
RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
curl -L https://get.rvm.io | bash -s stable && \
/usr/local/rvm/bin/rvm install 2.1 && \
/usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
- /usr/local/rvm/bin/rvm-exec default gem install bundler fpm
+ /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+ /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
# Install golang binary
-ADD generated/golang-amd64.tar.gz /usr/local/
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
RUN ln -s /usr/local/go/bin/go /usr/local/bin/
ENV WORKSPACE /arvados
MAINTAINER Ward Vandewege <ward@curoverse.com>
# Install dependencies and set up system.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev python-pip
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev python-pip unzip
# Install RVM
RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
curl -L https://get.rvm.io | bash -s stable && \
/usr/local/rvm/bin/rvm install 2.1 && \
/usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
- /usr/local/rvm/bin/rvm-exec default gem install bundler fpm
+ /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+ /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
# Install golang binary
-ADD generated/golang-amd64.tar.gz /usr/local/
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
RUN ln -s /usr/local/go/bin/go /usr/local/bin/
ENV WORKSPACE /arvados
MAINTAINER Ward Vandewege <ward@curoverse.com>
# Install dependencies and set up system.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip build-essential
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip build-essential unzip
# Install RVM
RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
curl -L https://get.rvm.io | bash -s stable && \
/usr/local/rvm/bin/rvm install 2.1 && \
/usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
- /usr/local/rvm/bin/rvm-exec default gem install bundler fpm
+ /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+ /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
# Install golang binary
-ADD generated/golang-amd64.tar.gz /usr/local/
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
RUN ln -s /usr/local/go/bin/go /usr/local/bin/
ENV WORKSPACE /arvados
MAINTAINER Brett Smith <brett@curoverse.com>
# Install dependencies and set up system.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools libcurl4-gnutls-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip unzip
# Install RVM
RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
curl -L https://get.rvm.io | bash -s stable && \
/usr/local/rvm/bin/rvm install 2.1 && \
/usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
- /usr/local/rvm/bin/rvm-exec default gem install bundler fpm
+ /usr/local/rvm/bin/rvm-exec default gem install bundler && \
+ /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
# Install golang binary
-ADD generated/golang-amd64.tar.gz /usr/local/
+ADD generated/go1.6.2.linux-amd64.tar.gz /usr/local/
RUN ln -s /usr/local/go/bin/go /usr/local/bin/
ENV WORKSPACE /arvados
FROM centos:6
MAINTAINER Peter Amstutz <peter.amstutz@curoverse.com>
-RUN yum -q install --assumeyes scl-utils centos-release-SCL \
+RUN yum -q install --assumeyes scl-utils centos-release-scl \
which tar
# Install RVM
curl -L https://get.rvm.io | bash -s stable && \
/usr/local/rvm/bin/rvm install 2.1 && \
/usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
- /usr/local/rvm/bin/rvm-exec default gem install bundle fpm
+ /usr/local/rvm/bin/rvm-exec default gem install bundle && \
+ /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
RUN cd /tmp && \
curl -OL 'http://pkgs.repoforge.org/rpmforge-release/rpmforge-release-0.5.3-1.el6.rf.x86_64.rpm' && \
rpm -ivh rpmforge-release-0.5.3-1.el6.rf.x86_64.rpm && \
sed -i 's/enabled = 0/enabled = 1/' /etc/yum.repos.d/rpmforge.repo
-COPY localrepo.repo /etc/yum.repos.d/localrepo.repo
\ No newline at end of file
+COPY localrepo.repo /etc/yum.repos.d/localrepo.repo
--- /dev/null
+FROM centos:7
+MAINTAINER Brett Smith <brett@curoverse.com>
+
+RUN yum -q -y install scl-utils centos-release-scl which tar
+
+# Install RVM
+RUN touch /var/lib/rpm/* && \
+ gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3 && \
+ curl -L https://get.rvm.io | bash -s stable && \
+ /usr/local/rvm/bin/rvm install 2.1 && \
+ /usr/local/rvm/bin/rvm alias create default ruby-2.1 && \
+ /usr/local/rvm/bin/rvm-exec default gem install bundle && \
+ /usr/local/rvm/bin/rvm-exec default gem install cure-fpm --version 1.6.0b
+
+COPY localrepo.repo /etc/yum.repos.d/localrepo.repo
--- /dev/null
+[localrepo]
+name=Arvados Test
+baseurl=file:///arvados/packages/centos7
+gpgcheck=0
+enabled=1
--- /dev/null
+#!/bin/bash
+
+set -eu
+
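+# Derive the target distribution from the name this script was invoked as
+# (the per-distro test scripts are expected to be symlinks to this file).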
+target=$(basename "$0" | grep -Eo '\bcentos[[:digit:]]+\b')
+
+yum -q clean all
+touch /var/lib/rpm/*
+
+export ARV_PACKAGES_DIR="/arvados/packages/$target"
+
+rpm -qa | sort > "$ARV_PACKAGES_DIR/$1.before"
+
+yum install --assumeyes $1
+
+rpm -qa | sort > "$ARV_PACKAGES_DIR/$1.after"
+
+diff "$ARV_PACKAGES_DIR/$1".{before,after} >"$ARV_PACKAGES_DIR/$1.diff" || true
+
+# Enable any Software Collections that the package depended on.
+if [[ -d /opt/rh ]]; then
+ # We have to stage the list to a file, because `ls | while read` would
+ # make a subshell, causing the `source` lines to have no effect.
+ scl_list=$(mktemp)
+ ls /opt/rh >"$scl_list"
+
+ # SCL scripts aren't designed to run with -eu.
+ set +eu
+ while read scl; do
+ source scl_source enable "$scl"
+ done <"$scl_list"
+ set -eu
+ rm "$scl_list"
+fi
+
+mkdir -p /tmp/opts
+cd /tmp/opts
+
+rpm2cpio $(ls -t "$ARV_PACKAGES_DIR/$1"-*.rpm | head -n1) | cpio -idm 2>/dev/null
+
+find -name '*.so' | while read so; do
+ echo -e "\n== Packages dependencies for $so =="
+ ldd "$so" \
+ | awk '($3 ~ /^\//){print $3}' | sort -u | xargs rpm -qf | sort -u
+done
+
+exec /jenkins/package-testing/common-test-packages.sh "$1"
apt-get install -y nginx
dpkg-reconfigure arvados-api-server
;;
- centos6)
+ centos*)
yum install --assumeyes httpd
yum reinstall --assumeyes arvados-api-server
;;
debian*|ubuntu*)
FORMAT=deb
;;
- centos6)
+ centos*)
FORMAT=rpm
;;
*)
if [[ ! -e "/etc/arvados/sso/database.yml" ]]; then
# We need to set up our database configuration now.
if [[ "$FORMAT" == "rpm" ]]; then
- # postgres packaging on CentOS6 is kind of primitive, needs an initdb
service postgresql initdb
- if [ "$TARGET" = "centos6" ]; then
- sed -i -e "s/127.0.0.1\/32 ident/127.0.0.1\/32 md5/" /var/lib/pgsql/data/pg_hba.conf
- sed -i -e "s/::1\/128 ident/::1\/128 md5/" /var/lib/pgsql/data/pg_hba.conf
- fi
+ sed -i -e "s/127.0.0.1\/32 ident/127.0.0.1\/32 md5/" /var/lib/pgsql/data/pg_hba.conf
+ sed -i -e "s/::1\/128 ident/::1\/128 md5/" /var/lib/pgsql/data/pg_hba.conf
fi
service postgresql start
apt-get install -y nginx
dpkg-reconfigure arvados-workbench
;;
- centos6)
+ centos*)
yum install --assumeyes httpd
yum reinstall --assumeyes arvados-workbench
;;
+++ /dev/null
-#!/bin/bash
-
-set -eu
-
-yum -q clean all
-touch /var/lib/rpm/*
-
-export ARV_PACKAGES_DIR=/arvados/packages/centos6
-
-rpm -qa | sort > "$ARV_PACKAGES_DIR/$1.before"
-
-yum install --assumeyes $1
-
-rpm -qa | sort > "$ARV_PACKAGES_DIR/$1.after"
-
-set +e
-diff "$ARV_PACKAGES_DIR/$1.before" "$ARV_PACKAGES_DIR/$1.after" >"$ARV_PACKAGES_DIR/$1.diff"
-set -e
-
-SCL=""
-if scl enable python27 true 2>/dev/null ; then
- SCL="scl enable python27"
-fi
-
-mkdir -p /tmp/opts
-cd /tmp/opts
-
-rpm2cpio $(ls -t "$ARV_PACKAGES_DIR/$1"-*.rpm | head -n1) | cpio -idm 2>/dev/null
-
-shared=$(find -name '*.so')
-if test -n "$shared" ; then
- for so in $shared ; do
- echo
- echo "== Packages dependencies for $so =="
- $SCL ldd "$so" \
- | awk '($3 ~ /^\//){print $3}' | sort -u | xargs rpm -qf | sort -u
- done
-fi
-
-if test -n "$SCL" ; then
- exec $SCL "/jenkins/package-testing/common-test-packages.sh '$1'"
-else
- exec /jenkins/package-testing/common-test-packages.sh "$1"
-fi
--- /dev/null
+rpm-common-test-packages.sh
\ No newline at end of file
--- /dev/null
+rpm-common-test-packages.sh
\ No newline at end of file
esac
done
-
EXITCODE=0
-COLUMNS=80
-
-title () {
- printf "\n%*s\n\n" $(((${#title}+$COLUMNS)/2)) "********** $1 **********"
+exit_cleanly() {
+ trap - INT
+ report_outcomes
+ exit $EXITCODE
}
+COLUMNS=80
+. $WORKSPACE/build/run-library.sh
+
docker_push () {
if [[ ! -z "$tags" ]]
then
for tag in $( echo $tags|tr "," " " )
do
- $DOCKER tag -f $1 $1:$tag
+ $DOCKER tag $1 $1:$tag
done
fi
done
if [[ "$ECODE" != "0" ]]; then
- title "!!!!!! docker push $* failed !!!!!!"
EXITCODE=$(($EXITCODE + $ECODE))
fi
-}
-
-timer_reset() {
- t0=$SECONDS
-}
-
-timer() {
- echo -n "$(($SECONDS - $t0))s"
+ checkexit $ECODE "docker push $*"
}
# Sanity check
# Get test config.yml file
cp $HOME/docker/config.yml .
-./build.sh jobs-image
+if [[ ! -z "$tags" ]]; then
+ COMMIT=${tags/,*/} ./build.sh jobs-image
+else
+ ./build.sh jobs-image
+fi
ECODE=$?
if [[ "$ECODE" != "0" ]]; then
- title "!!!!!! docker BUILD FAILED !!!!!!"
EXITCODE=$(($EXITCODE + $ECODE))
fi
+checkexit $ECODE "docker build"
title "docker build complete (`timer`)"
title "uploading images"
docker login -u arvados
docker_push arvados/jobs
- title "upload arvados images complete (`timer`)"
+ title "upload arvados images finished (`timer`)"
else
- title "upload arvados images SKIPPED because no --upload option set"
+ title "upload arvados images SKIPPED because no --upload option set (`timer`)"
fi
fi
-exit $EXITCODE
+exit_cleanly
set -e
if [[ -n "$test_packages" ]]; then
- if [[ -n "$(find $WORKSPACE/packages/$TARGET -name *.rpm)" ]] ; then
+ if [[ -n "$(find $WORKSPACE/packages/$TARGET -name '*.rpm')" ]] ; then
createrepo $WORKSPACE/packages/$TARGET
fi
- if [[ -n "$(find $WORKSPACE/packages/$TARGET -name *.deb)" ]] ; then
+ if [[ -n "$(find $WORKSPACE/packages/$TARGET -name '*.deb')" ]] ; then
(cd $WORKSPACE/packages/$TARGET
dpkg-scanpackages . 2> >(grep -v 'warning' 1>&2) | gzip -c > Packages.gz
)
arvados-src
arvados-workbench
crunchstat
+ keep-balance
+ keep-block-check
keepproxy
keep-rsync
keepstore
case "$TARGET" in
centos6)
packages="$packages python27-python-arvados-fuse
- python27-python-arvados-python-client"
+ python27-python-arvados-python-client python27-python-arvados-cwl-runner"
;;
*)
packages="$packages python-arvados-fuse
- python-arvados-python-client"
+ python-arvados-python-client python-arvados-cwl-runner"
;;
esac
fi
--- /dev/null
+#!/bin/bash
+
+COLUMNS=80
+
+. `dirname "$(readlink -f "$0")"`/run-library.sh
+#. `dirname "$(readlink -f "$0")"`/libcloud-pin
+
+read -rd "\000" helpmessage <<EOF
+$(basename $0): Build Arvados Python packages and Ruby gems
+
+Syntax:
+ WORKSPACE=/path/to/arvados $(basename $0) [options]
+
+Options:
+
+--debug
+ Output debug information (default: false)
+--upload
+ If the build and test steps are successful, upload the python
+ packages to pypi and the gems to rubygems (default: false)
+
+WORKSPACE=path Path to the Arvados source tree to build packages from
+
+EOF
+
+exit_cleanly() {
+ trap - INT
+ report_outcomes
+ exit ${#failures[@]}
+}
+
+gem_wrapper() {
+ local gem_name="$1"; shift
+ local gem_directory="$1"; shift
+
+ title "Start $gem_name gem build"
+ timer_reset
+
+ cd "$gem_directory"
+ handle_ruby_gem $gem_name
+
+ checkexit $? "$gem_name gem build"
+ title "End of $gem_name gem build (`timer`)"
+}
+
+python_wrapper() {
+ local package_name="$1"; shift
+ local package_directory="$1"; shift
+
+ title "Start $package_name python package build"
+ timer_reset
+
+ cd "$package_directory"
+ if [[ $DEBUG > 0 ]]; then
+ echo `pwd`
+ fi
+ handle_python_package
+
+ checkexit $? "$package_name python package build"
+ title "End of $package_name python package build (`timer`)"
+}
+
+TARGET=
+UPLOAD=0
+DEBUG=${ARVADOS_DEBUG:-0}
+
+PARSEDOPTS=$(getopt --name "$0" --longoptions \
+ help,debug,upload,target: \
+ -- "" "$@")
+if [ $? -ne 0 ]; then
+ exit 1
+fi
+
+eval set -- "$PARSEDOPTS"
+while [ $# -gt 0 ]; do
+ case "$1" in
+ --help)
+ echo >&2 "$helpmessage"
+ echo >&2
+ exit 1
+ ;;
+ --target)
+ TARGET="$2"; shift
+ ;;
+ --upload)
+ UPLOAD=1
+ ;;
+ --debug)
+ DEBUG=1
+ ;;
+ --)
+ if [ $# -gt 1 ]; then
+ echo >&2 "$0: unrecognized argument '$2'. Try: $0 --help"
+ exit 1
+ fi
+ ;;
+ esac
+ shift
+done
+
+if ! [[ -n "$WORKSPACE" ]]; then
+ echo >&2 "$helpmessage"
+ echo >&2
+ echo >&2 "Error: WORKSPACE environment variable not set"
+ echo >&2
+ exit 1
+fi
+
+STDOUT_IF_DEBUG=/dev/null
+STDERR_IF_DEBUG=/dev/null
+DASHQ_UNLESS_DEBUG=-q
+if [[ "$DEBUG" != 0 ]]; then
+ STDOUT_IF_DEBUG=/dev/stdout
+ STDERR_IF_DEBUG=/dev/stderr
+ DASHQ_UNLESS_DEBUG=
+fi
+
+EASY_INSTALL2=$(find_easy_install -$PYTHON2_VERSION "")
+EASY_INSTALL3=$(find_easy_install -$PYTHON3_VERSION 3)
+
+RUN_BUILD_PACKAGES_PATH="`dirname \"$0\"`"
+RUN_BUILD_PACKAGES_PATH="`( cd \"$RUN_BUILD_PACKAGES_PATH\" && pwd )`" # absolutized and normalized
+if [ -z "$RUN_BUILD_PACKAGES_PATH" ] ; then
+ # error; for some reason, the path is not accessible
+ # to the script (e.g. permissions re-evaled after suid)
+ exit 1 # fail
+fi
+
+debug_echo "$0 is running from $RUN_BUILD_PACKAGES_PATH"
+debug_echo "Workspace is $WORKSPACE"
+
+if [[ -f /etc/profile.d/rvm.sh ]]; then
+ source /etc/profile.d/rvm.sh
+ GEM="rvm-exec default gem"
+else
+ GEM=gem
+fi
+
+# Make all files world-readable -- jenkins runs with umask 027, and has checked
+# out our git tree here
+chmod o+r "$WORKSPACE" -R
+
+# More cleanup - make sure all executables that we'll package are 755
+cd "$WORKSPACE"
+find -type d -name 'bin' |xargs -I {} find {} -type f |xargs -I {} chmod 755 {}
+
+# Now fix our umask to something better suited to building and publishing
+# gems and packages
+umask 0022
+
+debug_echo "umask is" `umask`
+
+gem_wrapper arvados "$WORKSPACE/sdk/ruby"
+gem_wrapper arvados-cli "$WORKSPACE/sdk/cli"
+gem_wrapper arvados-login-sync "$WORKSPACE/services/login-sync"
+
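+# Record how many failures came from the gem builds, so the python package
+# failure count below can be computed as the difference.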
+GEM_BUILD_FAILURES=0
+if [ ${#failures[@]} -ne 0 ]; then
+ GEM_BUILD_FAILURES=${#failures[@]}
+fi
+
+python_wrapper arvados-pam "$WORKSPACE/sdk/pam"
+python_wrapper arvados-python-client "$WORKSPACE/sdk/python"
+python_wrapper arvados-cwl-runner "$WORKSPACE/sdk/cwl"
+python_wrapper arvados_fuse "$WORKSPACE/services/fuse"
+python_wrapper arvados-node-manager "$WORKSPACE/services/nodemanager"
+
+PYTHON_BUILD_FAILURES=0
+if [ $((${#failures[@]} - $GEM_BUILD_FAILURES)) -ne 0 ]; then
+  PYTHON_BUILD_FAILURES=$((${#failures[@]} - $GEM_BUILD_FAILURES))
+fi
+
+if [[ "$UPLOAD" != 0 ]]; then
+
+ if [[ $DEBUG > 0 ]]; then
+ EXTRA_UPLOAD_FLAGS=" --verbose"
+ else
+ EXTRA_UPLOAD_FLAGS=""
+ fi
+
+ if [[ ! -e "$WORKSPACE/packages" ]]; then
+ mkdir -p "$WORKSPACE/packages"
+ fi
+
+ title "Start upload python packages"
+ timer_reset
+
+ if [ "$PYTHON_BUILD_FAILURES" -eq 0 ]; then
+ /usr/local/arvados-dev/jenkins/run_upload_packages.py $EXTRA_UPLOAD_FLAGS --workspace $WORKSPACE python
+ else
+ echo "Skipping python packages upload, there were errors building the packages"
+ fi
+ checkexit $? "upload python packages"
+ title "End of upload python packages (`timer`)"
+
+ title "Start upload ruby gems"
+ timer_reset
+
+ if [ "$GEM_BUILD_FAILURES" -eq 0 ]; then
+ /usr/local/arvados-dev/jenkins/run_upload_packages.py $EXTRA_UPLOAD_FLAGS --workspace $WORKSPACE gems
+ else
+ echo "Skipping ruby gem upload, there were errors building the packages"
+ fi
+ checkexit $? "upload ruby gems"
+ title "End of upload ruby gems (`timer`)"
+
+fi
+
+exit_cleanly
ubuntu1404)
FORMAT=deb
;;
- centos6)
+ centos6|centos7)
FORMAT=rpm
;;
*)
PYTHON2_VERSION=2.7
PYTHON3_VERSION=$(python3 -c 'import sys; print("{v.major}.{v.minor}".format(v=sys.version_info))')
+## These defaults are suitable for any Debian-based distribution.
+# You can customize them as needed in distro sections below.
+PYTHON2_PACKAGE=python$PYTHON2_VERSION
+PYTHON2_PKG_PREFIX=python
+PYTHON2_PREFIX=/usr
+PYTHON2_INSTALL_LIB=lib/python$PYTHON2_VERSION/dist-packages
+
+PYTHON3_PACKAGE=python$PYTHON3_VERSION
+PYTHON3_PKG_PREFIX=python3
+PYTHON3_PREFIX=/usr
+PYTHON3_INSTALL_LIB=lib/python$PYTHON3_VERSION/dist-packages
+## End Debian Python defaults.
+
case "$TARGET" in
debian7)
FORMAT=deb
- PYTHON2_PACKAGE=python$PYTHON2_VERSION
- PYTHON2_PKG_PREFIX=python
- PYTHON3_PACKAGE=python$PYTHON3_VERSION
- PYTHON3_PKG_PREFIX=python3
- PYTHON_BACKPORTS=(python-gflags pyvcf google-api-python-client==1.4.2 \
+ PYTHON_BACKPORTS=(python-gflags==2.0 google-api-python-client==1.4.2 \
oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
ciso8601 pycrypto backports.ssl_match_hostname llfuse==0.41.1 \
- 'pycurl<7.21.5' contextlib2)
- PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
+ 'pycurl<7.21.5' contextlib2 pyyaml 'rdflib>=4.2.0' \
+ shellescape mistune typing avro ruamel.ordereddict)
+ PYTHON3_BACKPORTS=(docker-py==1.7.2 six requests websocket-client)
;;
debian8)
FORMAT=deb
- PYTHON2_PACKAGE=python$PYTHON2_VERSION
- PYTHON2_PKG_PREFIX=python
- PYTHON3_PACKAGE=python$PYTHON3_VERSION
- PYTHON3_PKG_PREFIX=python3
- PYTHON_BACKPORTS=(python-gflags pyvcf google-api-python-client==1.4.2 \
+ PYTHON_BACKPORTS=(python-gflags==2.0 google-api-python-client==1.4.2 \
oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
ciso8601 pycrypto backports.ssl_match_hostname llfuse==0.41.1 \
- 'pycurl<7.21.5')
- PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
+ 'pycurl<7.21.5' pyyaml 'rdflib>=4.2.0' \
+ shellescape mistune typing avro ruamel.ordereddict)
+ PYTHON3_BACKPORTS=(docker-py==1.7.2 six requests websocket-client)
;;
ubuntu1204)
FORMAT=deb
- PYTHON2_PACKAGE=python$PYTHON2_VERSION
- PYTHON2_PKG_PREFIX=python
- PYTHON3_PACKAGE=python$PYTHON3_VERSION
- PYTHON3_PKG_PREFIX=python3
- PYTHON_BACKPORTS=(python-gflags pyvcf google-api-python-client==1.4.2 \
+ PYTHON_BACKPORTS=(python-gflags==2.0 google-api-python-client==1.4.2 \
oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
ciso8601 pycrypto backports.ssl_match_hostname llfuse==0.41.1 \
- contextlib2 \
- 'pycurl<7.21.5')
- PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
+ contextlib2 'pycurl<7.21.5' pyyaml 'rdflib>=4.2.0' \
+ shellescape mistune typing avro isodate ruamel.ordereddict)
+ PYTHON3_BACKPORTS=(docker-py==1.7.2 six requests websocket-client)
;;
ubuntu1404)
FORMAT=deb
- PYTHON2_PACKAGE=python$PYTHON2_VERSION
- PYTHON2_PKG_PREFIX=python
- PYTHON3_PACKAGE=python$PYTHON3_VERSION
- PYTHON3_PKG_PREFIX=python3
- PYTHON_BACKPORTS=(pyasn1==0.1.7 pyvcf pyasn1-modules==0.0.5 llfuse==0.41.1 ciso8601 \
+ PYTHON_BACKPORTS=(pyasn1==0.1.7 pyasn1-modules==0.0.5 llfuse==0.41.1 ciso8601 \
google-api-python-client==1.4.2 six uritemplate oauth2client==1.5.2 httplib2 \
- rsa 'pycurl<7.21.5' backports.ssl_match_hostname)
- PYTHON3_BACKPORTS=(docker-py requests websocket-client)
+ rsa 'pycurl<7.21.5' backports.ssl_match_hostname pyyaml 'rdflib>=4.2.0' \
+ shellescape mistune typing avro ruamel.ordereddict)
+ PYTHON3_BACKPORTS=(docker-py==1.7.2 requests websocket-client)
;;
centos6)
FORMAT=rpm
PYTHON2_PACKAGE=$(rpm -qf "$(which python$PYTHON2_VERSION)" --queryformat '%{NAME}\n')
PYTHON2_PKG_PREFIX=$PYTHON2_PACKAGE
+ PYTHON2_PREFIX=/opt/rh/python27/root/usr
+ PYTHON2_INSTALL_LIB=lib/python$PYTHON2_VERSION/site-packages
PYTHON3_PACKAGE=$(rpm -qf "$(which python$PYTHON3_VERSION)" --queryformat '%{NAME}\n')
PYTHON3_PKG_PREFIX=$PYTHON3_PACKAGE
- PYTHON_BACKPORTS=(python-gflags pyvcf google-api-python-client==1.4.2 \
+ PYTHON3_PREFIX=/opt/rh/python33/root/usr
+ PYTHON3_INSTALL_LIB=lib/python$PYTHON3_VERSION/site-packages
+ PYTHON_BACKPORTS=(python-gflags==2.0 google-api-python-client==1.4.2 \
oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
ciso8601 pycrypto backports.ssl_match_hostname 'pycurl<7.21.5' \
- python-daemon lockfile llfuse==0.41.1 'pbr<1.0')
- PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
+ python-daemon lockfile llfuse==0.41.1 'pbr<1.0' pyyaml \
+ 'rdflib>=4.2.0' shellescape mistune typing avro requests \
+ isodate pyparsing sparqlwrapper html5lib keepalive \
+ ruamel.ordereddict)
+ PYTHON3_BACKPORTS=(docker-py==1.7.2 six requests websocket-client)
+ export PYCURL_SSL_LIBRARY=nss
+ ;;
+ centos7)
+ FORMAT=rpm
+ PYTHON2_PACKAGE=$(rpm -qf "$(which python$PYTHON2_VERSION)" --queryformat '%{NAME}\n')
+ PYTHON2_PKG_PREFIX=$PYTHON2_PACKAGE
+ PYTHON2_INSTALL_LIB=lib/python$PYTHON2_VERSION/site-packages
+ PYTHON3_PACKAGE=$(rpm -qf "$(which python$PYTHON3_VERSION)" --queryformat '%{NAME}\n')
+ PYTHON3_PKG_PREFIX=$PYTHON3_PACKAGE
+ PYTHON3_PREFIX=/opt/rh/python33/root/usr
+ PYTHON3_INSTALL_LIB=lib/python$PYTHON3_VERSION/site-packages
+ PYTHON_BACKPORTS=(python-gflags==2.0 google-api-python-client==1.4.2 \
+ oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
+ rsa uritemplate httplib2 ws4py pykka pyexecjs jsonschema \
+ ciso8601 pycrypto 'pycurl<7.21.5' \
+ python-daemon llfuse==0.41.1 'pbr<1.0' pyyaml \
+ 'rdflib>=4.2.0' shellescape mistune typing avro \
+ isodate pyparsing sparqlwrapper html5lib keepalive \
+ ruamel.ordereddict)
+ PYTHON3_BACKPORTS=(docker-py==1.7.2 six requests websocket-client)
export PYCURL_SSL_LIBRARY=nss
;;
*)
chmod o+r "$WORKSPACE" -R
# More cleanup - make sure all executables that we'll package are 755
+cd "$WORKSPACE"
find -type d -name 'bin' |xargs -I {} find {} -type f |xargs -I {} chmod 755 {}
# Now fix our umask to something better suited to building and publishing
rpm2cpio ${LIBFUSE_DIR}/fuse-2.9.2-6.el7.src.rpm | cpio -i
perl -pi -e 's/Conflicts:\s*filesystem.*//g' fuse.spec
)
- # build rpms from source
+ # build rpms from source
rpmbuild -bb /root/rpmbuild/SOURCES/fuse.spec
rm -f fuse-2.9.2-6.el7.src.rpm
# move built RPMs to LIBFUSE_DIR
"Keep storage daemon, accessible to clients on the LAN"
package_go_binary services/keepproxy keepproxy \
"Make a Keep cluster accessible to clients that are not on the LAN"
+package_go_binary services/keep-balance keep-balance \
+ "Rebalance and garbage-collect data blocks stored in Arvados Keep"
package_go_binary services/keep-web keep-web \
"Static web hosting service for user data stored in Arvados Keep"
package_go_binary services/datamanager arvados-data-manager \
"Gather cpu/memory/network statistics of running Crunch jobs"
package_go_binary tools/keep-rsync keep-rsync \
"Copy all data from one set of Keep servers to another"
+package_go_binary tools/keep-block-check keep-block-check \
+  "Verify that all data from one set of Keep servers was copied to another"
+package_go_binary sdk/go/crunchrunner crunchrunner \
+ "Crunchrunner executes a command inside a container and uploads the output"
# The Python SDK
# Please resist the temptation to add --no-python-fix-name to the fpm call here
# cwl-runner
cd $WORKSPACE/packages/$TARGET
rm -rf "$WORKSPACE/sdk/cwl/build"
-fpm_build $WORKSPACE/sdk/cwl "${PYTHON2_PKG_PREFIX}-arvados-cwl-runner" 'Curoverse, Inc.' 'python' "$(awk '($1 == "Version:"){print $2}' $WORKSPACE/sdk/cwl/arvados_cwl_runner.egg-info/PKG-INFO)" "--url=https://arvados.org" "--description=The Arvados CWL runner"
+fpm_build $WORKSPACE/sdk/cwl "${PYTHON2_PKG_PREFIX}-arvados-cwl-runner" 'Curoverse, Inc.' 'python' "$(awk '($1 == "Version:"){print $2}' $WORKSPACE/sdk/cwl/arvados_cwl_runner.egg-info/PKG-INFO)" "--url=https://arvados.org" "--description=The Arvados CWL runner" --iteration 3
+
+# schema_salad. This is a python dependency of arvados-cwl-runner,
+# but we can't use the usual PYTHONPACKAGES way to build this package due to the
+# intricacies of how version numbers get generated in setup.py: we need version
+# 1.7.20160316203940. If we don't explicitly list that version with the -v
+# argument to fpm, and instead specify it as schema_salad==1.7.20160316203940, we get
+# a package with version 1.7. That's because our gittagger hack is not being
+# picked up by self.distribution.get_version(), which is called from
+# https://github.com/jordansissel/fpm/blob/master/lib/fpm/package/pyfpm/get_metadata.py
+# by means of this command:
+#
+# python2.7 setup.py --command-packages=pyfpm get_metadata --output=metadata.json
+#
+# So we build this thing separately.
+#
+# Ward, 2016-03-17
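+#
+# For illustration only (a hypothetical invocation, not run by this script):
+# building with a requirement spec such as
+#
+#   fpm_build schema_salad==1.11.20160506154702 "" "" python ""
+#
+# would therefore produce a package versioned "1.11", whereas the explicit
+# version argument used below produces the full "1.11.20160506154702".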
+fpm_build schema_salad "" "" python 1.11.20160506154702
+
+# And schema_salad now depends on ruamel-yaml, which apparently has a braindead setup.py that requires special arguments to build (otherwise, it aborts with 'error: you have to install with "pip install ."'). Sigh.
+# Ward, 2016-05-26
+fpm_build ruamel.yaml "" "" python "" --python-setup-py-arguments "--single-version-externally-managed"
+
+# And for cwltool we have the same problem as for schema_salad. Ward, 2016-03-17
+fpm_build cwltool "" "" python 1.0.20160519182434
+
+# FPM eats the trailing .0 in the python-rdflib-jsonld package when built with 'rdflib-jsonld>=0.3.0'. Force the version. Ward, 2016-03-25
+fpm_build rdflib-jsonld "" "" python 0.3.0
# The PAM module
if [[ $TARGET =~ debian|ubuntu ]]; then
rm -rf "$WORKSPACE/services/dockercleaner/build"
fpm_build $WORKSPACE/services/dockercleaner arvados-docker-cleaner 'Curoverse, Inc.' 'python3' "$(awk '($1 == "Version:"){print $2}' $WORKSPACE/services/dockercleaner/arvados_docker_cleaner.egg-info/PKG-INFO)" "--url=https://arvados.org" "--description=The Arvados Docker image cleaner"
+# The Arvados crunchstat-summary tool
+cd $WORKSPACE/packages/$TARGET
+rm -rf "$WORKSPACE/tools/crunchstat-summary/build"
+fpm_build $WORKSPACE/tools/crunchstat-summary ${PYTHON2_PKG_PREFIX}-crunchstat-summary 'Curoverse, Inc.' 'python' "$(awk '($1 == "Version:"){print $2}' $WORKSPACE/tools/crunchstat-summary/crunchstat_summary.egg-info/PKG-INFO)" "--url=https://arvados.org" "--description=Crunchstat-summary reads Arvados Crunch log files and summarizes resource usage"
+
# Forked libcloud
LIBCLOUD_DIR=$(mktemp -d)
(
set -e
cd "$pyfpm_workdir"
pip install "${PIP_DOWNLOAD_SWITCHES[@]}" --download . "$deppkg"
- tar -xf "$deppkg"-*.tar*
+ # Sometimes pip gives us a tarball, sometimes a zip file...
+ DOWNLOADED=`ls $deppkg-*`
+ [[ "$DOWNLOADED" =~ ".tar" ]] && tar -xf $DOWNLOADED
+ [[ "$DOWNLOADED" =~ ".zip" ]] && unzip $DOWNLOADED
cd "$deppkg"-*/
"python$PYTHON2_VERSION" setup.py $DASHQ_UNLESS_DEBUG egg_info build
chmod -R go+rX .
set +e
- # --iteration 2 provides an upgrade for previously built
- # buggy packages.
- fpm_build . "$outname" "" python "" --iteration 2
+ fpm_build . "$outname" "" python "" --iteration 3
# The upload step uses the package timestamp to determine
# whether it's new. --no-clobber plays nice with that.
mv --no-clobber "$outname"*.$FORMAT "$WORKSPACE/packages/$TARGET"
--- /dev/null
+#!/bin/bash
+
+read -rd "\000" helpmessage <<EOF
+$(basename $0): Build, test and (optionally) upload packages for one target
+
+Syntax:
+ WORKSPACE=/path/to/arvados $(basename $0) [options]
+
+--target <target>
+ Distribution to build packages for (default: debian7)
+--upload
+ If the build and test steps are successful, upload the packages
+ to a remote apt repository (default: false)
+
+WORKSPACE=path Path to the Arvados source tree to build packages from
+
+EOF
+
+if ! [[ -n "$WORKSPACE" ]]; then
+ echo >&2 "$helpmessage"
+ echo >&2
+ echo >&2 "Error: WORKSPACE environment variable not set"
+ echo >&2
+ exit 1
+fi
+
+if ! [[ -d "$WORKSPACE" ]]; then
+ echo >&2 "$helpmessage"
+ echo >&2
+ echo >&2 "Error: $WORKSPACE is not a directory"
+ echo >&2
+ exit 1
+fi
+
+PARSEDOPTS=$(getopt --name "$0" --longoptions \
+ help,upload,target: \
+ -- "" "$@")
+if [ $? -ne 0 ]; then
+ exit 1
+fi
+
+TARGET=debian7
+UPLOAD=0
+
+eval set -- "$PARSEDOPTS"
+while [ $# -gt 0 ]; do
+ case "$1" in
+ --help)
+ echo >&2 "$helpmessage"
+ echo >&2
+ exit 1
+ ;;
+ --target)
+ TARGET="$2"; shift
+ ;;
+ --upload)
+ UPLOAD=1
+ ;;
+ --)
+ if [ $# -gt 1 ]; then
+ echo >&2 "$0: unrecognized argument '$2'. Try: $0 --help"
+ exit 1
+ fi
+ ;;
+ esac
+ shift
+done
+
+exit_cleanly() {
+ trap - INT
+ report_outcomes
+ exit ${#failures}
+}
+
+COLUMNS=80
+. $WORKSPACE/build/run-library.sh
+
+title "Start build packages"
+timer_reset
+
+$WORKSPACE/build/run-build-packages-one-target.sh --target $TARGET
+
+checkexit $? "build packages"
+title "End of build packages (`timer`)"
+
+title "Start test packages"
+timer_reset
+
+if [ ${#failures[@]} -eq 0 ]; then
+ $WORKSPACE/build/run-build-packages-one-target.sh --target $TARGET --test-packages
+else
+  echo "Skipping package testing, there were errors building the packages"
+fi
+
+checkexit $? "test packages"
+title "End of test packages (`timer`)"
+
+if [[ "$UPLOAD" != 0 ]]; then
+ title "Start upload packages"
+ timer_reset
+
+ if [ ${#failures[@]} -eq 0 ]; then
+ /usr/local/arvados-dev/jenkins/run_upload_packages.py -H jenkinsapt@apt.arvados.org -o Port=2222 --workspace $WORKSPACE $TARGET
+ else
+ echo "Skipping package upload, there were errors building and/or testing the packages"
+ fi
+ checkexit $? "upload packages"
+ title "End of upload packages (`timer`)"
+fi
+
+exit_cleanly
# pip).
PACKAGE=$1
shift
- # The name of the package to build. Defaults to $PACKAGE.
- PACKAGE_NAME=${1:-$PACKAGE}
+ # The name of the package to build.
+ PACKAGE_NAME=$1
shift
# Optional: the vendor of the package. Should be "Curoverse, Inc." for
# packages of our own software. Passed to fpm --vendor.
VERSION=$1
shift
+ local default_iteration_value="$(default_iteration "$PACKAGE" "$VERSION")"
+
case "$PACKAGE_TYPE" in
python)
# All Arvados Python2 packages depend on Python 2.7.
set -- "$@" --python-bin python2.7 \
--python-easyinstall "$EASY_INSTALL2" \
--python-package-name-prefix "$PYTHON2_PKG_PREFIX" \
+ --prefix "$PYTHON2_PREFIX" \
+ --python-install-lib "$PYTHON2_INSTALL_LIB" \
+ --exclude "${PYTHON2_INSTALL_LIB#/}/tests" \
--depends "$PYTHON2_PACKAGE"
+ # Fix --iteration for #9242.
+ default_iteration_value=$(($default_iteration_value + 1))
;;
python3)
# fpm does not actually support a python3 package type. Instead
set -- "$@" --python-bin python3 \
--python-easyinstall "$EASY_INSTALL3" \
--python-package-name-prefix "$PYTHON3_PKG_PREFIX" \
+ --prefix "$PYTHON3_PREFIX" \
+ --python-install-lib "$PYTHON3_INSTALL_LIB" \
+ --exclude "${PYTHON3_INSTALL_LIB#/}/tests" \
--depends "$PYTHON3_PACKAGE"
+ # Fix --iteration for #9242.
+ default_iteration_value=$(($default_iteration_value + 1))
;;
esac
declare -a COMMAND_ARR=("fpm" "--maintainer=Ward Vandewege <ward@curoverse.com>" "-s" "$PACKAGE_TYPE" "-t" "$FORMAT")
- if [ python = "$PACKAGE_TYPE" ]; then
- COMMAND_ARR+=(--exclude=\*/{dist,site}-packages/tests/\*)
- if [ deb = "$FORMAT" ]; then
- # Dependencies are built from setup.py. Since setup.py will never
- # refer to Debian package iterations, it doesn't make sense to
- # enforce those in the .deb dependencies.
- COMMAND_ARR+=(--deb-ignore-iteration-in-dependencies)
- fi
+ if [ python = "$PACKAGE_TYPE" ] && [ deb = "$FORMAT" ]; then
+ # Dependencies are built from setup.py. Since setup.py will never
+ # refer to Debian package iterations, it doesn't make sense to
+ # enforce those in the .deb dependencies.
+ COMMAND_ARR+=(--deb-ignore-iteration-in-dependencies)
fi
if [[ "${DEBUG:-0}" != "0" ]]; then
COMMAND_ARR+=('--verbose' '--log' 'info')
fi
- if [[ "$PACKAGE_NAME" != "$PACKAGE" ]]; then
+ if [[ -n "$PACKAGE_NAME" ]]; then
COMMAND_ARR+=('-n' "$PACKAGE_NAME")
fi
fi
# We can always add an --iteration here. If another one is specified in $@,
# that will take precedence, as desired.
- COMMAND_ARR+=(--iteration "$(default_iteration "$PACKAGE" "$VERSION")")
+ COMMAND_ARR+=(--iteration "$default_iteration_value")
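+    # For example (illustrative only): a caller invoking
+    #   fpm_build . "$outname" "" python "" --iteration 3
+    # ends up passing both this default --iteration and the trailing
+    # "--iteration 3"; fpm honors the later value.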
# Append --depends X and other arguments specified by fpm-info.sh in
# the package source dir. These are added last so they can override
"${PACKAGE%%=/*}"
# backports ("llfuse==0.41.1" => "backports/python-llfuse")
"${WORKSPACE}/backports/${PACKAGE_TYPE}-${PACKAGE%%[<=>]*}")
+ if [[ -n "$PACKAGE_NAME" ]]; then
+ fpm_dirs+=("${WORKSPACE}/backports/${PACKAGE_NAME}")
+ fi
for pkgdir in "${fpm_dirs[@]}"; do
fpminfo="$pkgdir/fpm-info.sh"
if [[ -e "$fpminfo" ]]; then
$SUDO yum -q -y install $PACKAGES
fi
}
+
+title () {
+ txt="********** $1 **********"
+ printf "\n%*s%s\n\n" $((($COLUMNS-${#txt})/2)) "" "$txt"
+}
+
+checkexit() {
+ if [[ "$1" != "0" ]]; then
+ title "!!!!!! $2 FAILED !!!!!!"
+ failures+=("$2 (`timer`)")
+ else
+ successes+=("$2 (`timer`)")
+ fi
+}
+
+timer_reset() {
+ t0=$SECONDS
+}
+
+timer() {
+ echo -n "$(($SECONDS - $t0))s"
+}
+
+report_outcomes() {
+ for x in "${successes[@]}"
+ do
+ echo "Pass: $x"
+ done
+
+ if [[ ${#failures[@]} == 0 ]]
+ then
+ echo "All test suites passed."
+ else
+ echo "Failures (${#failures[@]}):"
+ for x in "${failures[@]}"
+ do
+ echo "Fail: $x"
+ done
+ fi
+}
. `dirname "$(readlink -f "$0")"`/libcloud-pin
+COLUMNS=80
+. `dirname "$(readlink -f "$0")"`/run-library.sh
+
read -rd "\000" helpmessage <<EOF
$(basename $0): Install and test Arvados components.
You should provide GOPATH, GEMHOME, and VENVDIR options
from a previous invocation if you use this option.
--only-install Run specific install step
+--short Skip (or scale down) some slow tests.
WORKSPACE=path Arvados source tree to test.
CONFIGSRC=path Dir with api server config files to copy into source tree.
(If none given, leave config files alone in source tree.)
services/keep-web
services/keepproxy
services/keepstore
+services/keep-balance
services/login-sync
services/nodemanager
services/crunch-run
sdk/pam
sdk/python
sdk/ruby
+sdk/go/arvados
sdk/go/arvadosclient
sdk/go/keepclient
+sdk/go/httpserver
sdk/go/manifest
sdk/go/blockdigest
sdk/go/streamer
sdk/cwl
tools/crunchstat-summary
tools/keep-rsync
+tools/keep-block-check
EOF
GEMHOME=
PERLINSTALLBASE=
-COLUMNS=80
-
+short=
skip_install=
temp=
temp_preserve=
exit 1
}
-report_outcomes() {
- for x in "${successes[@]}"
- do
- echo "Pass: $x"
- done
-
- if [[ ${#failures[@]} == 0 ]]
- then
- echo "All test suites passed."
- else
- echo "Failures (${#failures[@]}):"
- for x in "${failures[@]}"
- do
- echo "Fail: $x"
- done
- fi
-}
-
exit_cleanly() {
trap - INT
create-plot-data-from-log.sh $BUILD_NUMBER "$WORKSPACE/apps/workbench/log/test.log" "$WORKSPACE/apps/workbench/log/"
echo -n 'go: '
go version \
|| fatal "No go binary. See http://golang.org/doc/install"
+ [[ $(go version) =~ go1.([0-9]+) ]] && [[ ${BASH_REMATCH[1]} -ge 6 ]] \
+ || fatal "Go >= 1.6 required. See http://golang.org/doc/install"
echo -n 'gcc: '
gcc --version | egrep ^gcc \
|| fatal "No gcc. Try: apt-get install build-essential"
--only)
only="$1"; skip[$1]=""; shift
;;
+ --short)
+ short=1
+ ;;
--skip-install)
skip_install=1
;;
if ! [[ -e "$venvdest/bin/activate" ]] || ! [[ -e "$venvdest/bin/pip" ]]; then
virtualenv --setuptools "$@" "$venvdest" || fatal "virtualenv $venvdest failed"
fi
- "$venvdest/bin/pip" install 'setuptools>=18' 'pip>=7'
+ if [[ $("$venvdest/bin/python" --version 2>&1) =~ \ 3\.[012]\. ]]; then
+ # pip 8.0.0 dropped support for python 3.2, e.g., debian wheezy
+ "$venvdest/bin/pip" install 'setuptools>=18' 'pip>=7,<8'
+ else
+ "$venvdest/bin/pip" install 'setuptools>=18' 'pip>=7'
+ fi
# ubuntu1404 can't seem to install mock via tests_require, but it can do this.
"$venvdest/bin/pip" install 'mock>=1.0' 'pbr<1.7.0'
}
gem install --user-install bundler || fatal 'Could not install bundler'
fi
-checkexit() {
- if [[ "$1" != "0" ]]; then
- title "!!!!!! $2 FAILED !!!!!!"
- failures+=("$2 (`timer`)")
- else
- successes+=("$2 (`timer`)")
- fi
-}
-
-timer_reset() {
- t0=$SECONDS
-}
-
-timer() {
- echo -n "$(($SECONDS - $t0))s"
-}
-
retry() {
while ! ${@} && [[ "$retry" == 1 ]]
do
# before trying "go test". Otherwise, coverage-reporting
# mode makes Go show the wrong line numbers when reporting
# compilation errors.
+ go get -t "git.curoverse.com/arvados.git/$1" || return 1
if [[ -n "${testargs[$1]}" ]]
then
# "go test -check.vv giturl" doesn't work, but this
# does:
- cd "$WORKSPACE/$1" && \
- go get -t "git.curoverse.com/arvados.git/$1" && \
- go test ${coverflags[@]} ${testargs[$1]}
+ cd "$WORKSPACE/$1" && go test ${short:+-short} ${testargs[$1]}
else
# The above form gets verbose even when testargs is
# empty, so use this form in such cases:
- go get -t "git.curoverse.com/arvados.git/$1" && \
- go test ${coverflags[@]} "git.curoverse.com/arvados.git/$1"
+ go test ${short:+-short} ${coverflags[@]} "git.curoverse.com/arvados.git/$1"
fi
result="$?"
- go tool cover -html="$WORKSPACE/tmp/.$covername.tmp" -o "$WORKSPACE/tmp/$covername.html"
- rm "$WORKSPACE/tmp/.$covername.tmp"
+ if [[ -f "$WORKSPACE/tmp/.$covername.tmp" ]]
+ then
+ go tool cover -html="$WORKSPACE/tmp/.$covername.tmp" -o "$WORKSPACE/tmp/$covername.html"
+ rm "$WORKSPACE/tmp/.$covername.tmp"
+ fi
elif [[ "$2" == "pip" ]]
then
# $3 can name a path directory for us to use, including trailing
# slash; e.g., the bin/ subdirectory of a virtualenv.
cd "$WORKSPACE/$1" \
- && "${3}python" setup.py test ${testargs[$1]}
+ && "${3}python" setup.py ${short:+--short-tests-only} test ${testargs[$1]}
elif [[ "$2" != "" ]]
then
"test_$2"
fi
}
-title () {
- txt="********** $1 **********"
- printf "\n%*s%s\n\n" $((($COLUMNS-${#txt})/2)) "" "$txt"
-}
-
bundle_install_trylocal() {
(
set -e
declare -a gostuff
gostuff=(
+ sdk/go/arvados
sdk/go/arvadosclient
sdk/go/blockdigest
+ sdk/go/httpserver
sdk/go/manifest
sdk/go/streamer
sdk/go/crunchrunner
services/keep-web
services/keepstore
sdk/go/keepclient
+ services/keep-balance
services/keepproxy
services/datamanager/summary
services/datamanager/collection
services/crunch-dispatch-slurm
services/crunch-run
tools/keep-rsync
+ tools/keep-block-check
)
for g in "${gostuff[@]}"
do
test_apiserver() {
rm -f "$WORKSPACE/services/api/git-commit.version"
cd "$WORKSPACE/services/api" \
- && RAILS_ENV=test bundle exec rake test TESTOPTS=-v ${testargs[services/api]}
+ && env RAILS_ENV=test ${short:+RAILS_TEST_SHORT=1} bundle exec rake test TESTOPTS=-v ${testargs[services/api]}
}
do_test services/api apiserver
test_workbench() {
start_nginx_proxy_services \
&& cd "$WORKSPACE/apps/workbench" \
- && RAILS_ENV=test bundle exec rake test TESTOPTS=-v ${testargs[apps/workbench]}
+ && env RAILS_ENV=test ${short:+RAILS_TEST_SHORT=1} bundle exec rake test TESTOPTS=-v ${testargs[apps/workbench]}
}
do_test apps/workbench workbench
test_workbench_benchmark() {
start_nginx_proxy_services \
&& cd "$WORKSPACE/apps/workbench" \
- && RAILS_ENV=test bundle exec rake test:benchmark ${testargs[apps/workbench_benchmark]}
+ && env RAILS_ENV=test ${short:+RAILS_TEST_SHORT=1} bundle exec rake test:benchmark ${testargs[apps/workbench_benchmark]}
}
do_test apps/workbench_benchmark workbench_benchmark
test_workbench_profile() {
start_nginx_proxy_services \
&& cd "$WORKSPACE/apps/workbench" \
- && RAILS_ENV=test bundle exec rake test:profile ${testargs[apps/workbench_profile]}
+ && env RAILS_ENV=test ${short:+RAILS_TEST_SHORT=1} bundle exec rake test:profile ${testargs[apps/workbench_profile]}
}
do_test apps/workbench_profile workbench_profile
#!/bin/sh
-exec $TASK_KEEPMOUNT/$JOB_PARAMETER_CRUNCHRUNNER
+
+if test -n "$JOB_PARAMETER_CRUNCHRUNNER" ; then
+ exec $TASK_KEEPMOUNT/$JOB_PARAMETER_CRUNCHRUNNER
+else
+ exec /usr/local/bin/crunchrunner
+fi
--- /dev/null
+#!/usr/bin/env python
+
+# Crunch script integration for running arvados-cwl-runner (importing
+# arvados_cwl module) inside a crunch job.
+#
+# This gets the job record, transforms the script parameters into a valid CWL
+# input object, then executes the CWL runner to run the underlying workflow or
+# tool. When the workflow completes, the output object is recorded in an output
+# collection for this runner job.
+
+import arvados
+import arvados_cwl
+import arvados.collection
+import arvados.util
+from cwltool.process import shortname
+import cwltool.main
+import logging
+import os
+import json
+import argparse
+from arvados.api import OrderedJsonModel
+from cwltool.process import adjustFiles
+from cwltool.load_tool import load_tool
+
+# Print package versions
+logging.info(cwltool.main.versionstring())
+
+api = arvados.api("v1")
+
+try:
+ job_order_object = arvados.current_job()['script_parameters']
+
+ def keeppath(v):
+ if arvados.util.keep_locator_pattern.match(v):
+ return "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], v)
+ else:
+ return v
+
+ job_order_object["cwl:tool"] = keeppath(job_order_object["cwl:tool"])
+
+ for k,v in job_order_object.items():
+ if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
+ job_order_object[k] = {
+ "class": "File",
+ "path": keeppath(v)
+ }
+
+ adjustFiles(job_order_object, keeppath)
+
+ runner = arvados_cwl.ArvCwlRunner(api_client=arvados.api('v1', model=OrderedJsonModel()))
+
+ t = load_tool(job_order_object, runner.arvMakeTool)
+
+ args = argparse.Namespace()
+ args.project_uuid = arvados.current_job()["owner_uuid"]
+ args.enable_reuse = True
+ args.submit = False
+ args.debug = True
+ args.quiet = False
+ args.ignore_docker_for_reuse = False
+ args.basedir = os.getcwd()
+ args.cwl_runner_job={"uuid": arvados.current_job()["uuid"], "state": arvados.current_job()["state"]}
+ outputObj = runner.arvExecutor(t, job_order_object, **vars(args))
+
+ files = {}
+ def capture(path):
+ sp = path.split("/")
+ col = sp[0][5:]
+ if col not in files:
+ files[col] = set()
+ files[col].add("/".join(sp[1:]))
+ return path
+
+ adjustFiles(outputObj, capture)
+
+ final = arvados.collection.Collection()
+
+ for k,v in files.iteritems():
+ with arvados.collection.Collection(k) as c:
+ for f in c:
+ final.copy(f, f, c, True)
+
+ def makeRelative(path):
+ return "/".join(path.split("/")[1:])
+
+ adjustFiles(outputObj, makeRelative)
+
+ with final.open("cwl.output.json", "w") as f:
+ json.dump(outputObj, f, indent=4)
+
+ api.job_tasks().update(uuid=arvados.current_task()['uuid'],
+ body={
+ 'output': final.save_new(create_collection_record=False),
+ 'success': True,
+ 'progress':1.0
+ }).execute()
+except Exception as e:
+ logging.exception("Unhandled exception")
+ api.job_tasks().update(uuid=arvados.current_task()['uuid'],
+ body={
+ 'output': None,
+ 'success': False,
+ 'progress':1.0
+ }).execute()
To "enable Software Collections on CentOS":https://wiki.centos.org/AdditionalResources/Repositories/SCL, run:
<notextile>
-<pre><code>~$ <span class="userinput">sudo yum install centos-release-SCL scl-utils</span>
+<pre><code>~$ <span class="userinput">sudo yum install centos-release-scl scl-utils</span>
</code></pre>
</notextile>
|arvados_sdk_version|string|Git commit hash that specifies the SDK version to use from the Arvados repository|This is set by searching the Arvados repository for a match for the arvados_sdk_version runtime constraint.|
|docker_image_locator|string|Portable data hash of the collection that contains the Docker image to use|This is set by searching readable collections for a match for the docker_image runtime constraint.|
|runtime_constraints|hash|Constraints that must be satisfied by the job/task scheduler in order to run the job.|See below.|
+|components|hash|Name and uuid pairs representing the child work units of this job. The uuids can be of different object types.|Example components hash: @{"name1": "zzzzz-8i9sb-xyz...", "name2": "zzzzz-d1hrv-xyz..."}@|
h3(#script_version). Specifying Git versions
$ arvbox
Arvados-in-a-box http://arvados.org
-arvbox (build|start|run|open|shell|ip|stop|rebuild|reset|destroy|log|svrestart)
-
-build <config> build arvbox Docker image
+build <config> build arvbox Docker image
+rebuild <config> build arvbox Docker image, no layer cache
start|run <config> start arvbox container
open open arvbox workbench in a web browser
shell enter arvbox shell
status print some information about current arvbox
stop stop arvbox container
restart <config> stop, then run again
-rebuild <config> stop, build arvbox Docker image, run
+reboot <config> stop, build arvbox Docker image, run
reset delete arvbox arvados data (be careful!)
destroy delete all arvbox code and data (be careful!)
log <service> tail log of specified service
In "dev" and "localdemo" mode, Arvbox can only be accessed on the same host it is running. To publish Arvbox service ports to the host's service ports and advertise the host's IP address for services, use @publicdev@ or @publicdemo@:
<pre>
-$ arvbox rebuild publicdemo
+$ arvbox start publicdemo
</pre>
This attempts to auto-detect the correct IP address to use by taking the IP address of the default route device. If the auto-detection is wrong, if you want to publish a hostname instead of a raw address, or if you need to access it through a different device (such as a router or firewall), set @ARVBOX_PUBLISH_IP@ to the desired hostname or IP address.
<pre>
$ export ARVBOX_PUBLISH_IP=example.com
-$ arvbox rebuild publicdemo
+$ arvbox start publicdemo
</pre>
Note: this expects to bind the host's port 80 (http) for workbench, so you cannot have a conflicting web server already running on the host. It does not attempt to bind the host's port 22 (ssh); as a result, the arvbox ssh port is not published.
On Debian-based systems:
<notextile>
-<pre><code>~$ <span class="userinput">sudo apt-get install perl python-virtualenv fuse python-arvados-python-client python-arvados-fuse crunchstat arvados-docker-cleaner iptables ca-certificates</span>
+<pre><code>~$ <span class="userinput">sudo apt-get install perl python-virtualenv fuse python-arvados-python-client python-arvados-fuse crunchrunner crunchstat arvados-docker-cleaner iptables ca-certificates</span>
</code></pre>
</notextile>
On Red Hat-based systems:
<notextile>
-<pre><code>~$ <span class="userinput">sudo yum install perl python27-python-virtualenv fuse python27-python-arvados-python-client python27-python-arvados-fuse crunchstat arvados-docker-cleaner iptables ca-certificates</span>
+<pre><code>~$ <span class="userinput">sudo yum install perl python27-python-virtualenv fuse python27-python-arvados-python-client python27-python-arvados-fuse crunchrunner crunchstat arvados-docker-cleaner iptables ca-certificates</span>
</code></pre>
</notextile>
-azure-storage-account-name="": Azure storage account name used for subsequent --azure-storage-container-volume arguments.
-azure-storage-container-volume=[]: Use the given container as a storage volume. Can be given multiple times.
-azure-storage-replication=3: Replication level to report to clients when data is stored in an Azure container.
- -blob-signature-ttl=1209600: Lifetime of blob permission signatures. See services/api/config/application.default.yml.
+ -blob-signature-ttl=1209600: Lifetime of blob permission signatures. Modifying the ttl will invalidate all existing signatures. See services/api/config/application.default.yml.
-blob-signing-key-file="": File containing the secret key for generating and verifying blob permission signatures.
-data-manager-token-file="": File with the API token used by the Data Manager. All DELETE requests or GET /index requests must carry this token.
-enforce-permissions=false: Enforce permission signatures on requests.
-listen=":25107": Listening address, in the form "host:port". e.g., 10.0.1.24:8000. Omit the host part to listen on all interfaces.
-max-buffers=128: Maximum RAM to use for data buffers, given in multiples of block size (64 MiB). When this limit is reached, HTTP requests requiring buffers (like GET and PUT) will wait for buffer space to be released.
+ -max-requests int
+ Maximum concurrent requests. When this limit is reached, new requests will receive 503 responses. Note: this limit does not include idle connections from clients using HTTP keepalive, so it does not strictly limit the number of concurrent connections. (default 2 * max-buffers)
-never-delete=false: If set, nothing will be deleted. HTTP 405 will be returned for valid DELETE requests.
-permission-key-file="": Synonym for -blob-signing-key-file.
-permission-ttl=0: Synonym for -blob-signature-ttl.
On Debian-based systems:
<notextile>
-<pre><code>~$ <span class="userinput">sudo apt-get install python-arvados-python-client python-arvados-fuse</span>
+<pre><code>~$ <span class="userinput">sudo apt-get install python-arvados-python-client python-arvados-fuse crunchrunner</span>
</code></pre>
</notextile>
On Red Hat-based systems:
<notextile>
-<pre><code>~$ <span class="userinput">sudo yum install python27-python-arvados-python-client python27-python-arvados-fuse</span>
+<pre><code>~$ <span class="userinput">sudo yum install python27-python-arvados-python-client python27-python-arvados-fuse crunchrunner</span>
</code></pre>
</notextile>
date >keep-proxy-image
jobs-image: debian-arvados-image $(BUILD) $(JOBS_DEPS)
- $(DOCKER_BUILD) -t arvados/jobs jobs
+ $(DOCKER_BUILD) --build-arg COMMIT=$(COMMIT) -t arvados/jobs jobs
date >jobs-image
java-bwa-samtools-image: jobs-image $(BUILD) $(JAVA_BWA_SAMTOOLS_DEPS)
RUN apt-get update -q
## 20150915 nico -- fuse.postint has sporatic failures, spliting this up to see if it helps
RUN apt-get install -qy fuse
-RUN apt-get install -qy supervisor python-pip python-pyvcf python-gflags python-google-api-python-client python-virtualenv libattr1-dev libfuse-dev python-dev python-llfuse crunchstat python-arvados-fuse cron dnsmasq
+RUN apt-get install -qy supervisor python-pip python-gflags python-google-api-python-client python-virtualenv libattr1-dev libfuse-dev python-dev python-llfuse crunchstat python-arvados-fuse cron dnsmasq
ADD fuse.conf /etc/fuse.conf
RUN chmod 644 /etc/fuse.conf
-# Based on Debian Wheezy
-FROM arvados/debian:wheezy
+# Based on Debian Jessie
+FROM debian:jessie
MAINTAINER Ward Vandewege <ward@curoverse.com>
ENV DEBIAN_FRONTEND noninteractive
ADD apt.arvados.org.list /etc/apt/sources.list.d/
RUN apt-key adv --keyserver pool.sks-keyservers.net --recv 1078ECD7
-RUN apt-get update -q
+RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3
-RUN apt-get install -qy git python-pip python-virtualenv python-arvados-python-client python-dev libcurl4-gnutls-dev
+ARG COMMIT=latest
+RUN echo $COMMIT && apt-get update -q
-RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3
+RUN apt-get install -qy git python-pip python-virtualenv python-arvados-python-client python-dev libcurl4-gnutls-dev nodejs python-arvados-cwl-runner
# Install dependencies and set up system.
-# The FUSE packages help ensure that we can install the Python SDK (arv-mount).
RUN /usr/sbin/adduser --disabled-password \
--gecos 'Crunch execution user' crunch && \
/usr/bin/install --directory --owner=crunch --group=crunch --mode=0700 /keep /tmp/crunch-src /tmp/crunch-job
# apt.arvados.org
-deb http://apt.arvados.org/ wheezy main
+deb http://apt.arvados.org/ jessie main
RUN apt-get update -q
RUN apt-get install -qy \
- python-pip python-pyvcf python-gflags python-google-api-python-client \
+ python-pip python-gflags python-google-api-python-client \
python-virtualenv libattr1-dev libfuse-dev python-dev python-llfuse fuse \
crunchstat python-arvados-fuse cron vim supervisor openssh-server
s.executables << "arv-tag"
s.required_ruby_version = '>= 2.1.0'
s.add_runtime_dependency 'arvados', '~> 0.1', '>= 0.1.20150128223554'
- s.add_runtime_dependency 'google-api-client', '~> 0.6.3', '>= 0.6.3'
+ # Our google-api-client dependency used to be < 0.9, but that could be
+ # satisfied by the buggy 0.9.pre*. https://dev.arvados.org/issues/9213
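+  # For illustration (Ruby's built-in version ordering):
+  #   Gem::Version.new('0.9.pre1') < Gem::Version.new('0.9')  #=> true
+  # so a bare '< 0.9' bound does not exclude the 0.9 prereleases.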
+ s.add_runtime_dependency 'google-api-client', '~> 0.6', '>= 0.6.3', '<0.8.9'
s.add_runtime_dependency 'activesupport', '~> 3.2', '>= 3.2.13'
s.add_runtime_dependency 'json', '~> 1.7', '>= 1.7.7'
s.add_runtime_dependency 'trollop', '~> 2.0'
s.add_runtime_dependency 'andand', '~> 1.3', '>= 1.3.3'
s.add_runtime_dependency 'oj', '~> 2.0', '>= 2.0.3'
s.add_runtime_dependency 'curb', '~> 0.8'
- s.add_runtime_dependency('jwt', '>= 0.1.5', '< 1.0.0')
s.homepage =
'https://arvados.org'
end
end
begin
- require 'curb'
- require 'rubygems'
- require 'arvados/google_api_client'
require 'json'
+ require 'net/http'
require 'pp'
- require 'trollop'
+ require 'tempfile'
+ require 'yaml'
+rescue LoadError => error
+ abort "Error loading libraries: #{error}\n"
+end
+
+begin
+ require 'rubygems'
+ # Load the gems with more requirements first, so we respect any version
+ # constraints they put on gems loaded later.
+ require 'arvados/google_api_client'
+ require 'active_support/inflector'
require 'andand'
+ require 'curb'
require 'oj'
- require 'active_support/inflector'
- require 'yaml'
- require 'tempfile'
- require 'net/http'
-rescue LoadError
+ require 'trollop'
+rescue LoadError => error
abort <<-EOS
+Error loading gems: #{error}
+
Please install all required gems:
- gem install activesupport andand curb google-api-client json oj trollop yaml
+ gem install arvados activesupport andand curb json oj trollop
EOS
end
end
end
- discovered_params.each do |k,v|
+ discovered_params.merge({resource => {'type' => 'object'}}).each do |k,v|
k = k.to_sym
if ['object', 'array'].index(v["type"]) and method_opts.has_key? k
if method_opts[k].andand.match /^\//
require 'trollop'
require 'google/api_client'
rescue LoadError => l
- puts $:
+ $stderr.puts $:
abort <<-EOS
#{$0}: fatal: #{l.message}
Some runtime dependencies may be missing.
abort "#{$0}: syntax error: --instance cannot be combined with --template or --submit."
end
elsif not $options[:template]
- puts "error: you must supply a --template or --instance."
+ $stderr.puts "error: you must supply a --template or --instance."
p.educate
abort
end
Log (undef, "docker image hash is $docker_hash");
$docker_stream =~ s/^\.//;
my $docker_install_script = qq{
-if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then
- arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
+if $docker_bin images -q --no-trunc --all | grep -xF \Q$docker_hash\E >/dev/null; then
+ exit 0
+fi
+declare -a exit_codes=("\${PIPESTATUS[@]}")
+if [ 0 != "\${exit_codes[0]}" ]; then
+ exit "\${exit_codes[0]}" # `docker images` failed
+elif [ 1 != "\${exit_codes[1]}" ]; then
+ exit "\${exit_codes[1]}" # `grep` encountered an error
+else
+ # Everything worked fine, but grep didn't find the image on this host.
+ arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
fi
};
.q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
.q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
- ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP ";
+ ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP "
+ .q{&& declare -a VOLUMES=() }
+ .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner") ; fi }
+ .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt") ; }
+ .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt") ; fi };
$command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
$ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
# For now, use the same approach as TASK_WORK above.
$ENV{"JOB_WORK"} = "/tmp/crunch-job-work";
+ # Bind mount the crunchrunner binary and host TLS certificates file into
+ # the container.
+ $command .= '"${VOLUMES[@]}" ';
+
while (my ($env_key, $env_val) = each %ENV)
{
if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) {
. $slot[$proc{$pid}->{slot}]->{cpu});
my $jobstepidx = $proc{$pid}->{jobstepidx};
- if (!WIFEXITED($childstatus))
- {
- # child did not exit (may be temporarily stopped)
- Log ($jobstepidx, "child $pid did not actually exit in reapchildren, ignoring for now.");
- next;
- }
-
$children_reaped++;
my $elapsed = time - $proc{$pid}->{time};
my $Jobstep = $jobstep[$jobstepidx];
sub preprocess_stderr
{
my $jobstepidx = shift;
+ # slotindex is only defined for children running Arvados job tasks.
+ # Be prepared to handle the undef case (for setup srun calls, etc.).
+ my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) {
my $line = $1;
# whoa.
$main::please_freeze = 1;
}
- elsif (!exists $jobstep[$jobstepidx]->{slotindex}) {
- # Skip the following tempfail checks if this srun proc isn't
- # attached to a particular worker slot.
- }
elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
- my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
- $slot[$job_slot_index]->{node}->{fail_count}++;
$jobstep[$jobstepidx]->{tempfail} = 1;
- ban_node_by_slot($job_slot_index);
+ if (defined($job_slot_index)) {
+ $slot[$job_slot_index]->{node}->{fail_count}++;
+ ban_node_by_slot($job_slot_index);
+ }
}
elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
$jobstep[$jobstepidx]->{tempfail} = 1;
- ban_node_by_slot($jobstep[$jobstepidx]->{slotindex});
+ ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
}
elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
$jobstep[$jobstepidx]->{tempfail} = 1;
delete $reader{$jobstepidx};
my $j = pop @jobstep;
+ # If the srun showed signs of tempfail, ensure the caller treats that as a
+ # failure case.
+ if ($main::please_freeze || $j->{tempfail}) {
+ $exited ||= 255;
+ }
return ($exited, $j->{stdout_captured}, $j->{stderr_captured});
}
require 'minitest/autorun'
require 'digest/md5'
require 'active_support/core_ext'
+require 'tempfile'
class TestCollectionCreate < Minitest::Test
def setup
end
assert /^([0-9a-z]{5}-4zz18-[0-9a-z]{15})?$/.match(out)
assert_equal '', err
- $stderr.puts err
+ end
+
+ def test_read_resource_object_from_file
+ tempfile = Tempfile.new('collection')
+ begin
+ tempfile.write({manifest_text: foo_manifest}.to_json)
+ tempfile.close
+ out, err = capture_subprocess_io do
+ assert_arv('--format', 'uuid',
+ 'collection', 'create', '--collection', tempfile.path)
+ end
+ assert /^([0-9a-z]{5}-4zz18-[0-9a-z]{15})?$/.match(out)
+ assert_equal '', err
+ ensure
+ tempfile.unlink
+ end
end
protected
--- /dev/null
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
--- /dev/null
+include LICENSE-2.0.txt
+include README.rst
#!/usr/bin/env python
+# Implement cwl-runner interface for submitting and running jobs on Arvados.
+
import argparse
import arvados
-import arvados.events
+import arvados.collection
import arvados.commands.keepdocker
import arvados.commands.run
-import arvados.collection
+import arvados.events
import arvados.util
-import cwltool.draft2tool
-import cwltool.workflow
-import cwltool.main
-from cwltool.process import shortname
-import threading
+import copy
import cwltool.docker
+from cwltool.draft2tool import revmap_file, remove_hostfs, CommandLineTool
+from cwltool.errors import WorkflowException
+import cwltool.main
+import cwltool.workflow
import fnmatch
+from functools import partial
+import json
import logging
-import re
import os
+import pkg_resources # part of setuptools
+import re
import sys
+import threading
+from cwltool.load_tool import fetch_document
+from cwltool.builder import Builder
+import urlparse
-from cwltool.process import get_feature
+from cwltool.process import shortname, get_feature, adjustFiles, adjustFileObjs, scandeps
from arvados.api import OrderedJsonModel
logger = logging.getLogger('arvados.cwl-runner')
logger.setLevel(logging.INFO)
-crunchrunner_pdh = "83db29f08544e1c319572a6bd971088a+140"
-crunchrunner_download = "https://cloud.curoverse.com/collections/download/qr1hi-4zz18-n3m1yxd0vx78jic/1i1u2qtq66k1atziv4ocfgsg5nu5tj11n4r6e0bhvjg03rix4m/crunchrunner"
-certs_download = "https://cloud.curoverse.com/collections/download/qr1hi-4zz18-n3m1yxd0vx78jic/1i1u2qtq66k1atziv4ocfgsg5nu5tj11n4r6e0bhvjg03rix4m/ca-certificates.crt"
-
tmpdirre = re.compile(r"^\S+ \S+ \d+ \d+ stderr \S+ \S+ crunchrunner: \$\(task\.tmpdir\)=(.*)")
outdirre = re.compile(r"^\S+ \S+ \d+ \d+ stderr \S+ \S+ crunchrunner: \$\(task\.outdir\)=(.*)")
keepre = re.compile(r"^\S+ \S+ \d+ \d+ stderr \S+ \S+ crunchrunner: \$\(task\.keep\)=(.*)")
def arv_docker_get_image(api_client, dockerRequirement, pull_image, project_uuid):
+    """Check if a Docker image is available in Keep; if not, upload it using arv-keepdocker."""
+
if "dockerImageId" not in dockerRequirement and "dockerPull" in dockerRequirement:
dockerRequirement["dockerImageId"] = dockerRequirement["dockerPull"]
if image_tag:
args.append(image_tag)
logger.info("Uploading Docker image %s", ":".join(args[1:]))
- arvados.commands.keepdocker.main(args)
+ arvados.commands.keepdocker.main(args, stdout=sys.stderr)
return dockerRequirement["dockerImageId"]
class CollectionFsAccess(cwltool.process.StdFsAccess):
+ """Implement the cwltool FsAccess interface for Arvados Collections."""
+
def __init__(self, basedir):
+ super(CollectionFsAccess, self).__init__(basedir)
self.collections = {}
- self.basedir = basedir
def get_collection(self, path):
p = path.split("/")
return os.path.exists(self._abs(fn))
class ArvadosJob(object):
+ """Submit and manage a Crunch job for executing a CWL CommandLineTool."""
+
def __init__(self, runner):
self.arvrunner = runner
self.running = False
(docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
if docker_req and kwargs.get("use_container") is not False:
runtime_constraints["docker_image"] = arv_docker_get_image(self.arvrunner.api, docker_req, pull_image, self.arvrunner.project_uuid)
+ else:
+ runtime_constraints["docker_image"] = "arvados/jobs"
resources = self.builder.resources
if resources is not None:
runtime_constraints["min_ram_mb_per_node"] = resources.get("ram")
runtime_constraints["min_scratch_mb_per_node"] = resources.get("tmpdirSize", 0) + resources.get("outdirSize", 0)
+ filters = [["repository", "=", "arvados"],
+ ["script", "=", "crunchrunner"],
+ ["script_version", "in git", "9e5b98e8f5f4727856b53447191f9c06e3da2ba6"]]
+ if not self.arvrunner.ignore_docker_for_reuse:
+ filters.append(["docker_image_locator", "in docker", runtime_constraints["docker_image"]])
+
try:
- response = self.arvrunner.api.jobs().create(body={
- "owner_uuid": self.arvrunner.project_uuid,
- "script": "crunchrunner",
- "repository": "arvados",
- "script_version": "master",
- "minimum_script_version": "9e5b98e8f5f4727856b53447191f9c06e3da2ba6",
- "script_parameters": {"tasks": [script_parameters], "crunchrunner": crunchrunner_pdh+"/crunchrunner"},
- "runtime_constraints": runtime_constraints
- }, find_or_create=kwargs.get("enable_reuse", True)).execute(num_retries=self.arvrunner.num_retries)
+ response = self.arvrunner.api.jobs().create(
+ body={
+ "owner_uuid": self.arvrunner.project_uuid,
+ "script": "crunchrunner",
+ "repository": "arvados",
+ "script_version": "master",
+ "minimum_script_version": "9e5b98e8f5f4727856b53447191f9c06e3da2ba6",
+ "script_parameters": {"tasks": [script_parameters]},
+ "runtime_constraints": runtime_constraints
+ },
+ filters=filters,
+ find_or_create=kwargs.get("enable_reuse", True)
+ ).execute(num_retries=self.arvrunner.num_retries)
self.arvrunner.jobs[response["uuid"]] = self
- self.arvrunner.pipeline["components"][self.name] = {"job": response}
- self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances().update(uuid=self.arvrunner.pipeline["uuid"],
- body={
- "components": self.arvrunner.pipeline["components"]
- }).execute(num_retries=self.arvrunner.num_retries)
+ self.update_pipeline_component(response)
logger.info("Job %s (%s) is %s", self.name, response["uuid"], response["state"])
self.output_callback({}, "permanentFail")
def update_pipeline_component(self, record):
- self.arvrunner.pipeline["components"][self.name] = {"job": record}
- self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances().update(uuid=self.arvrunner.pipeline["uuid"],
+ if self.arvrunner.pipeline:
+ self.arvrunner.pipeline["components"][self.name] = {"job": record}
+ self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances().update(uuid=self.arvrunner.pipeline["uuid"],
body={
"components": self.arvrunner.pipeline["components"]
}).execute(num_retries=self.arvrunner.num_retries)
+ if self.arvrunner.uuid:
+ try:
+ job = self.arvrunner.api.jobs().get(uuid=self.arvrunner.uuid).execute()
+ if job:
+ components = job["components"]
+ components[self.name] = record["uuid"]
+ self.arvrunner.api.jobs().update(uuid=self.arvrunner.uuid,
+ body={
+ "components": components
+ }).execute(num_retries=self.arvrunner.num_retries)
+ except Exception as e:
+ logger.info("Error adding to components: %s", e)
def done(self, record):
try:
tmpdir = None
outdir = None
keepdir = None
- for l in log.readlines():
+ for l in log:
+ # Determine the tmpdir, outdir and keepdir paths from
+ # the job run. Unfortunately, we can't take the first
+ # values we find (which are expected to be near the
+ # top) and stop scanning because if the node fails and
+                # the job restarts on a different node, these values
+                # will differ between runs, and we need to know about the
+ # final run that actually produced output.
+
g = tmpdirre.match(l)
if g:
tmpdir = g.group(1)
g = keepre.match(l)
if g:
keepdir = g.group(1)
- if tmpdir and outdir and keepdir:
- break
+
+ colname = "Output %s of %s" % (record["output"][0:7], self.name)
+
+ # check if collection already exists with same owner, name and content
+ collection_exists = self.arvrunner.api.collections().list(
+ filters=[["owner_uuid", "=", self.arvrunner.project_uuid],
+ ['portable_data_hash', '=', record["output"]],
+ ["name", "=", colname]]
+ ).execute(num_retries=self.arvrunner.num_retries)
+
+ if not collection_exists["items"]:
+ # Create a collection located in the same project as the
+ # pipeline with the contents of the output.
+ # First, get output record.
+ collections = self.arvrunner.api.collections().list(
+ limit=1,
+ filters=[['portable_data_hash', '=', record["output"]]],
+ select=["manifest_text"]
+ ).execute(num_retries=self.arvrunner.num_retries)
+
+ if not collections["items"]:
+ raise WorkflowException(
+ "Job output '%s' cannot be found on API server" % (
+ record["output"]))
+
+ # Create new collection in the parent project
+ # with the output contents.
+ self.arvrunner.api.collections().create(body={
+ "owner_uuid": self.arvrunner.project_uuid,
+ "name": colname,
+ "portable_data_hash": record["output"],
+ "manifest_text": collections["items"][0]["manifest_text"]
+ }, ensure_unique_name=True).execute(
+ num_retries=self.arvrunner.num_retries)
self.builder.outdir = outdir
self.builder.pathmapper.keepdir = keepdir
outputs = self.collect_outputs("keep:" + record["output"])
+ except WorkflowException as e:
+ logger.error("Error while collecting job outputs:\n%s", e, exc_info=(e if self.arvrunner.debug else False))
+ processStatus = "permanentFail"
except Exception as e:
- logger.exception("Got exception while collecting job outputs:")
+ logger.exception("Got unknown exception while collecting job outputs:")
processStatus = "permanentFail"
self.output_callback(outputs, processStatus)
del self.arvrunner.jobs[record["uuid"]]
+class RunnerJob(object):
+ """Submit and manage a Crunch job that runs crunch_scripts/cwl-runner."""
+
+ def __init__(self, runner, tool, job_order, enable_reuse):
+ self.arvrunner = runner
+ self.tool = tool
+ self.job_order = job_order
+ self.running = False
+ self.enable_reuse = enable_reuse
+
+ def update_pipeline_component(self, record):
+ pass
+
+ def upload_docker(self, tool):
+ if isinstance(tool, CommandLineTool):
+ (docker_req, docker_is_req) = get_feature(tool, "DockerRequirement")
+ if docker_req:
+ arv_docker_get_image(self.arvrunner.api, docker_req, True, self.arvrunner.project_uuid)
+ elif isinstance(tool, cwltool.workflow.Workflow):
+ for s in tool.steps:
+ self.upload_docker(s.embedded_tool)
+
+ def arvados_job_spec(self, dry_run=False, pull_image=True, **kwargs):
+ """Create an Arvados job specification for this workflow.
+
+ The returned dict can be used to create a job (i.e., passed as
+ the +body+ argument to jobs().create()), or as a component in
+ a pipeline template or pipeline instance.
+ """
+ self.upload_docker(self.tool)
+
+ workflowfiles = set()
+ jobfiles = set()
+ workflowfiles.add(self.tool.tool["id"])
+
+ self.name = os.path.basename(self.tool.tool["id"])
+
+ def visitFiles(files, path):
+ files.add(path)
+ return path
+
+ document_loader, workflowobj, uri = fetch_document(self.tool.tool["id"])
+ def loadref(b, u):
+ return document_loader.fetch(urlparse.urljoin(b, u))
+
+ sc = scandeps(uri, workflowobj,
+ set(("$import", "run")),
+ set(("$include", "$schemas", "path")),
+ loadref)
+ adjustFiles(sc, partial(visitFiles, workflowfiles))
+ adjustFiles(self.job_order, partial(visitFiles, jobfiles))
+
+ workflowmapper = ArvPathMapper(self.arvrunner, workflowfiles, "",
+ "%s",
+ "%s/%s",
+ name=self.name,
+ **kwargs)
+
+ jobmapper = ArvPathMapper(self.arvrunner, jobfiles, "",
+ "%s",
+ "%s/%s",
+ name=os.path.basename(self.job_order.get("id", "#")),
+ **kwargs)
+
+ adjustFiles(self.job_order, lambda p: jobmapper.mapper(p)[1])
+
+ if "id" in self.job_order:
+ del self.job_order["id"]
+
+ self.job_order["cwl:tool"] = workflowmapper.mapper(self.tool.tool["id"])[1]
+ return {
+ "script": "cwl-runner",
+ "script_version": "master",
+ "repository": "arvados",
+ "script_parameters": self.job_order,
+ "runtime_constraints": {
+ "docker_image": "arvados/jobs"
+ }
+ }
+
+ def run(self, *args, **kwargs):
+ job_spec = self.arvados_job_spec(*args, **kwargs)
+ job_spec.setdefault("owner_uuid", self.arvrunner.project_uuid)
+
+ response = self.arvrunner.api.jobs().create(
+ body=job_spec,
+ find_or_create=self.enable_reuse
+ ).execute(num_retries=self.arvrunner.num_retries)
+
+ self.uuid = response["uuid"]
+ self.arvrunner.jobs[self.uuid] = self
+
+ logger.info("Submitted job %s", response["uuid"])
+
+ if kwargs.get("submit"):
+ self.pipeline = self.arvrunner.api.pipeline_instances().create(
+ body={
+ "owner_uuid": self.arvrunner.project_uuid,
+ "name": shortname(self.tool.tool["id"]),
+ "components": {"cwl-runner": {"job": {"uuid": self.uuid, "state": response["state"]} } },
+ "state": "RunningOnClient"}).execute(num_retries=self.arvrunner.num_retries)
+
+ if response["state"] in ("Complete", "Failed", "Cancelled"):
+ self.done(response)
+
+ def done(self, record):
+ if record["state"] == "Complete":
+ processStatus = "success"
+ else:
+ processStatus = "permanentFail"
+
+ outputs = None
+ try:
+ try:
+ outc = arvados.collection.Collection(record["output"])
+ with outc.open("cwl.output.json") as f:
+ outputs = json.load(f)
+ def keepify(path):
+ if not path.startswith("keep:"):
+ return "keep:%s/%s" % (record["output"], path)
+ adjustFiles(outputs, keepify)
+ except Exception as e:
+ logger.error("While getting final output object: %s", e)
+ self.arvrunner.output_callback(outputs, processStatus)
+ finally:
+ del self.arvrunner.jobs[record["uuid"]]
+
+
+class RunnerTemplate(object):
+ """An Arvados pipeline template that invokes a CWL workflow."""
+
+ type_to_dataclass = {
+ 'boolean': 'boolean',
+ 'File': 'File',
+ 'float': 'number',
+ 'int': 'number',
+ 'string': 'text',
+ }
+
+ def __init__(self, runner, tool, job_order, enable_reuse):
+ self.runner = runner
+ self.tool = tool
+ self.job = RunnerJob(
+ runner=runner,
+ tool=tool,
+ job_order=job_order,
+ enable_reuse=enable_reuse)
+
+ def pipeline_component_spec(self):
+ """Return a component that Workbench and a-r-p-i will understand.
+
+ Specifically, translate CWL input specs to Arvados pipeline
+ format, like {"dataclass":"File","value":"xyz"}.
+ """
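+        # For example (editor's sketch, values illustrative): a CWL input like
+        #   {"id": "#fileInput", "type": "File", "label": "A file"}
+        # is translated by the loop below into a script parameter like
+        #   {"type": "File", "dataclass": "File", "required": True, "title": "A file"}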
+ spec = self.job.arvados_job_spec()
+
+ # Most of the component spec is exactly the same as the job
+ # spec (script, script_version, etc.).
+ # spec['script_parameters'] isn't right, though. A component
+ # spec's script_parameters hash is a translation of
+ # self.tool.tool['inputs'] with defaults/overrides taken from
+ # the job order. So we move the job parameters out of the way
+ # and build a new spec['script_parameters'].
+ job_params = spec['script_parameters']
+ spec['script_parameters'] = {}
+
+ for param in self.tool.tool['inputs']:
+ param = copy.deepcopy(param)
+
+ # Data type and "required" flag...
+ types = param['type']
+ if not isinstance(types, list):
+ types = [types]
+ param['required'] = 'null' not in types
+ non_null_types = set(types) - set(['null'])
+ if len(non_null_types) == 1:
+ the_type = [c for c in non_null_types][0]
+ dataclass = self.type_to_dataclass.get(the_type)
+ if dataclass:
+ param['dataclass'] = dataclass
+ # Note: If we didn't figure out a single appropriate
+ # dataclass, we just left that attribute out. We leave
+ # the "type" attribute there in any case, which might help
+ # downstream.
+
+ # Title and description...
+ title = param.pop('label', '')
+ descr = param.pop('description', '').rstrip('\n')
+ if title:
+ param['title'] = title
+ if descr:
+ param['description'] = descr
+
+ # Fill in the value from the current job order, if any.
+ param_id = shortname(param.pop('id'))
+ value = job_params.get(param_id)
+ if value is None:
+ pass
+ elif not isinstance(value, dict):
+ param['value'] = value
+ elif param.get('dataclass') == 'File' and value.get('path'):
+ param['value'] = value['path']
+
+ spec['script_parameters'][param_id] = param
+ spec['script_parameters']['cwl:tool'] = job_params['cwl:tool']
+ return spec
+
+ def save(self):
+ job_spec = self.pipeline_component_spec()
+ response = self.runner.api.pipeline_templates().create(body={
+ "components": {
+ self.job.name: job_spec,
+ },
+ "name": self.job.name,
+ "owner_uuid": self.runner.project_uuid,
+ }, ensure_unique_name=True).execute(num_retries=self.runner.num_retries)
+ self.uuid = response["uuid"]
+ logger.info("Created template %s", self.uuid)
+
+
class ArvPathMapper(cwltool.pathmapper.PathMapper):
- def __init__(self, arvrunner, referenced_files, basedir, **kwargs):
+ """Convert container-local paths to and from Keep collection ids."""
+
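+    # The %-style patterns control how Keep references are rendered. For
+    # example (sketch, values taken from the call sites in this file):
+    # ArvadosCommandTool passes collection_pattern="$(task.keep)/%s" and
+    # file_pattern="$(task.keep)/%s/%s", while RunnerJob passes "%s" and
+    # "%s/%s" to get bare Keep locators.
+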
+ def __init__(self, arvrunner, referenced_files, input_basedir,
+ collection_pattern, file_pattern, name=None, **kwargs):
self._pathmap = arvrunner.get_uploaded()
- uploadfiles = []
+ uploadfiles = set()
pdh_path = re.compile(r'^keep:[0-9a-f]{32}\+\d+/.+')
for src in referenced_files:
if isinstance(src, basestring) and pdh_path.match(src):
- self._pathmap[src] = (src, "$(task.keep)/%s" % src[5:])
+ self._pathmap[src] = (src, collection_pattern % src[5:])
+ if "#" in src:
+ src = src[:src.index("#")]
if src not in self._pathmap:
- ab = cwltool.pathmapper.abspath(src, basedir)
- st = arvados.commands.run.statfile("", ab, fnPattern="$(task.keep)/%s/%s")
+ ab = cwltool.pathmapper.abspath(src, input_basedir)
+ st = arvados.commands.run.statfile("", ab, fnPattern=file_pattern)
if kwargs.get("conformance_test"):
self._pathmap[src] = (src, ab)
elif isinstance(st, arvados.commands.run.UploadFile):
- uploadfiles.append((src, ab, st))
+ uploadfiles.add((src, ab, st))
elif isinstance(st, arvados.commands.run.ArvFile):
self._pathmap[src] = (ab, st.fn)
else:
arvrunner.api,
dry_run=kwargs.get("dry_run"),
num_retries=3,
- fnPattern="$(task.keep)/%s/%s",
+ fnPattern=file_pattern,
+ name=name,
project=arvrunner.project_uuid)
for src, ab, st in uploadfiles:
return super(ArvPathMapper, self).reversemap(target)
-class ArvadosCommandTool(cwltool.draft2tool.CommandLineTool):
+class ArvadosCommandTool(CommandLineTool):
+ """Wrap cwltool CommandLineTool to override selected methods."""
+
def __init__(self, arvrunner, toolpath_object, **kwargs):
super(ArvadosCommandTool, self).__init__(toolpath_object, **kwargs)
self.arvrunner = arvrunner
def makeJobRunner(self):
return ArvadosJob(self.arvrunner)
- def makePathMapper(self, reffiles, input_basedir, **kwargs):
- return ArvPathMapper(self.arvrunner, reffiles, input_basedir, **kwargs)
+ def makePathMapper(self, reffiles, **kwargs):
+ return ArvPathMapper(self.arvrunner, reffiles, kwargs["basedir"],
+ "$(task.keep)/%s",
+ "$(task.keep)/%s/%s",
+ **kwargs)
class ArvCwlRunner(object):
+ """Execute a CWL tool or workflow, submit crunch jobs, wait for them to
+ complete, and report output."""
+
def __init__(self, api_client):
self.api = api_client
self.jobs = {}
self.final_output = None
self.uploaded = {}
self.num_retries = 4
+ self.uuid = None
def arvMakeTool(self, toolpath_object, **kwargs):
if "class" in toolpath_object and toolpath_object["class"] == "CommandLineTool":
def output_callback(self, out, processStatus):
if processStatus == "success":
logger.info("Overall job status is %s", processStatus)
- self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
- body={"state": "Complete"}).execute(num_retries=self.num_retries)
+ if self.pipeline:
+ self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
+ body={"state": "Complete"}).execute(num_retries=self.num_retries)
else:
logger.warn("Overall job status is %s", processStatus)
- self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
- body={"state": "Failed"}).execute(num_retries=self.num_retries)
+ if self.pipeline:
+ self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
+ body={"state": "Failed"}).execute(num_retries=self.num_retries)
self.final_output = out
-
def on_message(self, event):
if "object_uuid" in event:
if event["object_uuid"] in self.jobs and event["event_type"] == "update":
def add_uploaded(self, src, pair):
self.uploaded[src] = pair
- def arvExecutor(self, tool, job_order, input_basedir, args, **kwargs):
- events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message)
+ def arvExecutor(self, tool, job_order, **kwargs):
+ self.debug = kwargs.get("debug")
- try:
- self.api.collections().get(uuid=crunchrunner_pdh).execute()
- except arvados.errors.ApiError as e:
- import httplib2
- h = httplib2.Http(ca_certs=arvados.util.ca_certs_path())
- resp, content = h.request(crunchrunner_download, "GET")
- resp2, content2 = h.request(certs_download, "GET")
- with arvados.collection.Collection() as col:
- with col.open("crunchrunner", "w") as f:
- f.write(content)
- with col.open("ca-certificates.crt", "w") as f:
- f.write(content2)
-
- col.save_new("crunchrunner binary", ensure_unique_name=True)
-
- self.fs_access = CollectionFsAccess(input_basedir)
+ if kwargs.get("quiet"):
+ logger.setLevel(logging.WARN)
+ logging.getLogger('arvados.arv-run').setLevel(logging.WARN)
- kwargs["fs_access"] = self.fs_access
- kwargs["enable_reuse"] = args.enable_reuse
+ useruuid = self.api.users().current().execute()["uuid"]
+ self.project_uuid = kwargs.get("project_uuid") if kwargs.get("project_uuid") else useruuid
+ self.pipeline = None
- kwargs["outdir"] = "$(task.outdir)"
- kwargs["tmpdir"] = "$(task.tmpdir)"
+ if kwargs.get("create_template"):
+ tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse"))
+ tmpl.save()
+ # cwltool.main will write our return value to stdout.
+ return tmpl.uuid
- useruuid = self.api.users().current().execute()["uuid"]
- self.project_uuid = args.project_uuid if args.project_uuid else useruuid
+ if kwargs.get("submit"):
+ runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse"))
- if kwargs.get("conformance_test"):
- return cwltool.main.single_job_executor(tool, job_order, input_basedir, args, **kwargs)
- else:
+ if not kwargs.get("submit") and "cwl_runner_job" not in kwargs:
+ # Create pipeline for local run
self.pipeline = self.api.pipeline_instances().create(
body={
"owner_uuid": self.project_uuid,
"name": shortname(tool.tool["id"]),
"components": {},
"state": "RunningOnClient"}).execute(num_retries=self.num_retries)
-
logger.info("Pipeline instance %s", self.pipeline["uuid"])
- jobiter = tool.job(job_order,
- input_basedir,
- self.output_callback,
- docker_outdir="$(task.outdir)",
- **kwargs)
+ if kwargs.get("submit") and not kwargs.get("wait"):
+ runnerjob.run()
+ return runnerjob.uuid
+
+ events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message)
+
+ self.debug = kwargs.get("debug")
+ self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")
+ self.fs_access = CollectionFsAccess(kwargs["basedir"])
+
+ kwargs["fs_access"] = self.fs_access
+ kwargs["enable_reuse"] = kwargs.get("enable_reuse")
+
+ kwargs["outdir"] = "$(task.outdir)"
+ kwargs["tmpdir"] = "$(task.tmpdir)"
+
+ if kwargs.get("conformance_test"):
+ return cwltool.main.single_job_executor(tool, job_order, **kwargs)
+ else:
+ if kwargs.get("submit"):
+ jobiter = iter((runnerjob,))
+ else:
+ if "cwl_runner_job" in kwargs:
+ self.uuid = kwargs.get("cwl_runner_job").get('uuid')
+ jobiter = tool.job(job_order,
+ self.output_callback,
+ docker_outdir="$(task.outdir)",
+ **kwargs)
try:
self.cond.acquire()
self.cond.wait(1)
events.close()
-
- if self.final_output is None:
- raise cwltool.workflow.WorkflowException("Workflow did not return a result.")
-
- # create final output collection
except:
if sys.exc_info()[0] is KeyboardInterrupt:
logger.error("Interrupted, marking pipeline as failed")
else:
- logger.exception("Caught unhandled exception, marking pipeline as failed")
- self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
- body={"state": "Failed"}).execute(num_retries=self.num_retries)
+ logger.error("Caught unhandled exception, marking pipeline as failed. Error was: %s", sys.exc_info()[0], exc_info=(sys.exc_info()[1] if self.debug else False))
+ if self.pipeline:
+ self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
+ body={"state": "Failed"}).execute(num_retries=self.num_retries)
finally:
self.cond.release()
+ if self.final_output is None:
+ raise cwltool.workflow.WorkflowException("Workflow did not return a result.")
+
return self.final_output
+def versionstring():
+    """Return version strings of the key packages, for provenance and debugging."""
+
+ arvcwlpkg = pkg_resources.require("arvados-cwl-runner")
+ arvpkg = pkg_resources.require("arvados-python-client")
+ cwlpkg = pkg_resources.require("cwltool")
+
+ return "%s %s, %s %s, %s %s" % (sys.argv[0], arvcwlpkg[0].version,
+ "arvados-python-client", arvpkg[0].version,
+ "cwltool", cwlpkg[0].version)
+
+def arg_parser(): # type: () -> argparse.ArgumentParser
+ parser = argparse.ArgumentParser(description='Arvados executor for Common Workflow Language')
+
+ parser.add_argument("--conformance-test", action="store_true")
+ parser.add_argument("--basedir", type=str,
+                        help="Base directory used to resolve relative references in the input; defaults to the directory of the input object file, or the current directory if inputs are piped or given on the command line.")
+ parser.add_argument("--outdir", type=str, default=os.path.abspath('.'),
+ help="Output directory, default current directory")
+
+ parser.add_argument("--eval-timeout",
+                        help="Time to wait for a JavaScript expression to evaluate before giving an error, default 20s.",
+ type=float,
+ default=20)
+ parser.add_argument("--version", action="store_true", help="Print version and exit")
+
+ exgroup = parser.add_mutually_exclusive_group()
+ exgroup.add_argument("--verbose", action="store_true", help="Default logging")
+ exgroup.add_argument("--quiet", action="store_true", help="Only print warnings and errors.")
+ exgroup.add_argument("--debug", action="store_true", help="Print even more logging")
+
+ parser.add_argument("--tool-help", action="store_true", help="Print command line help for tool")
-def main(args, stdout, stderr, api_client=None):
- args.insert(0, "--leave-outputs")
- parser = cwltool.main.arg_parser()
exgroup = parser.add_mutually_exclusive_group()
exgroup.add_argument("--enable-reuse", action="store_true",
default=True, dest="enable_reuse",
exgroup.add_argument("--disable-reuse", action="store_false",
default=True, dest="enable_reuse",
help="")
- parser.add_argument("--project-uuid", type=str, help="Project that will own the workflow jobs")
+
+    parser.add_argument("--project-uuid", type=str, help="Project that will own the workflow jobs. If not provided, jobs go to the user's home project.")
+ parser.add_argument("--ignore-docker-for-reuse", action="store_true",
+ help="Ignore Docker image version when deciding whether to reuse past jobs.",
+ default=False)
+
+ exgroup = parser.add_mutually_exclusive_group()
+ exgroup.add_argument("--submit", action="store_true", help="Submit workflow to run on Arvados.",
+ default=True, dest="submit")
+ exgroup.add_argument("--local", action="store_false", help="Run workflow on local host (submits jobs to Arvados).",
+ default=True, dest="submit")
+ exgroup.add_argument("--create-template", action="store_true", help="Create an Arvados pipeline template.")
+
+ exgroup = parser.add_mutually_exclusive_group()
+ exgroup.add_argument("--wait", action="store_true", help="After submitting workflow runner job, wait for completion.",
+ default=True, dest="wait")
+ exgroup.add_argument("--no-wait", action="store_false", help="Submit workflow runner job and exit.",
+ default=True, dest="wait")
+
+ parser.add_argument("workflow", type=str, nargs="?", default=None)
+ parser.add_argument("job_order", nargs=argparse.REMAINDER)
+
+ return parser
+
+def main(args, stdout, stderr, api_client=None):
+ parser = arg_parser()
+
+ job_order_object = None
+ arvargs = parser.parse_args(args)
+ if arvargs.create_template and not arvargs.job_order:
+ job_order_object = ({}, "")
try:
- runner = ArvCwlRunner(api_client=arvados.api('v1', model=OrderedJsonModel()))
+ if api_client is None:
+ api_client=arvados.api('v1', model=OrderedJsonModel())
+ runner = ArvCwlRunner(api_client)
except Exception as e:
logger.error(e)
return 1
- return cwltool.main.main(args, executor=runner.arvExecutor, makeTool=runner.arvMakeTool, parser=parser)
+ return cwltool.main.main(args=arvargs,
+ stdout=stdout,
+ stderr=stderr,
+ executor=runner.arvExecutor,
+ makeTool=runner.arvMakeTool,
+ versionfunc=versionstring,
+ job_order_object=job_order_object)
'bin/arvados-cwl-runner'
],
install_requires=[
- 'cwltool>=1.0.20160311170456',
- 'arvados-python-client>=0.1.20160219154918'
+ 'cwltool==1.0.20160519182434',
+ 'arvados-python-client>=0.1.20160322001610'
+ ],
+ data_files=[
+ ('share/doc/arvados-cwl-runner', ['LICENSE-2.0.txt', 'README.rst']),
],
test_suite='tests',
tests_require=['mock>=1.0'],
config=$2
shift ; shift
;;
+ -h|--help)
+ echo "$0 [--no-reset-container] [--leave-running] [--config dev|localdemo]"
+ exit
+ ;;
-*)
break
;;
export ARVADOS_API_HOST=localhost:8000
export ARVADOS_API_HOST_INSECURE=1
export ARVADOS_API_TOKEN=\$(cat /var/lib/arvados/superuser_token)
+
+arv-keepdocker --pull arvados/jobs
+
env
exec ./run_test.sh "$@"
EOF
--- /dev/null
+blopper blubber
--- /dev/null
+import difflib
+import json
+
+
+class JsonDiffMatcher(object):
+ """Raise AssertionError with a readable JSON diff when not __eq__().
+
+ Used with assert_called_with() so it's possible for a human to see
+ the differences between expected and actual call arguments that
+ include non-trivial data structures.
+ """
+ def __init__(self, expected):
+ self.expected = expected
+
+ def __eq__(self, actual):
+ expected_json = json.dumps(self.expected, sort_keys=True, indent=2)
+ actual_json = json.dumps(actual, sort_keys=True, indent=2)
+ if expected_json != actual_json:
+ raise AssertionError("".join(difflib.context_diff(
+ expected_json.splitlines(1),
+ actual_json.splitlines(1),
+ fromfile="Expected", tofile="Actual")))
+ return True
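+
+
+# Example usage (editor's sketch):
+#   api.pipeline_templates().create.assert_called_with(
+#       body=JsonDiffMatcher(expected_template), ensure_unique_name=True)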
--- /dev/null
+{
+ "fileInput": {
+ "class": "File",
+ "path": "../input/blorp.txt"
+ },
+ "boolInput": true,
+ "floatInput": 1.234,
+ "optionalFloatInput": null
+}
--- /dev/null
+{
+ "x": {
+ "class": "File",
+ "path": "input/blorp.txt"
+ }
+}
-import unittest
-import mock
import arvados_cwl
+import logging
+import mock
+import unittest
+import os
+import cwltool.process
+
+if not os.getenv('ARVADOS_DEBUG'):
+ logging.getLogger('arvados.cwl-runner').setLevel(logging.WARN)
+ logging.getLogger('arvados.arv-run').setLevel(logging.WARN)
+
class TestJob(unittest.TestCase):
def test_run(self):
runner = mock.MagicMock()
runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+ runner.ignore_docker_for_reuse = False
+ document_loader, avsc_names, schema_metadata, metaschema_loader = cwltool.process.get_schema("draft-3")
+
tool = {
"inputs": [],
"outputs": [],
"baseCommand": "ls"
}
- arvtool = arvados_cwl.ArvadosCommandTool(runner, tool)
+ arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, avsc_names=avsc_names, basedir="")
arvtool.formatgraph = None
- for j in arvtool.job({}, "", mock.MagicMock()):
+ for j in arvtool.job({}, mock.MagicMock(), basedir=""):
j.run()
- runner.api.jobs().create.assert_called_with(body={
- 'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
- 'runtime_constraints': {},
- 'script_parameters': {
- 'tasks': [{
- 'task.env': {'TMPDIR': '$(task.tmpdir)'},
- 'command': ['ls']
- }],
- 'crunchrunner': '83db29f08544e1c319572a6bd971088a+140/crunchrunner'
- },
- 'script_version': 'master',
- 'minimum_script_version': '9e5b98e8f5f4727856b53447191f9c06e3da2ba6',
- 'repository': 'arvados',
- 'script': 'crunchrunner',
- 'runtime_constraints': {
- 'min_cores_per_node': 1,
- 'min_ram_mb_per_node': 1024,
- 'min_scratch_mb_per_node': 2048 # tmpdirSize + outdirSize
- }
- }, find_or_create=True)
+ runner.api.jobs().create.assert_called_with(
+ body={
+ 'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
+ 'runtime_constraints': {},
+ 'script_parameters': {
+ 'tasks': [{
+ 'task.env': {'TMPDIR': '$(task.tmpdir)'},
+ 'command': ['ls']
+ }],
+ },
+ 'script_version': 'master',
+ 'minimum_script_version': '9e5b98e8f5f4727856b53447191f9c06e3da2ba6',
+ 'repository': 'arvados',
+ 'script': 'crunchrunner',
+ 'runtime_constraints': {
+ 'docker_image': 'arvados/jobs',
+ 'min_cores_per_node': 1,
+ 'min_ram_mb_per_node': 1024,
+ 'min_scratch_mb_per_node': 2048 # tmpdirSize + outdirSize
+ }
+ },
+ find_or_create=True,
+ filters=[['repository', '=', 'arvados'],
+ ['script', '=', 'crunchrunner'],
+ ['script_version', 'in git', '9e5b98e8f5f4727856b53447191f9c06e3da2ba6'],
+ ['docker_image_locator', 'in docker', 'arvados/jobs']]
+ )
# The test passes some fields in builder.resources
# For the remaining fields, the defaults will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
def test_resource_requirements(self):
runner = mock.MagicMock()
runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+ runner.ignore_docker_for_reuse = False
+ document_loader, avsc_names, schema_metadata, metaschema_loader = cwltool.process.get_schema("draft-3")
+
tool = {
"inputs": [],
"outputs": [],
}],
"baseCommand": "ls"
}
- arvtool = arvados_cwl.ArvadosCommandTool(runner, tool)
+ arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, avsc_names=avsc_names)
arvtool.formatgraph = None
- for j in arvtool.job({}, "", mock.MagicMock()):
+ for j in arvtool.job({}, mock.MagicMock(), basedir=""):
j.run()
- runner.api.jobs().create.assert_called_with(body={
- 'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
- 'runtime_constraints': {},
- 'script_parameters': {
- 'tasks': [{
- 'task.env': {'TMPDIR': '$(task.tmpdir)'},
- 'command': ['ls']
- }],
- 'crunchrunner': '83db29f08544e1c319572a6bd971088a+140/crunchrunner'
+ runner.api.jobs().create.assert_called_with(
+ body={
+ 'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
+ 'runtime_constraints': {},
+ 'script_parameters': {
+ 'tasks': [{
+ 'task.env': {'TMPDIR': '$(task.tmpdir)'},
+ 'command': ['ls']
+ }]
},
'script_version': 'master',
- 'minimum_script_version': '9e5b98e8f5f4727856b53447191f9c06e3da2ba6',
- 'repository': 'arvados',
- 'script': 'crunchrunner',
- 'runtime_constraints': {
- 'min_cores_per_node': 3,
- 'min_ram_mb_per_node': 3000,
- 'min_scratch_mb_per_node': 5024 # tmpdirSize + outdirSize
- }
- }, find_or_create=True)
+ 'minimum_script_version': '9e5b98e8f5f4727856b53447191f9c06e3da2ba6',
+ 'repository': 'arvados',
+ 'script': 'crunchrunner',
+ 'runtime_constraints': {
+ 'docker_image': 'arvados/jobs',
+ 'min_cores_per_node': 3,
+ 'min_ram_mb_per_node': 3000,
+ 'min_scratch_mb_per_node': 5024 # tmpdirSize + outdirSize
+ }
+ },
+ find_or_create=True,
+ filters=[['repository', '=', 'arvados'],
+ ['script', '=', 'crunchrunner'],
+ ['script_version', 'in git', '9e5b98e8f5f4727856b53447191f9c06e3da2ba6'],
+ ['docker_image_locator', 'in docker', 'arvados/jobs']])
+
+ @mock.patch("arvados.collection.Collection")
+ def test_done(self, col):
+ api = mock.MagicMock()
+
+ runner = mock.MagicMock()
+ runner.api = api
+ runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+ runner.num_retries = 0
+ runner.ignore_docker_for_reuse = False
+
+ col().open.return_value = []
+ api.collections().list().execute.side_effect = ({"items": []},
+ {"items": [{"manifest_text": "XYZ"}]})
+
+ arvjob = arvados_cwl.ArvadosJob(runner)
+ arvjob.name = "testjob"
+ arvjob.builder = mock.MagicMock()
+ arvjob.output_callback = mock.MagicMock()
+ arvjob.collect_outputs = mock.MagicMock()
+
+ arvjob.done({
+ "state": "Complete",
+ "output": "99999999999999999999999999999993+99",
+ "log": "99999999999999999999999999999994+99",
+ "uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+ })
+
+ api.collections().list.assert_has_calls([
+ mock.call(),
+ mock.call(filters=[['owner_uuid', '=', 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'],
+ ['portable_data_hash', '=', '99999999999999999999999999999993+99'],
+ ['name', '=', 'Output 9999999 of testjob']]),
+ mock.call().execute(num_retries=0),
+ mock.call(limit=1, filters=[['portable_data_hash', '=', '99999999999999999999999999999993+99']],
+ select=['manifest_text']),
+ mock.call().execute(num_retries=0)])
+
+ api.collections().create.assert_called_with(
+ ensure_unique_name=True,
+ body={'portable_data_hash': '99999999999999999999999999999993+99',
+ 'manifest_text': 'XYZ',
+ 'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
+ 'name': 'Output 9999999 of testjob'})
+
+ @mock.patch("arvados.collection.Collection")
+ def test_done_use_existing_collection(self, col):
+ api = mock.MagicMock()
+
+ runner = mock.MagicMock()
+ runner.api = api
+ runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+ runner.num_retries = 0
+
+ col().open.return_value = []
+ api.collections().list().execute.side_effect = ({"items": [{"uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2"}]},)
+
+ arvjob = arvados_cwl.ArvadosJob(runner)
+ arvjob.name = "testjob"
+ arvjob.builder = mock.MagicMock()
+ arvjob.output_callback = mock.MagicMock()
+ arvjob.collect_outputs = mock.MagicMock()
+
+ arvjob.done({
+ "state": "Complete",
+ "output": "99999999999999999999999999999993+99",
+ "log": "99999999999999999999999999999994+99",
+ "uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+ })
+
+ api.collections().list.assert_has_calls([
+ mock.call(),
+ mock.call(filters=[['owner_uuid', '=', 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'],
+ ['portable_data_hash', '=', '99999999999999999999999999999993+99'],
+ ['name', '=', 'Output 9999999 of testjob']]),
+ mock.call().execute(num_retries=0)])
+
+ self.assertFalse(api.collections().create.called)
--- /dev/null
+import arvados
+import arvados.keep
+import arvados.collection
+import arvados_cwl
+import copy
+import cStringIO
+import functools
+import hashlib
+import mock
+import sys
+import unittest
+
+from .matcher import JsonDiffMatcher
+
+
+def stubs(func):
+ @functools.wraps(func)
+ @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
+ @mock.patch("arvados.collection.KeepClient")
+ @mock.patch("arvados.events.subscribe")
+ def wrapped(self, events, KeepClient, keepdocker, *args, **kwargs):
+ class Stubs:
+ pass
+ stubs = Stubs()
+ stubs.events = events
+ stubs.KeepClient = KeepClient
+ stubs.keepdocker = keepdocker
+
+ def putstub(p, **kwargs):
+ return "%s+%i" % (hashlib.md5(p).hexdigest(), len(p))
+ stubs.KeepClient().put.side_effect = putstub
+
+ stubs.keepdocker.return_value = True
+ stubs.fake_user_uuid = "zzzzz-tpzed-zzzzzzzzzzzzzzz"
+
+ stubs.api = mock.MagicMock()
+ stubs.api.users().current().execute.return_value = {
+ "uuid": stubs.fake_user_uuid,
+ }
+ stubs.api.collections().list().execute.return_value = {"items": []}
+ stubs.api.collections().create().execute.side_effect = ({
+ "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz1",
+ "portable_data_hash": "99999999999999999999999999999991+99",
+ }, {
+ "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2",
+ "portable_data_hash": "99999999999999999999999999999992+99",
+ })
+ stubs.expect_job_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+ stubs.api.jobs().create().execute.return_value = {
+ "uuid": stubs.expect_job_uuid,
+ "state": "Queued",
+ }
+ stubs.expect_pipeline_template_uuid = "zzzzz-d1hrv-zzzzzzzzzzzzzzz"
+ stubs.api.pipeline_templates().create().execute.return_value = {
+ "uuid": stubs.expect_pipeline_template_uuid,
+ }
+ stubs.expect_job_spec = {
+ 'runtime_constraints': {
+ 'docker_image': 'arvados/jobs'
+ },
+ 'script_parameters': {
+ 'x': {
+ 'path': '99999999999999999999999999999992+99/blorp.txt',
+ 'class': 'File'
+ },
+ 'cwl:tool':
+ '99999999999999999999999999999991+99/wf/submit_wf.cwl'
+ },
+ 'repository': 'arvados',
+ 'script_version': 'master',
+ 'script': 'cwl-runner'
+ }
+ return func(self, stubs, *args, **kwargs)
+ return wrapped
+
+
+class TestSubmit(unittest.TestCase):
+ @stubs
+ def test_submit(self, stubs):
+ capture_stdout = cStringIO.StringIO()
+ exited = arvados_cwl.main(
+ ["--submit", "--no-wait",
+ "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
+ capture_stdout, sys.stderr, api_client=stubs.api)
+ self.assertEqual(exited, 0)
+
+ stubs.api.collections().create.assert_has_calls([
+ mock.call(),
+ mock.call(body={
+ 'manifest_text':
+ './tool a3954c369b8924d40547ec8cf5f6a7f4+449 '
+ '0:16:blub.txt 16:433:submit_tool.cwl\n./wf '
+ 'e046cace0b1a0a6ee645f6ea8688f7e2+364 0:364:submit_wf.cwl\n',
+ 'owner_uuid': 'zzzzz-tpzed-zzzzzzzzzzzzzzz',
+ 'name': 'submit_wf.cwl',
+ }, ensure_unique_name=True),
+ mock.call().execute(),
+ mock.call(body={
+ 'manifest_text':
+ '. 979af1245a12a1fed634d4222473bfdc+16 0:16:blorp.txt\n',
+ 'owner_uuid': 'zzzzz-tpzed-zzzzzzzzzzzzzzz',
+ 'name': '#',
+ }, ensure_unique_name=True),
+ mock.call().execute()])
+
+ expect_job = copy.deepcopy(stubs.expect_job_spec)
+ expect_job["owner_uuid"] = stubs.fake_user_uuid
+ stubs.api.jobs().create.assert_called_with(
+ body=expect_job,
+ find_or_create=True)
+ self.assertEqual(capture_stdout.getvalue(),
+ stubs.expect_job_uuid + '\n')
+
+ @stubs
+ def test_submit_with_project_uuid(self, stubs):
+ project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
+
+ exited = arvados_cwl.main(
+ ["--submit", "--no-wait",
+ "--project-uuid", project_uuid,
+ "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
+ sys.stdout, sys.stderr, api_client=stubs.api)
+ self.assertEqual(exited, 0)
+
+ expect_body = copy.deepcopy(stubs.expect_job_spec)
+ expect_body["owner_uuid"] = project_uuid
+ stubs.api.jobs().create.assert_called_with(
+ body=expect_body,
+ find_or_create=True)
+
+
+class TestCreateTemplate(unittest.TestCase):
+ @stubs
+ def test_create(self, stubs):
+ project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
+
+ capture_stdout = cStringIO.StringIO()
+
+ exited = arvados_cwl.main(
+ ["--create-template", "--no-wait",
+ "--project-uuid", project_uuid,
+ "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
+ capture_stdout, sys.stderr, api_client=stubs.api)
+ self.assertEqual(exited, 0)
+
+ stubs.api.pipeline_instances().create.refute_called()
+ stubs.api.jobs().create.refute_called()
+
+ expect_component = copy.deepcopy(stubs.expect_job_spec)
+ expect_component['script_parameters']['x'] = {
+ 'dataclass': 'File',
+ 'required': True,
+ 'type': 'File',
+ 'value': '99999999999999999999999999999992+99/blorp.txt',
+ }
+ expect_template = {
+ "components": {
+ "submit_wf.cwl": expect_component,
+ },
+ "name": "submit_wf.cwl",
+ "owner_uuid": project_uuid,
+ }
+ stubs.api.pipeline_templates().create.assert_called_with(
+ body=JsonDiffMatcher(expect_template), ensure_unique_name=True)
+
+ self.assertEqual(capture_stdout.getvalue(),
+ stubs.expect_pipeline_template_uuid + '\n')
+
+
+class TestTemplateInputs(unittest.TestCase):
+ expect_template = {
+ "components": {
+ "inputs_test.cwl": {
+ 'runtime_constraints': {
+ 'docker_image': 'arvados/jobs',
+ },
+ 'script_parameters': {
+ 'cwl:tool':
+ '99999999999999999999999999999991+99/'
+ 'wf/inputs_test.cwl',
+ 'optionalFloatInput': None,
+ 'fileInput': {
+ 'type': 'File',
+ 'dataclass': 'File',
+ 'required': True,
+ 'title': "It's a file; we expect to find some characters in it.",
+ 'description': 'If there were anything further to say, it would be said here,\nor here.'
+ },
+ 'floatInput': {
+ 'type': 'float',
+ 'dataclass': 'number',
+ 'required': True,
+ 'title': 'Floats like a duck',
+ 'default': 0.1,
+ 'value': 0.1,
+ },
+ 'optionalFloatInput': {
+ 'type': ['null', 'float'],
+ 'dataclass': 'number',
+ 'required': False,
+ },
+ 'boolInput': {
+ 'type': 'boolean',
+ 'dataclass': 'boolean',
+ 'required': True,
+ 'title': 'True or false?',
+ },
+ },
+ 'repository': 'arvados',
+ 'script_version': 'master',
+ 'script': 'cwl-runner',
+ },
+ },
+ "name": "inputs_test.cwl",
+ }
+
+ @stubs
+ def test_inputs_empty(self, stubs):
+ exited = arvados_cwl.main(
+ ["--create-template", "--no-wait",
+ "tests/wf/inputs_test.cwl", "tests/order/empty_order.json"],
+ cStringIO.StringIO(), sys.stderr, api_client=stubs.api)
+ self.assertEqual(exited, 0)
+
+ expect_template = copy.deepcopy(self.expect_template)
+ expect_template["owner_uuid"] = stubs.fake_user_uuid
+
+ stubs.api.pipeline_templates().create.assert_called_with(
+ body=JsonDiffMatcher(expect_template), ensure_unique_name=True)
+
+ @stubs
+ def test_inputs(self, stubs):
+ exited = arvados_cwl.main(
+ ["--create-template", "--no-wait",
+ "tests/wf/inputs_test.cwl", "tests/order/inputs_test_order.json"],
+ cStringIO.StringIO(), sys.stderr, api_client=stubs.api)
+ self.assertEqual(exited, 0)
+
+ expect_template = copy.deepcopy(self.expect_template)
+ expect_template["owner_uuid"] = stubs.fake_user_uuid
+ params = expect_template[
+ "components"]["inputs_test.cwl"]["script_parameters"]
+ params["fileInput"]["value"] = '99999999999999999999999999999992+99/blorp.txt'
+ params["floatInput"]["value"] = 1.234
+ params["boolInput"]["value"] = True
+
+ stubs.api.pipeline_templates().create.assert_called_with(
+ body=JsonDiffMatcher(expect_template), ensure_unique_name=True)
--- /dev/null
+blibber blubber
--- /dev/null
+# Test case for arvados-cwl-runner
+#
+# Used to test whether scanning a tool file for dependencies (e.g. default
+# value blub.txt) and uploading to Keep works as intended.
+
+class: CommandLineTool
+cwlVersion: draft-3
+requirements:
+ - class: DockerRequirement
+ dockerPull: debian:8
+inputs:
+ - id: x
+ type: File
+ default:
+ class: File
+ path: blub.txt
+ inputBinding:
+ position: 1
+outputs: []
+baseCommand: cat
--- /dev/null
+# Test case for arvados-cwl-runner. Used to test propagation of
+# various input types as script_parameters in pipeline templates.
+
+class: Workflow
+cwlVersion: draft-3
+inputs:
+ - id: "#fileInput"
+ type: File
+ label: It's a file; we expect to find some characters in it.
+ description: |
+ If there were anything further to say, it would be said here,
+ or here.
+ - id: "#boolInput"
+ type: boolean
+ label: True or false?
+ - id: "#floatInput"
+ type: float
+ label: Floats like a duck
+ default: 0.1
+ - id: "#optionalFloatInput"
+ type: ["null", float]
+outputs: []
+steps:
+ - id: step1
+ inputs:
+ - { id: x, source: "#x" }
+ outputs: []
+ run: ../tool/submit_tool.cwl
--- /dev/null
+# Test case for arvados-cwl-runner
+#
+# Used to test whether scanning a workflow file for dependencies
+# (e.g. submit_tool.cwl) and uploading to Keep works as intended.
+
+class: Workflow
+cwlVersion: draft-3
+inputs:
+ - id: x
+ type: File
+outputs: []
+steps:
+ - id: step1
+ inputs:
+ - { id: x, source: "#x" }
+ outputs: []
+ run: ../tool/submit_tool.cwl
--- /dev/null
+package arvados
+
+import (
+ "crypto/tls"
+ "encoding/json"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "net/http"
+ "net/url"
+ "os"
+)
+
+// A Client is an HTTP client with an API endpoint and a set of
+// Arvados credentials.
+//
+// It offers methods for accessing individual Arvados APIs, and
+// methods that implement common patterns like fetching multiple pages
+// of results using List APIs.
+type Client struct {
+ // HTTP client used to make requests. If nil,
+ // http.DefaultClient or InsecureHTTPClient will be used.
+ Client *http.Client
+
+ // Hostname (or host:port) of Arvados API server.
+ APIHost string
+
+ // User authentication token.
+ AuthToken string
+
+ // Accept unverified certificates. This works only if the
+ // Client field is nil: otherwise, it has no effect.
+ Insecure bool
+}
+
+// The default http.Client used by a Client with Insecure==true and
+// Client==nil.
+var InsecureHTTPClient = &http.Client{
+ Transport: &http.Transport{
+ TLSClientConfig: &tls.Config{
+ InsecureSkipVerify: true}}}
+
+// NewClientFromEnv creates a new Client that uses the default HTTP
+// client with the API endpoint and credentials given by the
+// ARVADOS_API_* environment variables.
+func NewClientFromEnv() *Client {
+ return &Client{
+ APIHost: os.Getenv("ARVADOS_API_HOST"),
+ AuthToken: os.Getenv("ARVADOS_API_TOKEN"),
+ Insecure: os.Getenv("ARVADOS_API_HOST_INSECURE") != "",
+ }
+}
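+
+// For example (editor's sketch; CurrentUser in this package does the
+// equivalent):
+//
+//	c := NewClientFromEnv()
+//	var u User
+//	err := c.RequestAndDecode(&u, "GET", "arvados/v1/users/current", nil, nil)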
+
+// Do adds authentication headers and then calls (*http.Client)Do().
+func (c *Client) Do(req *http.Request) (*http.Response, error) {
+ if c.AuthToken != "" {
+ req.Header.Add("Authorization", "OAuth2 "+c.AuthToken)
+ }
+ return c.httpClient().Do(req)
+}
+
+// DoAndDecode performs req and unmarshals the response (which must be
+// JSON) into dst. Use this instead of RequestAndDecode if you need
+// more control of the http.Request object.
+func (c *Client) DoAndDecode(dst interface{}, req *http.Request) error {
+ resp, err := c.Do(req)
+ if err != nil {
+ return err
+ }
+ defer resp.Body.Close()
+ buf, err := ioutil.ReadAll(resp.Body)
+ if err != nil {
+ return err
+ }
+ if resp.StatusCode != 200 {
+ return fmt.Errorf("request failed (%s): %s", req.URL, resp.Status)
+ }
+ if dst == nil {
+ return nil
+ }
+ return json.Unmarshal(buf, dst)
+}
+
+// RequestAndDecode performs an API request and unmarshals the
+// response (which must be JSON) into dst. Method and body arguments
+// are the same as for http.NewRequest(). The given path is added to
+// the server's scheme/host/port to form the request URL. The given
+// params are passed via POST form or query string.
+//
+// path must not contain a query string.
+func (c *Client) RequestAndDecode(dst interface{}, method, path string, body io.Reader, params interface{}) error {
+ urlString := c.apiURL(path)
+ var urlValues url.Values
+ if v, ok := params.(url.Values); ok {
+ urlValues = v
+ } else if params != nil {
+		// Convert an arbitrary struct to url.Values. For
+		// example, Foo{Bar: []int{1,2,3}, Baz: "waz"} (with no
+		// json tags) becomes
+		// url.Values{"Bar": "[1,2,3]", "Baz": "waz"}
+ //
+ // TODO: Do this more efficiently, possibly using
+ // json.Decode/Encode, so the whole thing doesn't have
+ // to get encoded, decoded, and re-encoded.
+ j, err := json.Marshal(params)
+ if err != nil {
+ return err
+ }
+ var generic map[string]interface{}
+ err = json.Unmarshal(j, &generic)
+ if err != nil {
+ return err
+ }
+ urlValues = url.Values{}
+ for k, v := range generic {
+ if v, ok := v.(string); ok {
+ urlValues.Set(k, v)
+ continue
+ }
+ j, err := json.Marshal(v)
+ if err != nil {
+ return err
+ }
+ urlValues.Set(k, string(j))
+ }
+ }
+ if (method == "GET" || body != nil) && urlValues != nil {
+ // FIXME: what if params don't fit in URL
+ u, err := url.Parse(urlString)
+ if err != nil {
+ return err
+ }
+ u.RawQuery = urlValues.Encode()
+ urlString = u.String()
+ }
+ req, err := http.NewRequest(method, urlString, body)
+ if err != nil {
+ return err
+ }
+ return c.DoAndDecode(dst, req)
+}
+
+func (c *Client) httpClient() *http.Client {
+ switch {
+ case c.Client != nil:
+ return c.Client
+ case c.Insecure:
+ return InsecureHTTPClient
+ default:
+ return http.DefaultClient
+ }
+}
+
+func (c *Client) apiURL(path string) string {
+ return "https://" + c.APIHost + "/" + path
+}
+
+// DiscoveryDocument is the Arvados server's description of itself.
+type DiscoveryDocument struct {
+ DefaultCollectionReplication int `json:"defaultCollectionReplication"`
+ BlobSignatureTTL int64 `json:"blobSignatureTtl"`
+}
+
+// DiscoveryDocument returns a *DiscoveryDocument. The returned object
+// should not be modified: the same object may be returned by
+// subsequent calls.
+func (c *Client) DiscoveryDocument() (*DiscoveryDocument, error) {
+ var dd DiscoveryDocument
+ return &dd, c.RequestAndDecode(&dd, "GET", "discovery/v1/apis/arvados/v1/rest", nil, nil)
+}
--- /dev/null
+package arvados
+
+import (
+ "bytes"
+ "fmt"
+ "io/ioutil"
+ "net/http"
+ "sync"
+ "testing"
+)
+
+type stubTransport struct {
+ Responses map[string]string
+ Requests []http.Request
+ sync.Mutex
+}
+
+func (stub *stubTransport) RoundTrip(req *http.Request) (*http.Response, error) {
+ stub.Lock()
+ stub.Requests = append(stub.Requests, *req)
+ stub.Unlock()
+
+ resp := &http.Response{
+ Status: "200 OK",
+ StatusCode: 200,
+ Proto: "HTTP/1.1",
+ ProtoMajor: 1,
+ ProtoMinor: 1,
+ Request: req,
+ }
+ str := stub.Responses[req.URL.Path]
+ if str == "" {
+ resp.Status = "404 Not Found"
+ resp.StatusCode = 404
+ str = "{}"
+ }
+ buf := bytes.NewBufferString(str)
+ resp.Body = ioutil.NopCloser(buf)
+ resp.ContentLength = int64(buf.Len())
+ return resp, nil
+}
+
+type errorTransport struct{}
+
+func (stub *errorTransport) RoundTrip(req *http.Request) (*http.Response, error) {
+ return nil, fmt.Errorf("something awful happened")
+}
+
+func TestCurrentUser(t *testing.T) {
+ t.Parallel()
+ stub := &stubTransport{
+ Responses: map[string]string{
+ "/arvados/v1/users/current": `{"uuid":"zzzzz-abcde-012340123401234"}`,
+ },
+ }
+ c := &Client{
+ Client: &http.Client{
+ Transport: stub,
+ },
+ APIHost: "zzzzz.arvadosapi.com",
+ AuthToken: "xyzzy",
+ }
+ u, err := c.CurrentUser()
+ if err != nil {
+ t.Fatal(err)
+ }
+ if x := "zzzzz-abcde-012340123401234"; u.UUID != x {
+ t.Errorf("got uuid %q, expected %q", u.UUID, x)
+ }
+ if len(stub.Requests) < 1 {
+ t.Fatal("empty stub.Requests")
+ }
+ hdr := stub.Requests[len(stub.Requests)-1].Header
+ if hdr.Get("Authorization") != "OAuth2 xyzzy" {
+ t.Errorf("got headers %+q, expected Authorization header", hdr)
+ }
+
+ c.Client.Transport = &errorTransport{}
+ u, err = c.CurrentUser()
+ if err == nil {
+ t.Errorf("got nil error, expected something awful")
+ }
+}
--- /dev/null
+package arvados
+
+import (
+ "bufio"
+ "fmt"
+ "strings"
+ "time"
+
+ "git.curoverse.com/arvados.git/sdk/go/manifest"
+)
+
+// Collection is an arvados#collection resource.
+type Collection struct {
+ UUID string `json:"uuid,omitempty"`
+ ExpiresAt *time.Time `json:"expires_at,omitempty"`
+ ManifestText string `json:"manifest_text,omitempty"`
+ CreatedAt *time.Time `json:"created_at,omitempty"`
+ ModifiedAt *time.Time `json:"modified_at,omitempty"`
+ PortableDataHash string `json:"portable_data_hash,omitempty"`
+ ReplicationConfirmed *int `json:"replication_confirmed,omitempty"`
+ ReplicationConfirmedAt *time.Time `json:"replication_confirmed_at,omitempty"`
+ ReplicationDesired *int `json:"replication_desired,omitempty"`
+}
+
+// SizedDigests returns the hash+size part of each data block
+// referenced by the collection.
+func (c *Collection) SizedDigests() ([]SizedDigest, error) {
+ if c.ManifestText == "" && c.PortableDataHash != "d41d8cd98f00b204e9800998ecf8427e+0" {
+ // TODO: Check more subtle forms of corruption, too
+ return nil, fmt.Errorf("manifest is missing")
+ }
+ var sds []SizedDigest
+ scanner := bufio.NewScanner(strings.NewReader(c.ManifestText))
+ scanner.Buffer(make([]byte, 1048576), len(c.ManifestText))
+ for scanner.Scan() {
+ line := scanner.Text()
+ tokens := strings.Split(line, " ")
+ if len(tokens) < 3 {
+ return nil, fmt.Errorf("Invalid stream (<3 tokens): %q", line)
+ }
+ for _, token := range tokens[1:] {
+ if !manifest.LocatorPattern.MatchString(token) {
+ // FIXME: ensure it's a file token
+ break
+ }
+ // FIXME: shouldn't assume 32 char hash
+ if i := strings.IndexRune(token[33:], '+'); i >= 0 {
+ token = token[:33+i]
+ }
+ sds = append(sds, SizedDigest(token))
+ }
+ }
+ return sds, scanner.Err()
+}
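+
+// For example (editor's sketch), a manifest line such as
+//
+//	. 979af1245a12a1fed634d4222473bfdc+16 0:16:blorp.txt
+//
+// yields the single SizedDigest "979af1245a12a1fed634d4222473bfdc+16".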
+
+// CollectionList is an arvados#collectionList resource.
+type CollectionList struct {
+ Items []Collection `json:"items"`
+ ItemsAvailable int `json:"items_available"`
+ Offset int `json:"offset"`
+ Limit int `json:"limit"`
+}
--- /dev/null
+// Package arvados is a client library for Arvados.
+//
+// The API is not stable: it should be considered experimental
+// pre-release.
+//
+// The intent is to offer model types and API call functions that can
+// be generated automatically (or at least mostly automatically) from
+// a discovery document. For the time being, there is a manually
+// generated subset of those types and API calls with (approximately)
+// the right signatures, plus client/authentication support and some
+// convenience functions.
+package arvados
--- /dev/null
+package arvados
+
+import (
+ "encoding/json"
+ "fmt"
+ "time"
+)
+
+// Duration is time.Duration but looks like "12s" in JSON, rather than
+// a number of nanoseconds.
+type Duration time.Duration
+
+// UnmarshalJSON implements json.Unmarshaler
+func (d *Duration) UnmarshalJSON(data []byte) error {
+ if data[0] == '"' {
+ dur, err := time.ParseDuration(string(data[1 : len(data)-1]))
+ *d = Duration(dur)
+ return err
+ }
+ return fmt.Errorf("duration must be given as a string like \"600s\" or \"1h30m\"")
+}
+
+// MarshalJSON implements json.Marshaler
+func (d *Duration) MarshalJSON() ([]byte, error) {
+ return json.Marshal(d.String())
+}
+
+// String implements fmt.Stringer
+func (d Duration) String() string {
+ return time.Duration(d).String()
+}
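+
+// For example (editor's sketch), the JSON string "1h30m" unmarshals to a
+// Duration of 90 minutes, which marshals back to "1h30m0s".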
--- /dev/null
+package arvados
+
+import (
+ "strconv"
+ "strings"
+)
+
+// SizedDigest is a minimal Keep block locator: hash+size
+type SizedDigest string
+
+// Size returns the size of the data block, in bytes.
+func (sd SizedDigest) Size() int64 {
+ n, _ := strconv.ParseInt(strings.Split(string(sd), "+")[1], 10, 64)
+ return n
+}
--- /dev/null
+package arvados
+
+import (
+ "bufio"
+ "fmt"
+ "net/http"
+ "strconv"
+ "strings"
+)
+
+// KeepService is an arvados#keepService record
+type KeepService struct {
+ UUID string `json:"uuid"`
+ ServiceHost string `json:"service_host"`
+ ServicePort int `json:"service_port"`
+ ServiceSSLFlag bool `json:"service_ssl_flag"`
+ ServiceType string `json:"service_type"`
+ ReadOnly bool `json:"read_only"`
+}
+
+// KeepServiceList is an arvados#keepServiceList record
+type KeepServiceList struct {
+ Items []KeepService `json:"items"`
+ ItemsAvailable int `json:"items_available"`
+ Offset int `json:"offset"`
+ Limit int `json:"limit"`
+}
+
+// KeepServiceIndexEntry is what a keep service's index response tells
+// us about a stored block.
+type KeepServiceIndexEntry struct {
+ SizedDigest
+ Mtime int64
+}
+
+// EachKeepService calls f once for every readable
+// KeepService. EachKeepService stops if it encounters an
+// error, such as f returning a non-nil error.
+func (c *Client) EachKeepService(f func(KeepService) error) error {
+ params := ResourceListParams{}
+ for {
+ var page KeepServiceList
+ err := c.RequestAndDecode(&page, "GET", "arvados/v1/keep_services", nil, params)
+ if err != nil {
+ return err
+ }
+ for _, item := range page.Items {
+ err = f(item)
+ if err != nil {
+ return err
+ }
+ }
+ params.Offset = params.Offset + len(page.Items)
+ if params.Offset >= page.ItemsAvailable {
+ return nil
+ }
+ }
+}
+
+func (s *KeepService) url(path string) string {
+ var f string
+ if s.ServiceSSLFlag {
+ f = "https://%s:%d/%s"
+ } else {
+ f = "http://%s:%d/%s"
+ }
+ return fmt.Sprintf(f, s.ServiceHost, s.ServicePort, path)
+}
+
+// String implements fmt.Stringer
+func (s *KeepService) String() string {
+ return s.UUID
+}
+
+// Index returns an unsorted list of blocks that can be retrieved from
+// this server.
+func (s *KeepService) Index(c *Client, prefix string) ([]KeepServiceIndexEntry, error) {
+ url := s.url("index/" + prefix)
+ req, err := http.NewRequest("GET", url, nil)
+ if err != nil {
+ return nil, fmt.Errorf("NewRequest(%v): %v", url, err)
+ }
+ resp, err := c.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("Do(%v): %v", url, err)
+ } else if resp.StatusCode != 200 {
+ return nil, fmt.Errorf("%v: %v", url, resp.Status)
+ }
+ defer resp.Body.Close()
+
+ var entries []KeepServiceIndexEntry
+ scanner := bufio.NewScanner(resp.Body)
+ sawEOF := false
+ for scanner.Scan() {
+ if sawEOF {
+ return nil, fmt.Errorf("Index response contained non-terminal blank line")
+ }
+ line := scanner.Text()
+ if line == "" {
+ sawEOF = true
+ continue
+ }
+ fields := strings.Split(line, " ")
+ if len(fields) != 2 {
+ return nil, fmt.Errorf("Malformed index line %q: %d fields", line, len(fields))
+ }
+ mtime, err := strconv.ParseInt(fields[1], 10, 64)
+ if err != nil {
+ return nil, fmt.Errorf("Malformed index line %q: mtime: %v", line, err)
+ }
+ entries = append(entries, KeepServiceIndexEntry{
+ SizedDigest: SizedDigest(fields[0]),
+ Mtime: mtime,
+ })
+ }
+ if err := scanner.Err(); err != nil {
+ return nil, fmt.Errorf("Error scanning index response: %v", err)
+ }
+ if !sawEOF {
+ return nil, fmt.Errorf("Index response had no EOF marker")
+ }
+ return entries, nil
+}
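+
+// Each index line is expected to look like (editor's sketch):
+//
+//	acbd18db4cc2f85cedef654fccc4a4d8+3 1396967020
+//
+// i.e. a sized digest and an integer mtime, with a trailing blank line
+// marking the end of the response.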
--- /dev/null
+package arvados
+
+import "encoding/json"
+
+// ResourceListParams expresses which results are requested in a
+// list/index API.
+type ResourceListParams struct {
+ Select []string `json:"select,omitempty"`
+ Filters []Filter `json:"filters,omitempty"`
+ Limit *int `json:"limit,omitempty"`
+ Offset int `json:"offset,omitempty"`
+ Order string `json:"order,omitempty"`
+}
+
+// A Filter restricts the set of records returned by a list/index API.
+type Filter struct {
+ Attr string
+ Operator string
+ Operand interface{}
+}
+
+// MarshalJSON encodes a Filter in the form expected by the API.
+func (f *Filter) MarshalJSON() ([]byte, error) {
+ return json.Marshal([]interface{}{f.Attr, f.Operator, f.Operand})
+}
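+
+// For example (editor's sketch, operand illustrative):
+//
+//	Filter{Attr: "modified_at", Operator: ">", Operand: "2016-01-01T00:00:00Z"}
+//
+// marshals to ["modified_at",">","2016-01-01T00:00:00Z"].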
--- /dev/null
+package arvados
+
+import (
+ "bytes"
+ "encoding/json"
+ "testing"
+ "time"
+)
+
+func TestMarshalFiltersWithNanoseconds(t *testing.T) {
+ t0 := time.Now()
+ t0str := t0.Format(time.RFC3339Nano)
+ buf, err := json.Marshal([]Filter{
+ {Attr: "modified_at", Operator: "=", Operand: t0}})
+ if err != nil {
+ t.Fatal(err)
+ }
+ if expect := []byte(`[["modified_at","=","` + t0str + `"]]`); 0 != bytes.Compare(buf, expect) {
+ t.Errorf("Encoded as %q, expected %q", buf, expect)
+ }
+}
--- /dev/null
+package arvados
+
+// User is an arvados#user record
+type User struct {
+ UUID string `json:"uuid,omitempty"`
+ IsActive bool `json:"is_active"`
+ IsAdmin bool `json:"is_admin"`
+ Username string `json:"username,omitempty"`
+}
+
+// CurrentUser calls arvados.v1.users.current, and returns the User
+// record corresponding to this client's credentials.
+func (c *Client) CurrentUser() (User, error) {
+ var u User
+ err := c.RequestAndDecode(&u, "GET", "arvados/v1/users/current", nil, nil)
+ return u, err
+}
// Returns a non-nil error if an error occurs making the API call, the
// API responds with a non-successful HTTP status, or an error occurs
// parsing the response body.
-func (c ArvadosClient) Call(method string, resourceType string, uuid string, action string, parameters Dict, output interface{}) error {
+func (c ArvadosClient) Call(method, resourceType, uuid, action string, parameters Dict, output interface{}) error {
reader, err := c.CallRaw(method, resourceType, uuid, action, parameters)
if reader != nil {
defer reader.Close()
FooBarDirCollection = "zzzzz-4zz18-foonbarfilesdir"
FooPdh = "1f4b0bc7583c2a7f9102c395f4ffc5e3+45"
HelloWorldPdh = "55713e6a34081eb03609e7ad5fcad129+62"
+
+ Dispatch1Token = "kwi8oowusvbutahacwk2geulqewy5oaqmpalczfna4b6bb0hfw"
+ Dispatch1AuthUUID = "zzzzz-gj3su-k9dvestay1plssr"
)
// A valid manifest designed to test various edge cases and parsing
}
MD5CollisionMD5 = "cee9a457e790cf20d4bdaa6d69f01e41"
)
+
+const BlobSigningKey = "zfhgfenhffzltr9dixws36j1yhksjoll2grmku38mi7yxd66h5j4q9w4jzanezacp8s6q0ro3hxakfye02152hncy6zml2ed0uc"
"os"
"os/exec"
"os/signal"
- "path"
"strings"
"syscall"
)
cmd.Stdout = os.Stdout
}
+ cmd.Stderr = os.Stderr
+
if taskp.Env != nil {
// Set up subprocess environment
cmd.Env = os.Environ()
log.Fatal(err)
}
- certpath := path.Join(path.Dir(os.Args[0]), "ca-certificates.crt")
- certdata, err := ioutil.ReadFile(certpath)
- if err == nil {
- log.Printf("Using TLS certificates at %v", certpath)
- certs := x509.NewCertPool()
- certs.AppendCertsFromPEM(certdata)
- api.Client.Transport.(*http.Transport).TLSClientConfig.RootCAs = certs
+ // Container may not have certificates installed, so need to look for
+ // /etc/arvados/ca-certificates.crt in addition to normal system certs.
+ var certFiles = []string{
+ "/etc/ssl/certs/ca-certificates.crt", // Debian
+ "/etc/pki/tls/certs/ca-bundle.crt", // Red Hat
+ "/etc/arvados/ca-certificates.crt",
+ }
+
+ certs := x509.NewCertPool()
+ for _, file := range certFiles {
+ data, err := ioutil.ReadFile(file)
+ if err == nil {
+ log.Printf("Using TLS certificates at %v", file)
+ certs.AppendCertsFromPEM(data)
+ }
}
+ api.Client.Transport.(*http.Transport).TLSClientConfig.RootCAs = certs
jobUuid := os.Getenv("JOB_UUID")
taskUuid := os.Getenv("TASK_UUID")
--- /dev/null
+// Package dispatch provides a framework for monitoring the Arvados
+// container queue, locking container records, and running goroutine
+// callbacks which implement execution and monitoring of the containers.
+package dispatch
+
+import (
+ "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+ "log"
+ "os"
+ "os/signal"
+ "sync"
+ "syscall"
+ "time"
+)
+
+// Constants for container states
+const (
+ Queued = "Queued"
+ Locked = "Locked"
+ Running = "Running"
+ Complete = "Complete"
+ Cancelled = "Cancelled"
+)
+
+type apiClientAuthorization struct {
+ UUID string `json:"uuid"`
+ APIToken string `json:"api_token"`
+}
+
+type apiClientAuthorizationList struct {
+ Items []apiClientAuthorization `json:"items"`
+}
+
+// Container represents an Arvados container record
+type Container struct {
+ UUID string `json:"uuid"`
+ State string `json:"state"`
+ Priority int `json:"priority"`
+ RuntimeConstraints map[string]int64 `json:"runtime_constraints"`
+ LockedByUUID string `json:"locked_by_uuid"`
+}
+
+// ContainerList is a list of containers returned from the API server
+type ContainerList struct {
+ Items []Container `json:"items"`
+ ItemsAvailable int `json:"items_available"`
+}
+
+// Dispatcher holds the state of the dispatcher
+type Dispatcher struct {
+ // The Arvados client
+ Arv arvadosclient.ArvadosClient
+
+ // When a new queued container appears and is either already owned by
+ // this dispatcher or is successfully locked, the dispatcher will call
+ // go RunContainer(). The RunContainer() goroutine gets a channel over
+ // which it will receive updates to the container state. The
+ // RunContainer() goroutine should only assume status updates come when
+ // the container record changes on the API server; if it needs to
+ // monitor the job submission to the underlying slurm/grid engine/etc
+ // queue it should spin up its own polling goroutines. When the
+ // channel is closed, that means the container is no longer being
+ // handled by this dispatcher and the goroutine should terminate. The
+ // goroutine is responsible for draining the 'status' channel, failure
+ // to do so may deadlock the dispatcher.
+ RunContainer func(*Dispatcher, Container, chan Container)
+
+ // Amount of time to wait between polling for updates.
+ PollInterval time.Duration
+
+ // Channel used to signal that RunDispatcher loop should exit.
+ DoneProcessing chan struct{}
+
+ mineMutex sync.Mutex
+ mineMap map[string]chan Container
+ Auth apiClientAuthorization
+ containers chan Container
+}
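To make the RunContainer contract above concrete, here is a minimal illustrative sketch (not part of this patch). submitWork and cancelWork are hypothetical stand-ins for whatever backend actually executes the container, and a real dispatcher's state transitions may differ:

func exampleRunContainer(d *Dispatcher, c Container, status chan Container) {
    if c.State == Locked {
        // submitWork(c) // hypothetical: hand the container to the execution backend
        d.UpdateState(c.UUID, Running)
    }
    // Drain the status channel until the dispatcher closes it; failing to
    // drain it can deadlock the dispatcher.
    for update := range status {
        if update.Priority == 0 {
            // cancelWork(update.UUID) // hypothetical: the container was cancelled
            d.UpdateState(update.UUID, Cancelled)
        }
    }
    // Channel closed: this dispatcher no longer owns the container.
}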
+
+// setMine atomically adds uuid to the set of "my" containers, i.e., ones this
+// process is actively starting/monitoring. It returns the channel used to
+// send container status updates for that container.
+func (dispatcher *Dispatcher) setMine(uuid string) chan Container {
+ dispatcher.mineMutex.Lock()
+ defer dispatcher.mineMutex.Unlock()
+ if ch, ok := dispatcher.mineMap[uuid]; ok {
+ return ch
+ }
+
+ ch := make(chan Container)
+ dispatcher.mineMap[uuid] = ch
+ return ch
+}
+
+// Release a container which is no longer being monitored.
+func (dispatcher *Dispatcher) notMine(uuid string) {
+ dispatcher.mineMutex.Lock()
+ defer dispatcher.mineMutex.Unlock()
+ if ch, ok := dispatcher.mineMap[uuid]; ok {
+ close(ch)
+ delete(dispatcher.mineMap, uuid)
+ }
+}
+
+// Check whether there is a channel for updates associated with this
+// container. If so, send the container record on the channel and return
+// true; otherwise return false.
+func (dispatcher *Dispatcher) updateMine(c Container) bool {
+ dispatcher.mineMutex.Lock()
+ defer dispatcher.mineMutex.Unlock()
+ ch, ok := dispatcher.mineMap[c.UUID]
+ if ok {
+ ch <- c
+ return true
+ }
+ return false
+}
+
+func (dispatcher *Dispatcher) getContainers(params arvadosclient.Dict, touched map[string]bool) {
+ var containers ContainerList
+ err := dispatcher.Arv.List("containers", params, &containers)
+ if err != nil {
+ log.Printf("Error getting list of containers: %q", err)
+ return
+ }
+
+ if containers.ItemsAvailable > len(containers.Items) {
+ // TODO: support paging
+		log.Printf("Warning! %d containers are available but only %d were received; paged requests are not yet supported, so some containers may be ignored.",
+ containers.ItemsAvailable,
+ len(containers.Items))
+ }
+ for _, container := range containers.Items {
+ touched[container.UUID] = true
+ dispatcher.containers <- container
+ }
+}
+
+func (dispatcher *Dispatcher) pollContainers() {
+ ticker := time.NewTicker(dispatcher.PollInterval)
+
+ paramsQ := arvadosclient.Dict{
+ "filters": [][]interface{}{{"state", "=", "Queued"}, {"priority", ">", "0"}},
+ "order": []string{"priority desc"},
+ "limit": "1000"}
+ paramsP := arvadosclient.Dict{
+ "filters": [][]interface{}{{"locked_by_uuid", "=", dispatcher.Auth.UUID}},
+ "limit": "1000"}
+
+ for {
+ select {
+ case <-ticker.C:
+ touched := make(map[string]bool)
+ dispatcher.getContainers(paramsQ, touched)
+ dispatcher.getContainers(paramsP, touched)
+ dispatcher.mineMutex.Lock()
+ var monitored []string
+ for k := range dispatcher.mineMap {
+ if _, ok := touched[k]; !ok {
+ monitored = append(monitored, k)
+ }
+ }
+ dispatcher.mineMutex.Unlock()
+ if monitored != nil {
+ dispatcher.getContainers(arvadosclient.Dict{
+ "filters": [][]interface{}{{"uuid", "in", monitored}}}, touched)
+ }
+ case <-dispatcher.DoneProcessing:
+ close(dispatcher.containers)
+ ticker.Stop()
+ return
+ }
+ }
+}
+
+func (dispatcher *Dispatcher) handleUpdate(container Container) {
+ if container.LockedByUUID != dispatcher.Auth.UUID && container.State != Queued {
+		// If the container is Complete, Cancelled, or Queued, LockedByUUID
+		// will be empty. If the container was formerly Locked, moved
+ // back to Queued and then locked by another dispatcher,
+ // LockedByUUID will be different. In either case, we want
+ // to stop monitoring it.
+ log.Printf("Container %v now in state %q with locked_by_uuid %q", container.UUID, container.State, container.LockedByUUID)
+ dispatcher.notMine(container.UUID)
+ return
+ }
+
+ if dispatcher.updateMine(container) {
+ // Already monitored, sent status update
+ return
+ }
+
+ if container.State == Queued {
+ // Try to take the lock
+ if err := dispatcher.UpdateState(container.UUID, Locked); err != nil {
+ return
+ }
+ container.State = Locked
+ }
+
+ if container.State == Locked || container.State == Running {
+ // Not currently monitored but in Locked or Running state and
+ // owned by this dispatcher, so start monitoring.
+ go dispatcher.RunContainer(dispatcher, container, dispatcher.setMine(container.UUID))
+ }
+}
+
+// UpdateState makes an API call to change the state of a container.
+func (dispatcher *Dispatcher) UpdateState(uuid, newState string) error {
+ err := dispatcher.Arv.Update("containers", uuid,
+ arvadosclient.Dict{
+ "container": arvadosclient.Dict{"state": newState}},
+ nil)
+ if err != nil {
+ log.Printf("Error updating container %s to state %q: %q", uuid, newState, err)
+ }
+ return err
+}
+
+// RunDispatcher runs the main loop of the dispatcher until receiving a message
+// on the dispatcher.DoneProcessing channel. It also installs a signal handler
+// to terminate gracefully on SIGINT, SIGTERM or SIGQUIT.
+func (dispatcher *Dispatcher) RunDispatcher() (err error) {
+ err = dispatcher.Arv.Call("GET", "api_client_authorizations", "", "current", nil, &dispatcher.Auth)
+ if err != nil {
+ log.Printf("Error getting my token UUID: %v", err)
+ return
+ }
+
+ dispatcher.mineMap = make(map[string]chan Container)
+ dispatcher.containers = make(chan Container)
+
+ // Graceful shutdown on signal
+ sigChan := make(chan os.Signal)
+ signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
+
+ go func(sig <-chan os.Signal) {
+ for sig := range sig {
+ log.Printf("Caught signal: %v", sig)
+ dispatcher.DoneProcessing <- struct{}{}
+ }
+ }(sigChan)
+
+ defer close(sigChan)
+ defer signal.Stop(sigChan)
+
+ go dispatcher.pollContainers()
+ for container := range dispatcher.containers {
+ dispatcher.handleUpdate(container)
+ }
+
+ return nil
+}
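For illustration only, a dispatcher built on this framework might be wired up roughly as follows; the poll interval is arbitrary and the callback is a do-nothing placeholder (see the fuller sketch above):

func exampleMain() error {
    arv, err := arvadosclient.MakeArvadosClient()
    if err != nil {
        return err
    }
    dispatcher := Dispatcher{
        Arv:          arv,
        PollInterval: 10 * time.Second,
        RunContainer: func(d *Dispatcher, c Container, status chan Container) {
            for range status {
                // Drain updates; a real callback acts on them.
            }
        },
        DoneProcessing: make(chan struct{}),
    }
    return dispatcher.RunDispatcher()
}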
--- /dev/null
+package httpserver
+
+import (
+ "net/http"
+)
+
+type limiterHandler struct {
+ requests chan struct{}
+ handler http.Handler
+}
+
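+// NewRequestLimiter returns an http.Handler that passes at most
+// maxRequests concurrent requests through to handler, and responds 503
+// Service Unavailable to any request that arrives while maxRequests
+// requests are already in progress.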
+func NewRequestLimiter(maxRequests int, handler http.Handler) http.Handler {
+ return &limiterHandler{
+ requests: make(chan struct{}, maxRequests),
+ handler: handler,
+ }
+}
+
+func (h *limiterHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+ select {
+ case h.requests <- struct{}{}:
+ default:
+ // reached max requests
+ resp.WriteHeader(http.StatusServiceUnavailable)
+ return
+ }
+ h.handler.ServeHTTP(resp, req)
+ <-h.requests
+}
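As a usage illustration (not part of this patch), a server might cap concurrency like this; the listen address and the limit of 100 are arbitrary:

func exampleListen(mux http.Handler) error {
    // At most 100 requests are handled concurrently; the rest get 503.
    return http.ListenAndServe(":8080", NewRequestLimiter(100, mux))
}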
--- /dev/null
+package httpserver
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "sync"
+ "testing"
+ "time"
+)
+
+type testHandler struct {
+ inHandler chan struct{}
+ okToProceed chan struct{}
+}
+
+func (h *testHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+ h.inHandler <- struct{}{}
+ <-h.okToProceed
+}
+
+func newTestHandler(maxReqs int) *testHandler {
+ return &testHandler{
+ inHandler: make(chan struct{}),
+ okToProceed: make(chan struct{}),
+ }
+}
+
+func TestRequestLimiter1(t *testing.T) {
+ h := newTestHandler(10)
+ l := NewRequestLimiter(1, h)
+ var wg sync.WaitGroup
+ resps := make([]*httptest.ResponseRecorder, 10)
+ for i := 0; i < 10; i++ {
+ wg.Add(1)
+ resps[i] = httptest.NewRecorder()
+ go func(i int) {
+ l.ServeHTTP(resps[i], &http.Request{})
+ wg.Done()
+ }(i)
+ }
+ done := make(chan struct{})
+ go func() {
+ // Make sure one request has entered the handler
+ <-h.inHandler
+ // Make sure all unsuccessful requests finish (but don't wait
+ // for the one that's still waiting for okToProceed)
+ wg.Add(-1)
+ wg.Wait()
+ // Wait for the last goroutine
+ wg.Add(1)
+ h.okToProceed <- struct{}{}
+ wg.Wait()
+ done <- struct{}{}
+ }()
+ select {
+ case <-done:
+ case <-time.After(10 * time.Second):
+ t.Fatal("test timed out, probably deadlocked")
+ }
+ n200 := 0
+ n503 := 0
+ for i := 0; i < 10; i++ {
+ switch resps[i].Code {
+ case 200:
+ n200++
+ case 503:
+ n503++
+ default:
+ t.Fatalf("Unexpected response code %d", resps[i].Code)
+ }
+ }
+ if n200 != 1 || n503 != 9 {
+ t.Fatalf("Got %d 200 responses, %d 503 responses (expected 1, 9)", n200, n503)
+ }
+ // Now that all 10 are finished, an 11th request should
+ // succeed.
+ go func() {
+ <-h.inHandler
+ h.okToProceed <- struct{}{}
+ }()
+ resp := httptest.NewRecorder()
+ l.ServeHTTP(resp, &http.Request{})
+ if resp.Code != 200 {
+ t.Errorf("Got status %d on 11th request, want 200", resp.Code)
+ }
+}
+
+func TestRequestLimiter10(t *testing.T) {
+ h := newTestHandler(10)
+ l := NewRequestLimiter(10, h)
+ var wg sync.WaitGroup
+ for i := 0; i < 10; i++ {
+ wg.Add(1)
+ go func() {
+ l.ServeHTTP(httptest.NewRecorder(), &http.Request{})
+ wg.Done()
+ }()
+ // Make sure the handler starts before we initiate the
+ // next request, but don't let it finish yet.
+ <-h.inHandler
+ }
+ for i := 0; i < 10; i++ {
+ h.okToProceed <- struct{}{}
+ }
+ wg.Wait()
+}
// error.
type ResponseWriter struct {
http.ResponseWriter
- wroteStatus *int // Last status given to WriteHeader()
- wroteBodyBytes *int // Bytes successfully written
- err *error // Last error returned from Write()
+ wroteStatus *int // Last status given to WriteHeader()
+ wroteBodyBytes *int // Bytes successfully written
+ err *error // Last error returned from Write()
}
func WrapResponseWriter(orig http.ResponseWriter) ResponseWriter {
}
var buf = make([]byte, fs.Offset+fs.Len)
_, err = io.ReadFull(rdr, buf)
+ errClosing := rdr.Close()
+ if err == nil {
+ err = errClosing
+ }
if err != nil {
r.err = err
close(r.errNotNil)
c.Check(err, check.NotNil)
c.Check(err, check.Not(check.Equals), io.EOF)
}
+ c.Check(rdr.Close(), check.NotNil)
}
// makePermSignature generates a SHA-1 HMAC digest for the given blob,
// token, expiry, and site secret.
-func makePermSignature(blobHash, apiToken, expiry string, permissionSecret []byte) string {
+func makePermSignature(blobHash, apiToken, expiry, blobSignatureTTL string, permissionSecret []byte) string {
hmac := hmac.New(sha1.New, permissionSecret)
hmac.Write([]byte(blobHash))
hmac.Write([]byte("@"))
hmac.Write([]byte(apiToken))
hmac.Write([]byte("@"))
hmac.Write([]byte(expiry))
+ hmac.Write([]byte("@"))
+ hmac.Write([]byte(blobSignatureTTL))
digest := hmac.Sum(nil)
return fmt.Sprintf("%x", digest)
}
//
// This function is intended to be used by system components and admin
// utilities: userland programs do not know the permissionSecret.
-func SignLocator(blobLocator, apiToken string, expiry time.Time, permissionSecret []byte) string {
+func SignLocator(blobLocator, apiToken string, expiry time.Time, blobSignatureTTL time.Duration, permissionSecret []byte) string {
if len(permissionSecret) == 0 || apiToken == "" {
return blobLocator
}
// Strip off all hints: only the hash is used to sign.
blobHash := strings.Split(blobLocator, "+")[0]
timestampHex := fmt.Sprintf("%08x", expiry.Unix())
+ blobSignatureTTLHex := strconv.FormatInt(int64(blobSignatureTTL.Seconds()), 16)
return blobLocator +
- "+A" + makePermSignature(blobHash, apiToken, timestampHex, permissionSecret) +
+ "+A" + makePermSignature(blobHash, apiToken, timestampHex, blobSignatureTTLHex, permissionSecret) +
"@" + timestampHex
}
//
// This function is intended to be used by system components and admin
// utilities: userland programs do not know the permissionSecret.
-func VerifySignature(signedLocator, apiToken string, permissionSecret []byte) error {
+func VerifySignature(signedLocator, apiToken string, blobSignatureTTL time.Duration, permissionSecret []byte) error {
matches := signedLocatorRe.FindStringSubmatch(signedLocator)
if matches == nil {
return ErrSignatureMissing
} else if expiryTime.Before(time.Now()) {
return ErrSignatureExpired
}
- if signatureHex != makePermSignature(blobHash, apiToken, expiryHex, permissionSecret) {
+ blobSignatureTTLHex := strconv.FormatInt(int64(blobSignatureTTL.Seconds()), 16)
+ if signatureHex != makePermSignature(blobHash, apiToken, expiryHex, blobSignatureTTLHex, permissionSecret) {
return ErrSignatureInvalid
}
return nil
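For illustration only, signing and then verifying a locator with the new TTL-aware signatures looks roughly like this; the hash, token, secret, and TTL values are made up:

func exampleSignAndVerify() error {
    secret := []byte("example-blob-signing-key")
    ttl := 1209600 * time.Second // e.g. two weeks
    signed := SignLocator("acbd18db4cc2f85cedef654fccc4a4d8+3",
        "example-token", time.Now().Add(ttl), ttl, secret)
    // signed now has the form "<hash>+3+A<hmac>@<hex expiry timestamp>"
    return VerifySignature(signed, "example-token", ttl, secret)
}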
"gokee3eamvjy8qq1fvy238838enjmy5wzy2md7yvsitp5vztft6j4q866efym7e6" +
"vu5wm9fpnwjyxfldw3vbo01mgjs75rgo7qioh8z8ij7jpyp8508okhgbbex3ceei" +
"786u5rw2a9gx743dj3fgq2irk"
- knownSignature = "257f3f5f5f0a4e4626a18fc74bd42ec34dcb228a"
+ knownSignature = "89118b78732c33104a4d6231e8b5a5fa1e4301e3"
knownTimestamp = "7fffffff"
knownSigHint = "+A" + knownSignature + "@" + knownTimestamp
knownSignedLocator = knownLocator + knownSigHint
+ blobSignatureTTL = 1209600 * time.Second
)
func TestSignLocator(t *testing.T) {
if ts, err := parseHexTimestamp(knownTimestamp); err != nil {
t.Errorf("bad knownTimestamp %s", knownTimestamp)
} else {
- if knownSignedLocator != SignLocator(knownLocator, knownToken, ts, []byte(knownKey)) {
+ if knownSignedLocator != SignLocator(knownLocator, knownToken, ts, blobSignatureTTL, []byte(knownKey)) {
t.Fail()
}
}
}
func TestVerifySignature(t *testing.T) {
- if VerifySignature(knownSignedLocator, knownToken, []byte(knownKey)) != nil {
+ if VerifySignature(knownSignedLocator, knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
t.Fail()
}
}
func TestVerifySignatureExtraHints(t *testing.T) {
- if VerifySignature(knownLocator+"+K@xyzzy"+knownSigHint, knownToken, []byte(knownKey)) != nil {
+ if VerifySignature(knownLocator+"+K@xyzzy"+knownSigHint, knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
t.Fatal("Verify cannot handle hint before permission signature")
}
- if VerifySignature(knownLocator+knownSigHint+"+Zfoo", knownToken, []byte(knownKey)) != nil {
+ if VerifySignature(knownLocator+knownSigHint+"+Zfoo", knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
t.Fatal("Verify cannot handle hint after permission signature")
}
- if VerifySignature(knownLocator+"+K@xyzzy"+knownSigHint+"+Zfoo", knownToken, []byte(knownKey)) != nil {
+ if VerifySignature(knownLocator+"+K@xyzzy"+knownSigHint+"+Zfoo", knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
t.Fatal("Verify cannot handle hints around permission signature")
}
}
// The size hint on the locator string should not affect signature validation.
func TestVerifySignatureWrongSize(t *testing.T) {
- if VerifySignature(knownHash+"+999999"+knownSigHint, knownToken, []byte(knownKey)) != nil {
+ if VerifySignature(knownHash+"+999999"+knownSigHint, knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
t.Fatal("Verify cannot handle incorrect size hint")
}
- if VerifySignature(knownHash+knownSigHint, knownToken, []byte(knownKey)) != nil {
+ if VerifySignature(knownHash+knownSigHint, knownToken, blobSignatureTTL, []byte(knownKey)) != nil {
t.Fatal("Verify cannot handle missing size hint")
}
}
func TestVerifySignatureBadSig(t *testing.T) {
badLocator := knownLocator + "+Aaaaaaaaaaaaaaaa@" + knownTimestamp
- if VerifySignature(badLocator, knownToken, []byte(knownKey)) != ErrSignatureMissing {
+ if VerifySignature(badLocator, knownToken, blobSignatureTTL, []byte(knownKey)) != ErrSignatureMissing {
t.Fail()
}
}
func TestVerifySignatureBadTimestamp(t *testing.T) {
badLocator := knownLocator + "+A" + knownSignature + "@OOOOOOOl"
- if VerifySignature(badLocator, knownToken, []byte(knownKey)) != ErrSignatureMissing {
+ if VerifySignature(badLocator, knownToken, blobSignatureTTL, []byte(knownKey)) != ErrSignatureMissing {
t.Fail()
}
}
func TestVerifySignatureBadSecret(t *testing.T) {
- if VerifySignature(knownSignedLocator, knownToken, []byte("00000000000000000000")) != ErrSignatureInvalid {
+ if VerifySignature(knownSignedLocator, knownToken, blobSignatureTTL, []byte("00000000000000000000")) != ErrSignatureInvalid {
t.Fail()
}
}
func TestVerifySignatureBadToken(t *testing.T) {
- if VerifySignature(knownSignedLocator, "00000000", []byte(knownKey)) != ErrSignatureInvalid {
+ if VerifySignature(knownSignedLocator, "00000000", blobSignatureTTL, []byte(knownKey)) != ErrSignatureInvalid {
t.Fail()
}
}
func TestVerifySignatureExpired(t *testing.T) {
yesterday := time.Now().AddDate(0, 0, -1)
- expiredLocator := SignLocator(knownHash, knownToken, yesterday, []byte(knownKey))
- if VerifySignature(expiredLocator, knownToken, []byte(knownKey)) != ErrSignatureExpired {
+ expiredLocator := SignLocator(knownHash, knownToken, yesterday, blobSignatureTTL, []byte(knownKey))
+ if VerifySignature(expiredLocator, knownToken, blobSignatureTTL, []byte(knownKey)) != ErrSignatureExpired {
t.Fail()
}
}
return ch
}
-// Blocks may appear mulitple times within the same manifest if they
+// Blocks may appear multiple times within the same manifest if they
// are used by multiple files. In that case this Iterator will output
// the same block multiple times.
//
Meanwhile, the transfer() function selects() on two channels, the "requests"
channel and the "slices" channel.
-When a message is recieved on the "slices" channel, this means the a new
+When a message is received on the "slices" channel, this means a new
section of the buffer has data, or an error is signaled. Since the data has
been read directly into the source_buffer, it is able to simply increase the
size of the body slice to encompass the newly filled in section. Then any
pending reads are serviced with handleReadRequest (described below).
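For illustration only, the select loop described here might look like the following simplified, self-contained sketch; readRequest and the append-based buffer stand in for the real sliceRequest/handleReadRequest machinery:

type readRequest struct {
    offset int
    result chan []byte
}

func exampleTransfer(requests <-chan readRequest, slices <-chan []byte) {
    var body []byte
    var pending []readRequest
    for {
        select {
        case s := <-slices:
            // A new section of the buffer has been filled in: grow the
            // body slice, then service any pending reads it now covers.
            body = append(body, s...)
            var still []readRequest
            for _, r := range pending {
                if r.offset < len(body) {
                    r.result <- body[r.offset:]
                } else {
                    still = append(still, r)
                }
            }
            pending = still
        case r := <-requests:
            // A StreamReader wants a slice of the buffer; serve it now if
            // the data is already available, otherwise queue the request.
            if r.offset < len(body) {
                r.result <- body[r.offset:]
            } else {
                pending = append(pending, r)
            }
        }
    }
}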
-When a message is recieved on the "requests" channel, it means a StreamReader
+When a message is received on the "requests" channel, it means a StreamReader
wants access to a slice of the buffer. This is passed to handleReadRequest().
The handleReadRequest() function takes a sliceRequest consisting of a buffer
cache_pos, cache_data = self._readline_cache
if self.tell() == cache_pos:
data = [cache_data]
+ self._filepos += len(cache_data)
else:
data = ['']
data_size = len(data[-1])
except ValueError:
nextline_index = len(data)
nextline_index = min(nextline_index, size)
+ self._filepos -= len(data) - nextline_index
self._readline_cache = (self.tell(), data[nextline_index:])
return data[:nextline_index]
# arv-copy will issue an error.
import argparse
+import contextlib
import getpass
import os
import re
from arvados.api import OrderedJsonModel
+COMMIT_HASH_RE = re.compile(r'^[0-9a-f]{1,40}$')
+
logger = logging.getLogger('arvados.arv-copy')
# local_repo_dir records which git repositories from the Arvados source
copy_opts.add_argument(
'-f', '--force', dest='force', action='store_true',
help='Perform copy even if the object appears to exist at the remote destination.')
+ copy_opts.add_argument(
+ '--force-filters', action='store_true', default=False,
+ help="Copy pipeline template filters verbatim, even if they act differently on the destination cluster.")
copy_opts.add_argument(
'--src', dest='source_arvados', required=True,
help='The name of the source Arvados instance (required) - points at an Arvados config file. May be either a pathname to a config file, or (for example) "foo" as shorthand for $HOME/.config/arvados/foo.conf.')
new_pi = dst.pipeline_instances().create(body=pi, ensure_unique_name=True).execute(num_retries=args.retries)
return new_pi
+def filter_iter(arg):
+ """Iterate a filter string-or-list.
+
+ Pass in a filter field that can either be a string or list.
+ This will iterate elements as if the field had been written as a list.
+ """
+ if isinstance(arg, basestring):
+ return iter((arg,))
+ else:
+ return iter(arg)
+
+def migrate_repository_filter(repo_filter, src_repository, dst_repository):
+ """Update a single repository filter in-place for the destination.
+
+ If the filter checks that the repository is src_repository, it is
+ updated to check that the repository is dst_repository. If it does
+ anything else, this function raises ValueError.
+ """
+ if src_repository is None:
+ raise ValueError("component does not specify a source repository")
+ elif dst_repository is None:
+ raise ValueError("no destination repository specified to update repository filter")
+ elif repo_filter[1:] == ['=', src_repository]:
+ repo_filter[2] = dst_repository
+ elif repo_filter[1:] == ['in', [src_repository]]:
+ repo_filter[2] = [dst_repository]
+ else:
+ raise ValueError("repository filter is not a simple source match")
+
+def migrate_script_version_filter(version_filter):
+ """Update a single script_version filter in-place for the destination.
+
+ Currently this function checks that all the filter operands are Git
+ commit hashes. If they're not, it raises ValueError to indicate that
+ the filter is not portable. It could be extended to make other
+ transformations in the future.
+ """
+ if not all(COMMIT_HASH_RE.match(v) for v in filter_iter(version_filter[2])):
+ raise ValueError("script_version filter is not limited to commit hashes")
+
+def attr_filtered(filter_, *attr_names):
+ """Return True if filter_ applies to any of attr_names, else False."""
+ return any((name == 'any') or (name in attr_names)
+ for name in filter_iter(filter_[0]))
+
+@contextlib.contextmanager
+def exception_handler(handler, *exc_types):
+ """If any exc_types are raised in the block, call handler on the exception."""
+ try:
+ yield
+ except exc_types as error:
+ handler(error)
+
+def migrate_components_filters(template_components, dst_git_repo):
+ """Update template component filters in-place for the destination.
+
+ template_components is a dictionary of components in a pipeline template.
+    This function walks over each component's filters and updates them to have
+    identical semantics on the destination cluster. It returns a list of
+    error strings describing which filters could not be updated safely.
+
+ dst_git_repo is the name of the destination Git repository, which can
+ be None if that is not known.
+ """
+ errors = []
+ for cname, cspec in template_components.iteritems():
+ def add_error(errmsg):
+ errors.append("{}: {}".format(cname, errmsg))
+ if not isinstance(cspec, dict):
+ add_error("value is not a component definition")
+ continue
+ src_repository = cspec.get('repository')
+ filters = cspec.get('filters', [])
+ if not isinstance(filters, list):
+ add_error("filters are not a list")
+ continue
+ for cfilter in filters:
+ if not (isinstance(cfilter, list) and (len(cfilter) == 3)):
+ add_error("malformed filter {!r}".format(cfilter))
+ continue
+ if attr_filtered(cfilter, 'repository'):
+ with exception_handler(add_error, ValueError):
+ migrate_repository_filter(cfilter, src_repository, dst_git_repo)
+ if attr_filtered(cfilter, 'script_version'):
+ with exception_handler(add_error, ValueError):
+ migrate_script_version_filter(cfilter)
+ return errors
+
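For illustration only, here is how the helpers above rewrite a pipeline template component's filters for the destination cluster; the component and repository names are made up:

components = {
    'main': {
        'repository': 'src-repo',
        'filters': [
            ['repository', '=', 'src-repo'],
            ['script_version', 'in git', 'master'],
        ],
    },
}
errors = migrate_components_filters(components, 'dst-repo')
# The repository filter becomes ['repository', '=', 'dst-repo']. The
# script_version filter cannot be rewritten safely ('master' is not a bare
# commit hash), so an error string for it is returned in errors.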
# copy_pipeline_template(pt_uuid, src, dst, args)
#
# Copies a pipeline template identified by pt_uuid from src to dst.
# fetch the pipeline template from the source instance
pt = src.pipeline_templates().get(uuid=pt_uuid).execute(num_retries=args.retries)
+ if not args.force_filters:
+ filter_errors = migrate_components_filters(pt['components'], args.dst_git_repo)
+ if filter_errors:
+ abort("Template filters cannot be copied safely. Use --force-filters to copy anyway.\n" +
+ "\n".join(filter_errors))
+
if args.recursive:
check_git_availability()
return [(image['collection'], image) for image in images
if image['collection'] in existing_coll_uuids]
-def main(arguments=None):
+def items_owned_by(owner_uuid, arv_items):
+ return (item for item in arv_items if item['owner_uuid'] == owner_uuid)
+
+def main(arguments=None, stdout=sys.stdout):
args = arg_parser.parse_args(arguments)
api = arvados.api('v1')
if args.image is None or args.image == 'images':
- fmt = "{:30} {:10} {:12} {:29} {:20}"
- print fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED")
+ fmt = "{:30} {:10} {:12} {:29} {:20}\n"
+ stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
for i, j in list_images_in_arv(api, args.retries):
- print(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
+ stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
sys.exit(0)
# Pull the image if requested, unless the image is specified as a hash
num_retries=args.retries)['uuid']
# Find image hash tags
- existing_links = api.links().list(
+ existing_links = _get_docker_links(
+ api, args.retries,
filters=[['link_class', '=', 'docker_image_hash'],
- ['name', '=', image_hash]]
- ).execute(num_retries=args.retries)['items']
+ ['name', '=', image_hash]])
if existing_links:
# get readable collections
collections = api.collections().list(
if collections:
# check for repo+tag links on these collections
- existing_repo_tag = (api.links().list(
- filters=[['link_class', '=', 'docker_image_repo+tag'],
- ['name', '=', image_repo_tag],
- ['head_uuid', 'in', collections]]
- ).execute(num_retries=args.retries)['items']) if image_repo_tag else []
-
- # Filter on elements owned by the parent project
- owned_col = [c for c in collections if c['owner_uuid'] == parent_project_uuid]
- owned_img = [c for c in existing_links if c['owner_uuid'] == parent_project_uuid]
- owned_rep = [c for c in existing_repo_tag if c['owner_uuid'] == parent_project_uuid]
-
- if owned_col:
- # already have a collection owned by this project
- coll_uuid = owned_col[0]['uuid']
+ if image_repo_tag:
+ existing_repo_tag = _get_docker_links(
+ api, args.retries,
+ filters=[['link_class', '=', 'docker_image_repo+tag'],
+ ['name', '=', image_repo_tag],
+ ['head_uuid', 'in', collections]])
else:
+ existing_repo_tag = []
+
+ try:
+ coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
+ except StopIteration:
# create new collection owned by the project
coll_uuid = api.collections().create(
body={"manifest_text": collections[0]['manifest_text'],
).execute(num_retries=args.retries)['uuid']
link_base = {'owner_uuid': parent_project_uuid,
- 'head_uuid': coll_uuid }
+ 'head_uuid': coll_uuid,
+ 'properties': existing_links[0]['properties']}
- if not owned_img:
+ if not any(items_owned_by(parent_project_uuid, existing_links)):
# create image link owned by the project
make_link(api, args.retries,
'docker_image_hash', image_hash, **link_base)
- if not owned_rep and image_repo_tag:
+ if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
# create repo+tag link owned by the project
make_link(api, args.retries, 'docker_image_repo+tag',
image_repo_tag, **link_base)
- print(coll_uuid)
+ stdout.write(coll_uuid + "\n")
sys.exit(0)
put_args += ['--name', collection_name]
coll_uuid = arv_put.main(
- put_args + ['--filename', outfile_name, image_file.name]).strip()
+ put_args + ['--filename', outfile_name, image_file.name], stdout=stdout).strip()
# Read the image metadata and make Arvados links from it.
image_file.seek(0)
self.cache_file.seek(0)
return json.load(self.cache_file)
+ def check_cache(self, api_client=None, num_retries=0):
+ try:
+ state = self.load()
+ locator = None
+ try:
+ if "_finished_streams" in state and len(state["_finished_streams"]) > 0:
+ locator = state["_finished_streams"][0][1][0]
+ elif "_current_stream_locators" in state and len(state["_current_stream_locators"]) > 0:
+ locator = state["_current_stream_locators"][0]
+ if locator is not None:
+ kc = arvados.keep.KeepClient(api_client=api_client)
+ kc.head(locator, num_retries=num_retries)
+            except Exception:
+                self.restart()
+        except ValueError:
+            pass
+
def save(self, data):
try:
new_cache_fd, new_cache_name = tempfile.mkstemp(
if args.resume:
try:
resume_cache = ResumeCache(ResumeCache.make_path(args))
+ resume_cache.check_cache(api_client=api_client, num_retries=args.retries)
except (IOError, OSError, ValueError):
pass # Couldn't open cache directory/file. Continue without it.
except ResumeCacheConflict:
if args.progress: # Print newline to split stderr from stdout for humans.
print >>stderr
+ output = None
if args.stream:
output = writer.manifest_text()
if args.normalize:
status = 1
# Print the locator (uuid) of the new collection.
- stdout.write(output)
- if not output.endswith('\n'):
- stdout.write('\n')
+ if output is None:
+ status = status or 1
+ else:
+ stdout.write(output)
+ if not output.endswith('\n'):
+ stdout.write('\n')
for sigcode, orig_handler in orig_signal_handlers.items():
signal.signal(sigcode, orig_handler)
self.prefix = prefix
self.fn = fn
+ def __hash__(self):
+ return (self.prefix+self.fn).__hash__()
+
+ def __eq__(self, other):
+ return (self.prefix == other.prefix) and (self.fn == other.fn)
+
class UploadFile(ArvFile):
pass
return prefix+fn
-def uploadfiles(files, api, dry_run=False, num_retries=0, project=None, fnPattern="$(file %s/%s)"):
+def uploadfiles(files, api, dry_run=False, num_retries=0, project=None, fnPattern="$(file %s/%s)", name=None):
# Find the smallest path prefix that includes all the files that need to be uploaded.
# This starts at the root and iteratively removes common parent directory prefixes
- # until all file pathes no longer have a common parent.
+ # until all file paths no longer have a common parent.
n = True
pathprefix = "/"
while n:
stream = sp[0]
collection.start_new_stream(stream)
collection.write_file(f.fn, sp[1])
- item = api.collections().create(body={"owner_uuid": project, "manifest_text": collection.manifest_text()}).execute()
+
+ exists = api.collections().list(filters=[["owner_uuid", "=", project],
+ ["portable_data_hash", "=", collection.portable_data_hash()],
+ ["name", "=", name]]).execute(num_retries=num_retries)
+ if exists["items"]:
+ item = exists["items"][0]
+ logger.info("Using collection %s", item["uuid"])
+ else:
+ body = {"owner_uuid": project, "manifest_text": collection.manifest_text()}
+ if name is not None:
+ body["name"] = name
+ item = api.collections().create(body=body, ensure_unique_name=True).execute()
+ logger.info("Uploaded to %s", item["uuid"])
+
pdh = item["portable_data_hash"]
- logger.info("Uploaded to %s", item["uuid"])
for c in files:
c.fn = fnPattern % (pdh, c.fn)
import arvados
import config
import errors
+from retry import RetryLoop
import logging
import json
+import thread
import threading
import time
import os
_logger = logging.getLogger('arvados.events')
-class EventClient(WebSocketClient):
- def __init__(self, url, filters, on_event, last_log_id):
+class _EventClient(WebSocketClient):
+ def __init__(self, url, filters, on_event, last_log_id, on_closed):
ssl_options = {'ca_certs': arvados.util.ca_certs_path()}
if config.flag_is_true('ARVADOS_API_HOST_INSECURE'):
ssl_options['cert_reqs'] = ssl.CERT_NONE
# IPv4 addresses (common with "localhost"), only one of them
# will be attempted -- and it might not be the right one. See
# ws4py's WebSocketBaseClient.__init__.
- super(EventClient, self).__init__(url, ssl_options=ssl_options)
+ super(_EventClient, self).__init__(url, ssl_options=ssl_options)
+
self.filters = filters
self.on_event = on_event
self.last_log_id = last_log_id
self._closing_lock = threading.RLock()
self._closing = False
self._closed = threading.Event()
+ self.on_closed = on_closed
def opened(self):
- self.subscribe(self.filters, self.last_log_id)
+ for f in self.filters:
+ self.subscribe(f, self.last_log_id)
def closed(self, code, reason=None):
self._closed.set()
+ self.on_closed()
def received_message(self, m):
with self._closing_lock:
:timeout: is the number of seconds to wait for ws4py to
indicate that the connection has closed.
"""
- super(EventClient, self).close(code, reason)
+ super(_EventClient, self).close(code, reason)
with self._closing_lock:
# make sure we don't process any more messages.
self._closing = True
# wait for ws4py to tell us the connection is closed.
self._closed.wait(timeout=timeout)
- def subscribe(self, filters, last_log_id=None):
- m = {"method": "subscribe", "filters": filters}
+ def subscribe(self, f, last_log_id=None):
+ m = {"method": "subscribe", "filters": f}
if last_log_id is not None:
m["last_log_id"] = last_log_id
self.send(json.dumps(m))
- def unsubscribe(self, filters):
- self.send(json.dumps({"method": "unsubscribe", "filters": filters}))
+ def unsubscribe(self, f):
+ self.send(json.dumps({"method": "unsubscribe", "filters": f}))
+
+
+class EventClient(object):
+ def __init__(self, url, filters, on_event_cb, last_log_id):
+ self.url = url
+ if filters:
+ self.filters = [filters]
+ else:
+ self.filters = [[]]
+ self.on_event_cb = on_event_cb
+ self.last_log_id = last_log_id
+ self.is_closed = threading.Event()
+ self._setup_event_client()
+
+ def _setup_event_client(self):
+ self.ec = _EventClient(self.url, self.filters, self.on_event,
+ self.last_log_id, self.on_closed)
+ self.ec.daemon = True
+ try:
+ self.ec.connect()
+ except Exception:
+ self.ec.close_connection()
+ raise
+
+ def subscribe(self, f, last_log_id=None):
+ self.filters.append(f)
+ self.ec.subscribe(f, last_log_id)
+
+ def unsubscribe(self, f):
+ del self.filters[self.filters.index(f)]
+ self.ec.unsubscribe(f)
+
+ def close(self, code=1000, reason='', timeout=0):
+ self.is_closed.set()
+ self.ec.close(code, reason, timeout)
+
+ def on_event(self, m):
+        if m.get('id') is not None:
+ self.last_log_id = m.get('id')
+ try:
+ self.on_event_cb(m)
+ except Exception as e:
+ _logger.exception("Unexpected exception from event callback.")
+ thread.interrupt_main()
+
+ def on_closed(self):
+ if not self.is_closed.is_set():
+ _logger.warn("Unexpected close. Reconnecting.")
+ for tries_left in RetryLoop(num_retries=25, backoff_start=.1, max_wait=15):
+ try:
+ self._setup_event_client()
+ break
+ except Exception as e:
+ _logger.warn("Error '%s' during websocket reconnect.", e)
+ if tries_left == 0:
+ _logger.exception("EventClient thread could not contact websocket server.")
+ self.is_closed.set()
+ thread.interrupt_main()
+ return
+
+ def run_forever(self):
+ # Have to poll here to let KeyboardInterrupt get raised.
+ while not self.is_closed.wait(1):
+ pass
class PollClient(threading.Thread):
self.id = self.last_log_id
else:
for f in self.filters:
- items = self.api.logs().list(limit=1, order="id desc", filters=f).execute()['items']
+ for tries_left in RetryLoop(num_retries=25, backoff_start=.1, max_wait=self.poll_time):
+ try:
+ items = self.api.logs().list(limit=1, order="id desc", filters=f).execute()['items']
+ break
+ except errors.ApiError as error:
+ pass
+ else:
+ tries_left = 0
+ break
+ if tries_left == 0:
+ _logger.exception("PollClient thread could not contact API server.")
+ with self._closing_lock:
+ self._closing.set()
+ thread.interrupt_main()
+ return
if items:
if items[0]['id'] > self.id:
self.id = items[0]['id']
max_id = self.id
moreitems = False
for f in self.filters:
- items = self.api.logs().list(order="id asc", filters=f+[["id", ">", str(self.id)]]).execute()
+ for tries_left in RetryLoop(num_retries=25, backoff_start=.1, max_wait=self.poll_time):
+ try:
+ items = self.api.logs().list(order="id asc", filters=f+[["id", ">", str(self.id)]]).execute()
+ break
+ except errors.ApiError as error:
+ pass
+ else:
+ tries_left = 0
+ break
+ if tries_left == 0:
+ _logger.exception("PollClient thread could not contact API server.")
+ with self._closing_lock:
+ self._closing.set()
+ thread.interrupt_main()
+ return
for i in items["items"]:
if i['id'] > max_id:
max_id = i['id']
with self._closing_lock:
if self._closing.is_set():
return
- self.on_event(i)
+ try:
+ self.on_event(i)
+ except Exception as e:
+ _logger.exception("Unexpected exception from event callback.")
+ thread.interrupt_main()
if items["items_available"] > len(items["items"]):
moreitems = True
self.id = max_id
# to do so raises the same exception."
pass
- def subscribe(self, filters):
+ def subscribe(self, f):
self.on_event({'status': 200})
- self.filters.append(filters)
+ self.filters.append(f)
- def unsubscribe(self, filters):
- del self.filters[self.filters.index(filters)]
+ def unsubscribe(self, f):
+ del self.filters[self.filters.index(f)]
def _subscribe_websocket(api, filters, on_event, last_log_id=None):
if not endpoint:
raise errors.FeatureNotEnabledError(
"Server does not advertise a websocket endpoint")
+ uri_with_token = "{}?api_token={}".format(endpoint, api.api_token)
try:
- uri_with_token = "{}?api_token={}".format(endpoint, api.api_token)
client = EventClient(uri_with_token, filters, on_event, last_log_id)
- ok = False
- try:
- client.connect()
- ok = True
- return client
- finally:
- if not ok:
- client.close_connection()
- except:
+ except Exception:
_logger.warn("Failed to connect to websockets on %s" % endpoint)
raise
+ else:
+ return client
def subscribe(api, filters, on_event, poll_fallback=15, last_log_id=None):
self._cache.insert(0, n)
return n, True
-
class Counter(object):
def __init__(self, v=0):
self._lk = threading.Lock()
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 75)
return s
- def get(self, locator, timeout=None):
+ def get(self, locator, method="GET", timeout=None):
# locator is a KeepLocator object.
url = self.root + str(locator)
- _logger.debug("Request: GET %s", url)
+ _logger.debug("Request: %s %s", method, url)
curl = self._get_user_agent()
ok = None
try:
'{}: {}'.format(k,v) for k,v in self.get_headers.iteritems()])
curl.setopt(pycurl.WRITEFUNCTION, response_body.write)
curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
+ if method == "HEAD":
+ curl.setopt(pycurl.NOBODY, True)
self._setcurltimeouts(curl, timeout)
+
try:
curl.perform()
except Exception as e:
'headers': self._headers,
'error': False,
}
+
ok = retry.check_http_response_success(self._result['status_code'])
if not ok:
self._result['error'] = arvados.errors.HttpError(
_logger.debug("Request fail: GET %s => %s: %s",
url, type(self._result['error']), str(self._result['error']))
return None
+ if method == "HEAD":
+ _logger.info("HEAD %s: %s bytes",
+ self._result['status_code'],
+ self._result.get('content-length'))
+ return True
+
_logger.info("GET %s: %s bytes in %s msec (%.3f MiB/sec)",
self._result['status_code'],
len(self._result['body']),
t.msecs,
(len(self._result['body'])/(1024.0*1024))/t.secs if t.secs > 0 else 0)
+
if self.download_counter:
self.download_counter.add(len(self._result['body']))
resp_md5 = hashlib.md5(self._result['body']).hexdigest()
else:
return None
+ @retry.retry_method
+ def head(self, loc_s, num_retries=None):
+ return self._get_or_head(loc_s, method="HEAD", num_retries=num_retries)
+
@retry.retry_method
def get(self, loc_s, num_retries=None):
+ return self._get_or_head(loc_s, method="GET", num_retries=num_retries)
+
+ def _get_or_head(self, loc_s, method="GET", num_retries=None):
"""Get data from Keep.
This method fetches one or more blocks of data from Keep. It
self.get_counter.add(1)
locator = KeepLocator(loc_s)
- slot, first = self.block_cache.reserve_cache(locator.md5sum)
- if not first:
- self.hits_counter.add(1)
- v = slot.get()
- return v
+ if method == "GET":
+ slot, first = self.block_cache.reserve_cache(locator.md5sum)
+ if not first:
+ self.hits_counter.add(1)
+ v = slot.get()
+ return v
self.misses_counter.add(1)
for root in sorted_roots
if roots_map[root].usable()]
for keep_service in services_to_try:
- blob = keep_service.get(locator, timeout=self.current_timeout(num_retries-tries_left))
+ blob = keep_service.get(locator, method=method, timeout=self.current_timeout(num_retries-tries_left))
if blob is not None:
break
loop.save_result((blob, len(services_to_try)))
# Always cache the result, then return it if we succeeded.
- slot.set(blob)
- self.block_cache.cap_cache()
+ if method == "GET":
+ slot.set(blob)
+ self.block_cache.cap_cache()
if loop.success():
- return blob
+ if method == "HEAD":
+ return True
+ else:
+ return blob
# Q: Including 403 is necessary for the Keep tests to continue
# passing, but maybe they should expect KeepReadError instead?
return loop.last_result()
"""
def __init__(self, num_retries, success_check=lambda r: True,
- backoff_start=0, backoff_growth=2, save_results=1):
+ backoff_start=0, backoff_growth=2, save_results=1,
+ max_wait=60):
"""Construct a new RetryLoop.
Arguments:
* save_results: Specify a number to save the last N results
that the loop recorded. These records are available through
the results attribute, oldest first. Default 1.
+ * max_wait: Maximum number of seconds to wait between retries.
"""
self.tries_left = num_retries + 1
self.check_result = success_check
self.backoff_wait = backoff_start
self.backoff_growth = backoff_growth
+ self.max_wait = max_wait
self.next_start_time = 0
self.results = deque(maxlen=save_results)
self._running = None
wait_time = max(0, self.next_start_time - time.time())
time.sleep(wait_time)
self.backoff_wait *= self.backoff_growth
+ if self.backoff_wait > self.max_wait:
+ self.backoff_wait = self.max_wait
self.next_start_time = time.time() + self.backoff_wait
self.tries_left -= 1
return self.tries_left
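For illustration only, max_wait caps the exponential backoff like this, mirroring the retry pattern used by the event clients above; fetch() is a hypothetical operation that may fail transiently:

for tries_left in RetryLoop(num_retries=25, backoff_start=.1, max_wait=15):
    try:
        result = fetch()  # hypothetical; replace with the real operation
        break
    except IOError:
        if tries_left == 0:
            raise
# Waits between attempts grow geometrically but never exceed 15 seconds.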
except ImportError:
tagger = egg_info_cmd.egg_info
+short_tests_only = False
+if '--short-tests-only' in sys.argv:
+ short_tests_only = True
+ sys.argv.remove('--short-tests-only')
+
setup(name='arvados-python-client',
version='0.1',
description='Arvados client library',
install_requires=[
'google-api-python-client==1.4.2',
'oauth2client >=1.4.6, <2',
+ 'pyasn1-modules==0.0.5',
'ciso8601',
'httplib2',
'pycurl >=7.19.5.1, <7.21.5',
self._headerfunction("HTTP/1.1 {} Status".format(self._resp_code))
for k, v in self._resp_headers.iteritems():
self._headerfunction(k + ': ' + str(v))
- self._writer(self._resp_body)
+ if type(self._resp_body) is not bool:
+ self._writer(self._resp_body)
def close(self):
pass
self.wfile_bandwidth_write(self.server.store[datahash])
self.server._do_delay('response_close')
+ def do_HEAD(self):
+ self.server._do_delay('response')
+ r = re.search(r'[0-9a-f]{32}', self.path)
+ if not r:
+ return self.send_response(422)
+ datahash = r.group(0)
+ if datahash not in self.server.store:
+ return self.send_response(404)
+ self.send_response(200)
+ self.send_header('Content-type', 'application/octet-stream')
+ self.send_header('Content-length', str(len(self.server.store[datahash])))
+ self.end_headers()
+ self.server._do_delay('response_close')
+
def do_PUT(self):
self.server._do_delay('request_body')
# The comments at https://bugs.python.org/issue1491 implies that Python
--- /dev/null
+import __main__
+import os
+import unittest
+
+slow_test = lambda _: unittest.skipIf(
+ __main__.short_tests_only,
+ "running --short tests only")
def test_empty_list(self):
answer = arvados.api('v1').humans().list(
- filters=[['uuid', 'is', None]]).execute()
+ filters=[['uuid', '=', None]]).execute()
self.assertEqual(answer['items_available'], len(answer['items']))
def test_nonempty_list(self):
import arvados
import arvados.commands.put as arv_put
-from arvados_testutil import ArvadosBaseTestCase
+from arvados_testutil import ArvadosBaseTestCase, fake_httplib2_response
import run_test_server
class ArvadosPutResumeCacheTest(ArvadosBaseTestCase):
else:
config['ARVADOS_API_HOST'] = orig_host
+ @mock.patch('arvados.keep.KeepClient.head')
+ def test_resume_cache_with_current_stream_locators(self, keep_client_head):
+ keep_client_head.side_effect = [True]
+ thing = {}
+ thing['_current_stream_locators'] = ['098f6bcd4621d373cade4e832627b4f6+4', '1f253c60a2306e0ee12fb6ce0c587904+6']
+ with tempfile.NamedTemporaryFile() as cachefile:
+ self.last_cache = arv_put.ResumeCache(cachefile.name)
+ self.last_cache.save(thing)
+ self.last_cache.close()
+ resume_cache = arv_put.ResumeCache(self.last_cache.filename)
+ self.assertNotEqual(None, resume_cache)
+
+ @mock.patch('arvados.keep.KeepClient.head')
+ def test_resume_cache_with_finished_streams(self, keep_client_head):
+ keep_client_head.side_effect = [True]
+ thing = {}
+ thing['_finished_streams'] = [['.', ['098f6bcd4621d373cade4e832627b4f6+4', '1f253c60a2306e0ee12fb6ce0c587904+6']]]
+ with tempfile.NamedTemporaryFile() as cachefile:
+ self.last_cache = arv_put.ResumeCache(cachefile.name)
+ self.last_cache.save(thing)
+ self.last_cache.close()
+ resume_cache = arv_put.ResumeCache(self.last_cache.filename)
+ self.assertNotEqual(None, resume_cache)
+
+ @mock.patch('arvados.keep.KeepClient.head')
+ def test_resume_cache_with_finished_streams_error_on_head(self, keep_client_head):
+ keep_client_head.side_effect = Exception('Locator not found')
+ thing = {}
+ thing['_finished_streams'] = [['.', ['098f6bcd4621d373cade4e832627b4f6+4', '1f253c60a2306e0ee12fb6ce0c587904+6']]]
+ with tempfile.NamedTemporaryFile() as cachefile:
+ self.last_cache = arv_put.ResumeCache(cachefile.name)
+ self.last_cache.save(thing)
+ self.last_cache.close()
+ resume_cache = arv_put.ResumeCache(self.last_cache.filename)
+ self.assertNotEqual(None, resume_cache)
+            # check_cache() should swallow the failed HEAD request and reset
+            # the cache instead of raising.
+            resume_cache.check_cache()
+
def test_basic_cache_storage(self):
thing = ['test', 'list']
with tempfile.NamedTemporaryFile() as cachefile:
self.call_main_with_args,
['--project-uuid', self.Z_UUID, '--stream'])
+ def test_api_error_handling(self):
+ collections_mock = mock.Mock(name='arv.collections()')
+ coll_create_mock = collections_mock().create().execute
+ coll_create_mock.side_effect = arvados.errors.ApiError(
+ fake_httplib2_response(403), '{}')
+ arv_put.api_client = arvados.api('v1')
+ arv_put.api_client.collections = collections_mock
+ with self.assertRaises(SystemExit) as exc_test:
+ self.call_main_with_args(['/dev/null'])
+ self.assertLess(0, exc_test.exception.args[0])
+ self.assertLess(0, coll_create_mock.call_count)
+ self.assertEqual("", self.main_stdout.getvalue())
+
+
class ArvPutIntegrationTest(run_test_server.TestCaseWithServers,
ArvadosBaseTestCase):
def _getKeepServerConfig():
--- /dev/null
+import arvados
+import io
+import logging
+import mock
+import Queue
+import run_test_server
+import threading
+import time
+import unittest
+
+import arvados_testutil
+
+class WebsocketTest(run_test_server.TestCaseWithServers):
+ MAIN_SERVER = {}
+
+ TIME_PAST = time.time()-3600
+ TIME_FUTURE = time.time()+3600
+ MOCK_WS_URL = 'wss://[{}]/'.format(arvados_testutil.TEST_HOST)
+
+ def setUp(self):
+ self.ws = None
+
+ def tearDown(self):
+ try:
+ if self.ws:
+ self.ws.close()
+ except Exception as e:
+ print("Error in teardown: ", e)
+ super(WebsocketTest, self).tearDown()
+ run_test_server.reset()
+
+ def _test_subscribe(self, poll_fallback, expect_type, start_time=None, expected=1):
+ run_test_server.authorize_with('active')
+ events = Queue.Queue(100)
+
+ # Create ancestor before subscribing.
+ # When listening with start_time in the past, this should also be retrieved.
+ # However, when start_time is omitted in subscribe, this should not be fetched.
+ ancestor = arvados.api('v1').humans().create(body={}).execute()
+
+ filters = [['object_uuid', 'is_a', 'arvados#human']]
+ if start_time:
+ filters.append(['created_at', '>=', start_time])
+
+ self.ws = arvados.events.subscribe(
+ arvados.api('v1'), filters,
+ events.put_nowait,
+ poll_fallback=poll_fallback,
+ last_log_id=(1 if start_time else None))
+ self.assertIsInstance(self.ws, expect_type)
+ self.assertEqual(200, events.get(True, 5)['status'])
+ human = arvados.api('v1').humans().create(body={}).execute()
+
+ log_object_uuids = []
+ for i in range(0, expected):
+ log_object_uuids.append(events.get(True, 5)['object_uuid'])
+
+ if expected > 0:
+ self.assertIn(human['uuid'], log_object_uuids)
+
+ if expected > 1:
+ self.assertIn(ancestor['uuid'], log_object_uuids)
+
+ with self.assertRaises(Queue.Empty):
+ # assertEqual just serves to show us what unexpected thing
+ # comes out of the queue when the assertRaises fails; when
+ # the test passes, this assertEqual doesn't get called.
+ self.assertEqual(events.get(True, 2), None)
+
+ def test_subscribe_websocket(self):
+ self._test_subscribe(
+ poll_fallback=False, expect_type=arvados.events.EventClient, expected=1)
+
+ @mock.patch('arvados.events.EventClient.__init__')
+ def test_subscribe_poll(self, event_client_constr):
+ event_client_constr.side_effect = Exception('All is well')
+ self._test_subscribe(
+ poll_fallback=0.25, expect_type=arvados.events.PollClient, expected=1)
+
+ def test_subscribe_poll_retry(self):
+ api_mock = mock.MagicMock()
+ n = []
+ def on_ev(ev):
+ n.append(ev)
+
+ error_mock = mock.MagicMock()
+ error_mock.resp.status = 0
+ error_mock._get_reason.return_value = "testing"
+ api_mock.logs().list().execute.side_effect = (arvados.errors.ApiError(error_mock, ""),
+ {"items": [{"id": 1}], "items_available": 1},
+ arvados.errors.ApiError(error_mock, ""),
+ {"items": [{"id": 1}], "items_available": 1})
+ pc = arvados.events.PollClient(api_mock, [], on_ev, 15, None)
+ pc.start()
+ while len(n) < 2:
+ time.sleep(.1)
+ pc.close()
+
+ def test_subscribe_websocket_with_start_time_past(self):
+ self._test_subscribe(
+ poll_fallback=False, expect_type=arvados.events.EventClient,
+ start_time=self.localiso(self.TIME_PAST),
+ expected=2)
+
+ @mock.patch('arvados.events.EventClient.__init__')
+ def test_subscribe_poll_with_start_time_past(self, event_client_constr):
+ event_client_constr.side_effect = Exception('All is well')
+ self._test_subscribe(
+ poll_fallback=0.25, expect_type=arvados.events.PollClient,
+ start_time=self.localiso(self.TIME_PAST),
+ expected=2)
+
+ def test_subscribe_websocket_with_start_time_future(self):
+ self._test_subscribe(
+ poll_fallback=False, expect_type=arvados.events.EventClient,
+ start_time=self.localiso(self.TIME_FUTURE),
+ expected=0)
+
+ @mock.patch('arvados.events.EventClient.__init__')
+ def test_subscribe_poll_with_start_time_future(self, event_client_constr):
+ event_client_constr.side_effect = Exception('All is well')
+ self._test_subscribe(
+ poll_fallback=0.25, expect_type=arvados.events.PollClient,
+ start_time=self.localiso(self.TIME_FUTURE),
+ expected=0)
+
+ def test_subscribe_websocket_with_start_time_past_utc(self):
+ self._test_subscribe(
+ poll_fallback=False, expect_type=arvados.events.EventClient,
+ start_time=self.utciso(self.TIME_PAST),
+ expected=2)
+
+ def test_subscribe_websocket_with_start_time_future_utc(self):
+ self._test_subscribe(
+ poll_fallback=False, expect_type=arvados.events.EventClient,
+ start_time=self.utciso(self.TIME_FUTURE),
+ expected=0)
+
+ def utciso(self, t):
+ return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t))
+
+ def localiso(self, t):
+ return time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime(t)) + self.isotz(-time.timezone/60)
+
+ def isotz(self, offset):
+ """Convert minutes-east-of-UTC to ISO8601 time zone designator"""
+ return '{:+03d}{:02d}'.format(offset/60, offset%60)
+
+    # Test websocket reconnection on (un)expected close
+ def _test_websocket_reconnect(self, close_unexpected):
+ run_test_server.authorize_with('active')
+ events = Queue.Queue(100)
+
+ logstream = io.BytesIO()
+ rootLogger = logging.getLogger()
+ streamHandler = logging.StreamHandler(logstream)
+ rootLogger.addHandler(streamHandler)
+
+ filters = [['object_uuid', 'is_a', 'arvados#human']]
+ filters.append(['created_at', '>=', self.localiso(self.TIME_PAST)])
+ self.ws = arvados.events.subscribe(
+ arvados.api('v1'), filters,
+ events.put_nowait,
+ poll_fallback=False,
+ last_log_id=None)
+ self.assertIsInstance(self.ws, arvados.events.EventClient)
+ self.assertEqual(200, events.get(True, 5)['status'])
+
+ # create obj
+ human = arvados.api('v1').humans().create(body={}).execute()
+
+ # expect an event
+ self.assertIn(human['uuid'], events.get(True, 5)['object_uuid'])
+ with self.assertRaises(Queue.Empty):
+ self.assertEqual(events.get(True, 2), None)
+
+ # close (im)properly
+ if close_unexpected:
+ self.ws.ec.close_connection()
+ else:
+ self.ws.close()
+
+ # create one more obj
+ human2 = arvados.api('v1').humans().create(body={}).execute()
+
+ # (un)expect the object creation event
+ if close_unexpected:
+ log_object_uuids = []
+ for i in range(0, 2):
+ event = events.get(True, 5)
+                if event.get('object_uuid') is not None:
+ log_object_uuids.append(event['object_uuid'])
+ with self.assertRaises(Queue.Empty):
+ self.assertEqual(events.get(True, 2), None)
+ self.assertNotIn(human['uuid'], log_object_uuids)
+ self.assertIn(human2['uuid'], log_object_uuids)
+ else:
+ with self.assertRaises(Queue.Empty):
+ self.assertEqual(events.get(True, 2), None)
+
+        # verify the log messages to check whether an (un)expected close happened
+ log_messages = logstream.getvalue()
+ closeLogFound = log_messages.find("Unexpected close. Reconnecting.")
+ retryLogFound = log_messages.find("Error during websocket reconnect. Will retry")
+ if close_unexpected:
+ self.assertNotEqual(closeLogFound, -1)
+ else:
+ self.assertEqual(closeLogFound, -1)
+ rootLogger.removeHandler(streamHandler)
+
+ def test_websocket_reconnect_on_unexpected_close(self):
+ self._test_websocket_reconnect(True)
+
+ def test_websocket_no_reconnect_on_close_by_user(self):
+ self._test_websocket_reconnect(False)
+
+ # Test websocket reconnection retry
+ @mock.patch('arvados.events._EventClient.connect')
+ def test_websocket_reconnect_retry(self, event_client_connect):
+ event_client_connect.side_effect = [None, Exception('EventClient.connect error'), None]
+
+ logstream = io.BytesIO()
+ rootLogger = logging.getLogger()
+ streamHandler = logging.StreamHandler(logstream)
+ rootLogger.addHandler(streamHandler)
+
+ run_test_server.authorize_with('active')
+ events = Queue.Queue(100)
+
+ filters = [['object_uuid', 'is_a', 'arvados#human']]
+ self.ws = arvados.events.subscribe(
+ arvados.api('v1'), filters,
+ events.put_nowait,
+ poll_fallback=False,
+ last_log_id=None)
+ self.assertIsInstance(self.ws, arvados.events.EventClient)
+
+ # simulate improper close
+ self.ws.on_closed()
+
+ # verify log messages to ensure retry happened
+ log_messages = logstream.getvalue()
+ found = log_messages.find("Error 'EventClient.connect error' during websocket reconnect.")
+ self.assertNotEqual(found, -1)
+ rootLogger.removeHandler(streamHandler)
+
+ @mock.patch('arvados.events._EventClient')
+ def test_subscribe_method(self, websocket_client):
+ filters = [['object_uuid', 'is_a', 'arvados#human']]
+ client = arvados.events.EventClient(
+ self.MOCK_WS_URL, [], lambda event: None, None)
+ client.subscribe(filters[:], 99)
+ websocket_client().subscribe.assert_called_with(filters, 99)
+
+ @mock.patch('arvados.events._EventClient')
+ def test_unsubscribe(self, websocket_client):
+ filters = [['object_uuid', 'is_a', 'arvados#human']]
+ client = arvados.events.EventClient(
+ self.MOCK_WS_URL, filters[:], lambda event: None, None)
+ client.unsubscribe(filters[:])
+ websocket_client().unsubscribe.assert_called_with(filters)
+
+ @mock.patch('arvados.events._EventClient')
+ def test_run_forever_survives_reconnects(self, websocket_client):
+ connection_cond = threading.Condition()
+ def ws_connect():
+ with connection_cond:
+ connection_cond.notify_all()
+ websocket_client().connect.side_effect = ws_connect
+ client = arvados.events.EventClient(
+ self.MOCK_WS_URL, [], lambda event: None, None)
+ with connection_cond:
+ forever_thread = threading.Thread(target=client.run_forever)
+ forever_thread.start()
+ # Simulate an unexpected disconnect, and wait for reconnect.
+ close_thread = threading.Thread(target=client.on_closed)
+ close_thread.start()
+ connection_cond.wait()
+ close_thread.join()
+ run_forever_alive = forever_thread.is_alive()
+ client.close()
+ forever_thread.join()
+ self.assertTrue(run_forever_alive)
+ self.assertEqual(2, websocket_client().connect.call_count)
+
+
+class PollClientTestCase(unittest.TestCase):
+ class MockLogs(object):
+ def __init__(self):
+ self.logs = []
+ self.lock = threading.Lock()
+
+ def add(self, log):
+ with self.lock:
+ self.logs.append(log)
+
+ def return_list(self, num_retries=None):
+ with self.lock:
+ retval = self.logs
+ self.logs = []
+ return {'items': retval, 'items_available': len(retval)}
+
+
+ def setUp(self):
+ self.logs = self.MockLogs()
+ self.arv = mock.MagicMock(name='arvados.api()')
+ self.arv.logs().list().execute.side_effect = self.logs.return_list
+ self.callback_cond = threading.Condition()
+ self.recv_events = []
+
+ def tearDown(self):
+ if hasattr(self, 'client'):
+ self.client.close(timeout=None)
+
+ def callback(self, event):
+ with self.callback_cond:
+ self.recv_events.append(event)
+ self.callback_cond.notify_all()
+
+ def build_client(self, filters=None, callback=None, last_log_id=None, poll_time=99):
+ if filters is None:
+ filters = []
+ if callback is None:
+ callback = self.callback
+ self.client = arvados.events.PollClient(
+ self.arv, filters, callback, poll_time, last_log_id)
+
+ def was_filter_used(self, target):
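+ # True if any logs().list() call included the given filter.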
+ return any(target in call[-1].get('filters', [])
+ for call in self.arv.logs().list.call_args_list)
+
+ def test_callback(self):
+ test_log = {'id': 12345, 'testkey': 'testtext'}
+ self.logs.add({'id': 123})
+ self.build_client(poll_time=.01)
+ with self.callback_cond:
+ self.client.start()
+ self.callback_cond.wait()
+ self.logs.add(test_log.copy())
+ self.callback_cond.wait()
+ self.client.close(timeout=None)
+ self.assertIn(test_log, self.recv_events)
+
+ def test_subscribe(self):
+ client_filter = ['kind', '=', 'arvados#test']
+ self.build_client()
+ self.client.subscribe([client_filter[:]])
+ with self.callback_cond:
+ self.client.start()
+ self.callback_cond.wait()
+ self.client.close(timeout=None)
+ self.assertTrue(self.was_filter_used(client_filter))
+
+ def test_unsubscribe(self):
+ client_filter = ['kind', '=', 'arvados#test']
+ self.build_client()
+ self.client.subscribe([client_filter[:]])
+ self.client.unsubscribe([client_filter[:]])
+ self.client.start()
+ self.client.close(timeout=None)
+ self.assertFalse(self.was_filter_used(client_filter))
+
+ def test_run_forever(self):
+ self.build_client()
+ with self.callback_cond:
+ self.client.start()
+ forever_thread = threading.Thread(target=self.client.run_forever)
+ forever_thread.start()
+ self.callback_cond.wait()
+ self.assertTrue(forever_thread.is_alive())
+ self.client.close()
+ forever_thread.join()
blob_str,
'wrong content from Keep.get(md5(<binarydata>))')
+ @unittest.skip("unreliable test - please fix and close #8752")
def test_KeepSingleCopyRWTest(self):
blob_str = '\xff\xfe\xfd\xfc\x00\x01\x02\x03'
blob_locator = self.keep_client.put(blob_str, copies=1)
# Must be a string type
self.keep_client.put({})
+ def test_KeepHeadTest(self):
+ locator = self.keep_client.put('test_head')
+ self.assertRegexpMatches(
+ locator,
+ '^b9a772c7049325feb7130fff1f8333e9\+9',
+ 'wrong md5 hash from Keep.put for "test_head": ' + locator)
+ self.assertEqual(True, self.keep_client.head(locator))
+ self.assertEqual(self.keep_client.get(locator),
+ 'test_head',
+ 'wrong content from Keep.get for "test_head"')
+
class KeepPermissionTestCase(run_test_server.TestCaseWithServers):
MAIN_SERVER = {}
KEEP_SERVER = {'blob_signing_key': 'abcdefghijk0123456789',
mock.responses[0].getopt(pycurl.LOW_SPEED_LIMIT),
int(arvados.KeepClient.DEFAULT_TIMEOUT[2]))
+ def test_head_timeout(self):
+ api_client = self.mock_keep_services(count=1)
+ force_timeout = socket.timeout("timed out")
+ with tutil.mock_keep_responses(force_timeout, 0) as mock:
+ keep_client = arvados.KeepClient(api_client=api_client)
+ with self.assertRaises(arvados.errors.KeepReadError):
+ keep_client.head('ffffffffffffffffffffffffffffffff')
+ self.assertEqual(
+ mock.responses[0].getopt(pycurl.CONNECTTIMEOUT_MS),
+ int(arvados.KeepClient.DEFAULT_TIMEOUT[0]*1000))
+ self.assertEqual(
+ mock.responses[0].getopt(pycurl.LOW_SPEED_TIME),
+ int(arvados.KeepClient.DEFAULT_TIMEOUT[1]))
+ self.assertEqual(
+ mock.responses[0].getopt(pycurl.LOW_SPEED_LIMIT),
+ int(arvados.KeepClient.DEFAULT_TIMEOUT[2]))
+
def test_proxy_get_timeout(self):
api_client = self.mock_keep_services(service_type='proxy', count=1)
force_timeout = socket.timeout("timed out")
mock.responses[0].getopt(pycurl.LOW_SPEED_LIMIT),
int(arvados.KeepClient.DEFAULT_PROXY_TIMEOUT[2]))
+ def test_proxy_head_timeout(self):
+ api_client = self.mock_keep_services(service_type='proxy', count=1)
+ force_timeout = socket.timeout("timed out")
+ with tutil.mock_keep_responses(force_timeout, 0) as mock:
+ keep_client = arvados.KeepClient(api_client=api_client)
+ with self.assertRaises(arvados.errors.KeepReadError):
+ keep_client.head('ffffffffffffffffffffffffffffffff')
+ self.assertEqual(
+ mock.responses[0].getopt(pycurl.CONNECTTIMEOUT_MS),
+ int(arvados.KeepClient.DEFAULT_PROXY_TIMEOUT[0]*1000))
+ self.assertEqual(
+ mock.responses[0].getopt(pycurl.LOW_SPEED_TIME),
+ int(arvados.KeepClient.DEFAULT_PROXY_TIMEOUT[1]))
+ self.assertEqual(
+ mock.responses[0].getopt(pycurl.LOW_SPEED_LIMIT),
+ int(arvados.KeepClient.DEFAULT_PROXY_TIMEOUT[2]))
+
def test_proxy_put_timeout(self):
api_client = self.mock_keep_services(service_type='proxy', count=1)
force_timeout = socket.timeout("timed out")
def test_get_error_with_no_services(self):
self.check_no_services_error('get', arvados.errors.KeepReadError)
+ def test_head_error_with_no_services(self):
+ self.check_no_services_error('head', arvados.errors.KeepReadError)
+
def test_put_error_with_no_services(self):
self.check_no_services_error('put', arvados.errors.KeepWriteError)
def test_get_error_reflects_last_retry(self):
self.check_errors_from_last_retry('get', arvados.errors.KeepReadError)
+ def test_head_error_reflects_last_retry(self):
+ self.check_errors_from_last_retry('head', arvados.errors.KeepReadError)
+
def test_put_error_reflects_last_retry(self):
self.check_errors_from_last_retry('put', arvados.errors.KeepWriteError)
self._test_probe_order_against_reference_set(
lambda i: self.keep_client.get(self.hashes[i], num_retries=1))
+ def test_head_probe_order_against_reference_set(self):
+ self._test_probe_order_against_reference_set(
+ lambda i: self.keep_client.head(self.hashes[i], num_retries=1))
+
def test_put_probe_order_against_reference_set(self):
# copies=1 prevents the test from being sensitive to races
# between writer threads.
with self.assertTakesGreater(self.TIMEOUT_TIME):
with self.assertRaises(arvados.errors.KeepWriteError):
kc.put(self.DATA, copies=1, num_retries=0)
+ with self.assertTakesGreater(self.TIMEOUT_TIME):
+ with self.assertRaises(arvados.errors.KeepReadError) as e:
+ kc.head(loc, num_retries=0)
def test_low_bandwidth_with_server_mid_delay_failure(self):
kc = self.keepClient()
self.assertEqual('foo', self.keepClient.get(locator))
self.assertEqual(self.gateway_roots[0]+locator,
MockCurl.return_value.getopt(pycurl.URL))
+ self.assertEqual(True, self.keepClient.head(locator))
@mock.patch('pycurl.Curl')
def test_get_with_gateway_hints_in_order(self, MockCurl):
mocks[i].getopt(pycurl.URL),
r'keep0x')
+ @mock.patch('pycurl.Curl')
+ def test_head_with_gateway_hints_in_order(self, MockCurl):
+ gateways = 4
+ disks = 3
+ mocks = [
+ tutil.FakeCurl.make(code=404, body='')
+ for _ in range(gateways+disks)
+ ]
+ MockCurl.side_effect = tutil.queue_with(mocks)
+ self.mock_disks_and_gateways(gateways=gateways, disks=disks)
+ locator = '+'.join(['acbd18db4cc2f85cedef654fccc4a4d8+3'] +
+ ['K@'+gw['uuid'] for gw in self.gateways])
+ with self.assertRaises(arvados.errors.NotFoundError):
+ self.keepClient.head(locator)
+ # Gateways are tried first, in the order given.
+ for i, root in enumerate(self.gateway_roots):
+ self.assertEqual(root+locator,
+ mocks[i].getopt(pycurl.URL))
+ # Disk services are tried next.
+ for i in range(gateways, gateways+disks):
+ self.assertRegexpMatches(
+ mocks[i].getopt(pycurl.URL),
+ r'keep0x')
+
@mock.patch('pycurl.Curl')
def test_get_with_remote_proxy_hint(self, MockCurl):
MockCurl.return_value = tutil.FakeCurl.make(
self.assertEqual('https://keep.xyzzy.arvadosapi.com/'+locator,
MockCurl.return_value.getopt(pycurl.URL))
+ @mock.patch('pycurl.Curl')
+ def test_head_with_remote_proxy_hint(self, MockCurl):
+ MockCurl.return_value = tutil.FakeCurl.make(
+ code=200, body='foo', headers={'Content-Length': 3})
+ self.mock_disks_and_gateways()
+ locator = 'acbd18db4cc2f85cedef654fccc4a4d8+3+K@xyzzy'
+ self.assertEqual(True, self.keepClient.head(locator))
+ self.assertEqual('https://keep.xyzzy.arvadosapi.com/'+locator,
+ MockCurl.return_value.getopt(pycurl.URL))
+
class KeepClientRetryTestMixin(object):
# Testing with a local Keep store won't exercise the retry behavior.
(self.DEFAULT_EXPECT, 200)):
self.check_success(locator=self.HINTED_LOCATOR)
+@tutil.skip_sleep
+class KeepClientRetryHeadTestCase(KeepClientRetryTestMixin, unittest.TestCase):
+ DEFAULT_EXPECT = True
+ DEFAULT_EXCEPTION = arvados.errors.KeepReadError
+ HINTED_LOCATOR = KeepClientRetryTestMixin.TEST_LOCATOR + '+K@xyzzy'
+ TEST_PATCHER = staticmethod(tutil.mock_keep_responses)
+
+ def run_method(self, locator=KeepClientRetryTestMixin.TEST_LOCATOR,
+ *args, **kwargs):
+ return self.new_client().head(locator, *args, **kwargs)
+
+ def test_specific_exception_when_not_found(self):
+ with tutil.mock_keep_responses(self.DEFAULT_EXPECT, 404, 200):
+ self.check_exception(arvados.errors.NotFoundError, num_retries=3)
+
+ def test_general_exception_with_mixed_errors(self):
+ # head should raise a NotFoundError only if no server confirms the
+ # block and a high threshold of servers report that it's not found.
+ # This test rigs up 50/50 disagreement between two servers, and
+ # checks that it does not become a NotFoundError.
+ client = self.new_client()
+ with tutil.mock_keep_responses(self.DEFAULT_EXPECT, 404, 500):
+ with self.assertRaises(arvados.errors.KeepReadError) as exc_check:
+ client.head(self.HINTED_LOCATOR)
+ self.assertNotIsInstance(
+ exc_check.exception, arvados.errors.NotFoundError,
+ "mixed errors raised NotFoundError")
+
+ def test_hint_server_can_succeed_without_retries(self):
+ with tutil.mock_keep_responses(self.DEFAULT_EXPECT, 404, 200, 500):
+ self.check_success(locator=self.HINTED_LOCATOR)
+
+ def test_try_next_server_after_timeout(self):
+ with tutil.mock_keep_responses(
+ (socket.timeout("timed out"), 200),
+ (self.DEFAULT_EXPECT, 200)):
+ self.check_success(locator=self.HINTED_LOCATOR)
@tutil.skip_sleep
class KeepClientRetryPutTestCase(KeepClientRetryTestMixin, unittest.TestCase):
def test_backoff_multiplier(self, sleep_mock, time_mock):
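+ # max_wait is raised so the retry loop's wait cap does not clamp the
+ # exponential growth being measured here.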
self.run_loop(5, 500, 501, 502, 503, 504, 505,
- backoff_start=5, backoff_growth=10)
+ backoff_start=5, backoff_growth=10, max_wait=1000000000)
self.check_backoff(sleep_mock, 5, 9)
def test_bz2_decompression(self):
self.check_decompression('bz2', bz2.compress)
+ def test_readline_then_readlines(self):
+ reader = self.make_newlines_reader()
+ data = reader.readline()
+ self.assertEqual('one\n', data)
+ data = reader.readlines()
+ self.assertEqual(['two\n', '\n', 'three\n', 'four\n', '\n'], data)
+
+ def test_readline_then_readall(self):
+ reader = self.make_newlines_reader()
+ data = reader.readline()
+ self.assertEqual('one\n', data)
+ self.assertEqual(''.join(['two\n', '\n', 'three\n', 'four\n', '\n']), ''.join(reader.readall()))
+
class StreamRetryTestMixin(object):
# Define reader_for(coll_name, **kwargs)
+++ /dev/null
-import arvados
-import arvados.events
-from datetime import datetime, timedelta, tzinfo
-import mock
-import Queue
-import run_test_server
-import threading
-import time
-import unittest
-
-class WebsocketTest(run_test_server.TestCaseWithServers):
- MAIN_SERVER = {}
-
- TIME_PAST = time.time()-3600
- TIME_FUTURE = time.time()+3600
-
- def setUp(self):
- self.ws = None
-
- def tearDown(self):
- if self.ws:
- self.ws.close()
- super(WebsocketTest, self).tearDown()
- run_test_server.reset()
-
- def _test_subscribe(self, poll_fallback, expect_type, start_time=None, expected=1):
- run_test_server.authorize_with('active')
- events = Queue.Queue(100)
-
- # Create ancestor before subscribing.
- # When listening with start_time in the past, this should also be retrieved.
- # However, when start_time is omitted in subscribe, this should not be fetched.
- ancestor = arvados.api('v1').humans().create(body={}).execute()
-
- filters = [['object_uuid', 'is_a', 'arvados#human']]
- if start_time:
- filters.append(['created_at', '>=', start_time])
-
- self.ws = arvados.events.subscribe(
- arvados.api('v1'), filters,
- events.put_nowait,
- poll_fallback=poll_fallback,
- last_log_id=(1 if start_time else None))
- self.assertIsInstance(self.ws, expect_type)
- self.assertEqual(200, events.get(True, 5)['status'])
- human = arvados.api('v1').humans().create(body={}).execute()
-
- log_object_uuids = []
- for i in range(0, expected):
- log_object_uuids.append(events.get(True, 5)['object_uuid'])
-
- if expected > 0:
- self.assertIn(human['uuid'], log_object_uuids)
-
- if expected > 1:
- self.assertIn(ancestor['uuid'], log_object_uuids)
-
- with self.assertRaises(Queue.Empty):
- # assertEqual just serves to show us what unexpected thing
- # comes out of the queue when the assertRaises fails; when
- # the test passes, this assertEqual doesn't get called.
- self.assertEqual(events.get(True, 2), None)
-
- def test_subscribe_websocket(self):
- self._test_subscribe(
- poll_fallback=False, expect_type=arvados.events.EventClient, expected=1)
-
- @mock.patch('arvados.events.EventClient.__init__')
- def test_subscribe_poll(self, event_client_constr):
- event_client_constr.side_effect = Exception('All is well')
- self._test_subscribe(
- poll_fallback=0.25, expect_type=arvados.events.PollClient, expected=1)
-
- def test_subscribe_websocket_with_start_time_past(self):
- self._test_subscribe(
- poll_fallback=False, expect_type=arvados.events.EventClient,
- start_time=self.localiso(self.TIME_PAST),
- expected=2)
-
- @mock.patch('arvados.events.EventClient.__init__')
- def test_subscribe_poll_with_start_time_past(self, event_client_constr):
- event_client_constr.side_effect = Exception('All is well')
- self._test_subscribe(
- poll_fallback=0.25, expect_type=arvados.events.PollClient,
- start_time=self.localiso(self.TIME_PAST),
- expected=2)
-
- def test_subscribe_websocket_with_start_time_future(self):
- self._test_subscribe(
- poll_fallback=False, expect_type=arvados.events.EventClient,
- start_time=self.localiso(self.TIME_FUTURE),
- expected=0)
-
- @mock.patch('arvados.events.EventClient.__init__')
- def test_subscribe_poll_with_start_time_future(self, event_client_constr):
- event_client_constr.side_effect = Exception('All is well')
- self._test_subscribe(
- poll_fallback=0.25, expect_type=arvados.events.PollClient,
- start_time=self.localiso(self.TIME_FUTURE),
- expected=0)
-
- def test_subscribe_websocket_with_start_time_past_utc(self):
- self._test_subscribe(
- poll_fallback=False, expect_type=arvados.events.EventClient,
- start_time=self.utciso(self.TIME_PAST),
- expected=2)
-
- def test_subscribe_websocket_with_start_time_future_utc(self):
- self._test_subscribe(
- poll_fallback=False, expect_type=arvados.events.EventClient,
- start_time=self.utciso(self.TIME_FUTURE),
- expected=0)
-
- def utciso(self, t):
- return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t))
-
- def localiso(self, t):
- return time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime(t)) + self.isotz(-time.timezone/60)
-
- def isotz(self, offset):
- """Convert minutes-east-of-UTC to ISO8601 time zone designator"""
- return '{:+03d}{:02d}'.format(offset/60, offset%60)
s.files = ["lib/arvados.rb", "lib/arvados/google_api_client.rb",
"lib/arvados/collection.rb", "lib/arvados/keep.rb",
"README", "LICENSE-2.0.txt"]
- s.required_ruby_version = '>= 2.1.0'
+ s.required_ruby_version = '>= 1.8.7'
# activesupport <4.2.6 only because https://dev.arvados.org/issues/8222
- s.add_dependency('activesupport', '>= 3.2.13', '< 4.2.6')
+ s.add_dependency('activesupport', '>= 3', '< 4.2.6')
s.add_dependency('andand', '~> 1.3', '>= 1.3.3')
- s.add_dependency('google-api-client', '~> 0.6.3', '>= 0.6.3')
+ # Our google-api-client dependency used to be < 0.9, but that could be
+ # satisfied by the buggy 0.9.pre*. https://dev.arvados.org/issues/9213
+ s.add_dependency('google-api-client', '>= 0.7', '< 0.8.9')
+ # work around undeclared dependency on i18n in some activesupport 3.x.x:
+ s.add_dependency('i18n', '~> 0')
s.add_dependency('json', '~> 1.7', '>= 1.7.7')
- s.add_runtime_dependency('jwt', '>= 0.1.5', '< 1.0.0')
+ s.add_runtime_dependency('jwt', '<2', '>= 0.1.5')
s.homepage =
'https://arvados.org'
end
:parameters => parameters,
:body_object => body,
:headers => {
- authorization: 'OAuth2 '+arvados.config['ARVADOS_API_TOKEN']
+ :authorization => 'OAuth2 '+arvados.config['ARVADOS_API_TOKEN']
})
resp = JSON.parse result.body, :symbolize_names => true
if resp[:errors]
elsif resp[:uuid] and resp[:etag]
self.new(resp)
elsif resp[:items].is_a? Array
- resp.merge(items: resp[:items].collect do |i|
+ resp.merge(:items => resp[:items].collect do |i|
self.new(i)
end)
else
end
def cp_r(source, target, source_collection=nil)
- opts = {descend_target: !source.end_with?("/")}
+ opts = {:descend_target => !source.end_with?("/")}
copy(:merge, source.chomp("/"), target, source_collection, opts)
end
end
def rm_r(source)
- remove(source, recursive: true)
+ remove(source, :recursive => true)
end
protected
modified
end
- LocatorSegment = Struct.new(:locators, :start_pos, :length)
+ Struct.new("LocatorSegment", :locators, :start_pos, :length)
class LocatorRange < Range
attr_reader :locator
end_index = search_for_byte(start_pos + length - 1, start_index)
end
seg_ranges = @ranges[start_index..end_index]
- LocatorSegment.new(seg_ranges.map(&:locator),
- start_pos - seg_ranges.first.begin,
- length)
+ Struct::LocatorSegment.new(seg_ranges.map(&:locator),
+ start_pos - seg_ranges.first.begin,
+ length)
end
private
raise ArgumentError.new "locator is nil or empty"
end
- m = LOCATOR_REGEXP.match(tok.strip)
+ m = LOCATOR_REGEXP.match(tok)
unless m
raise ArgumentError.new "not a valid locator #{tok}"
end
- tokhash, _, toksize, _, trailer = m[1..5]
+ tokhash, _, toksize, _, _, trailer = m[1..6]
tokhints = []
if trailer
trailer.split('+').each do |hint|
- if hint =~ /^[[:upper:]][[:alnum:]@_-]+$/
+ if hint =~ /^[[:upper:]][[:alnum:]@_-]*$/
tokhints.push(hint)
else
- raise ArgumentError.new "unknown hint #{hint}"
+ raise ArgumentError.new "invalid hint #{hint}"
end
end
end
[true, 'd41d8cd98f00b204e9800998ecf8427e+0', '+0','0',nil],
[true, 'd41d8cd98f00b204e9800998ecf8427e+0+Fizz+Buzz','+0','0','+Fizz+Buzz'],
[true, 'd41d8cd98f00b204e9800998ecf8427e+Fizz+Buzz', nil,nil,'+Fizz+Buzz'],
+ [true, 'd41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo', '+0','0','+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo'],
+ [true, 'd41d8cd98f00b204e9800998ecf8427e+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo', nil,nil,'+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo'],
[true, 'd41d8cd98f00b204e9800998ecf8427e+0+Z', '+0','0','+Z'],
[true, 'd41d8cd98f00b204e9800998ecf8427e+Z', nil,nil,'+Z'],
].each do |ok, locator, match2, match3, match4|
assert_equal match4, match[4]
end
end
+ define_method "test_parse_method_on_#{locator.inspect}" do
+ loc = Keep::Locator.parse locator
+ if !ok
+ assert_nil loc
+ else
+ refute_nil loc
+ assert loc.is_a?(Keep::Locator)
+ #assert loc.hash
+ #assert loc.size
+ #assert loc.hints.is_a?(Array)
+ end
+ end
end
[
[true, ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\040\n"],
[true, ". 00000000000000000000000000000000+0 0:0:0\n"],
[true, ". 00000000000000000000000000000000+0 0:0:d41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000@ffffffff\n"],
+ [true, ". d41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000@ffffffff 0:0:empty.txt\n"],
[false, '. d41d8cd98f00b204e9800998ecf8427e 0:0:abc.txt',
"Invalid manifest: does not end with newline"],
[false, "abc d41d8cd98f00b204e9800998ecf8427e 0:0:abc.txt\n",
gem 'test_after_commit', :group => :test
-gem 'google-api-client', '~> 0.6.3'
gem 'trollop'
gem 'faye-websocket'
activemodel (>= 3.0.0)
activesupport (>= 3.0.0)
rack (>= 1.1.0)
- addressable (2.3.8)
+ addressable (2.4.0)
andand (1.3.3)
arel (3.0.3)
- arvados (0.1.20150615153458)
- activesupport (>= 3.2.13)
+ arvados (0.1.20160420143004)
+ activesupport (>= 3, < 4.2.6)
andand (~> 1.3, >= 1.3.3)
- google-api-client (~> 0.6.3, >= 0.6.3)
+ google-api-client (>= 0.7, < 0.9)
+ i18n (~> 0)
json (~> 1.7, >= 1.7.7)
- jwt (>= 0.1.5, < 1.0.0)
- arvados-cli (0.1.20151207150126)
+ jwt (>= 0.1.5, < 2)
+ arvados-cli (0.1.20160503204200)
activesupport (~> 3.2, >= 3.2.13)
andand (~> 1.3, >= 1.3.3)
arvados (~> 0.1, >= 0.1.20150128223554)
curb (~> 0.8)
- google-api-client (~> 0.6.3, >= 0.6.3)
+ google-api-client (~> 0.6, >= 0.6.3, < 0.9)
json (~> 1.7, >= 1.7.7)
- jwt (>= 0.1.5, < 1.0.0)
oj (~> 2.0, >= 2.0.3)
trollop (~> 2.0)
autoparse (0.3.3)
coffee-script-source
execjs
coffee-script-source (1.7.0)
- curb (0.8.8)
+ curb (0.9.3)
daemon_controller (1.2.0)
database_cleaner (1.2.0)
erubis (2.7.0)
factory_girl_rails (4.4.1)
factory_girl (~> 4.4.0)
railties (>= 3.0.0)
- faraday (0.8.9)
- multipart-post (~> 1.2.0)
+ faraday (0.9.2)
+ multipart-post (>= 1.2, < 3)
faye-websocket (0.7.2)
eventmachine (>= 0.12.0)
websocket-driver (>= 0.3.1)
- google-api-client (0.6.4)
+ google-api-client (0.7.1)
addressable (>= 2.3.2)
autoparse (>= 0.3.3)
extlib (>= 0.9.15)
- faraday (~> 0.8.4)
+ faraday (>= 0.9.0)
jwt (>= 0.1.5)
launchy (>= 2.1.1)
multi_json (>= 1.0.0)
- signet (~> 0.4.5)
+ retriable (>= 1.4)
+ signet (>= 0.5.0)
uuidtools (>= 2.1.0)
hashie (1.2.0)
highline (1.6.21)
mime-types (1.25.1)
mocha (1.1.0)
metaclass (~> 0.0.1)
- multi_json (1.11.1)
- multipart-post (1.2.0)
+ multi_json (1.12.0)
+ multipart-post (2.0.0)
net-scp (1.2.0)
net-ssh (>= 2.6.5)
net-sftp (2.1.2)
jwt (~> 0.1.4)
multi_json (~> 1.0)
rack (~> 1.2)
- oj (2.11.4)
+ oj (2.15.0)
omniauth (1.1.1)
hashie (~> 1.2)
rack
rdoc (3.12.2)
json (~> 1.4)
ref (1.0.5)
+ retriable (2.1.0)
ruby-prof (0.15.2)
rvm-capistrano (1.5.1)
capistrano (~> 2.15.4)
railties (~> 3.2.0)
sass (>= 3.1.10)
tilt (~> 1.3)
- signet (0.4.5)
+ signet (0.5.1)
addressable (>= 2.2.3)
- faraday (~> 0.8.1)
+ faraday (>= 0.9.0.rc5)
jwt (>= 0.1.5)
multi_json (>= 1.0.0)
simplecov (0.7.1)
treetop (1.4.15)
polyglot
polyglot (>= 0.3.1)
- trollop (2.1.1)
+ trollop (2.1.2)
tzinfo (0.3.39)
uglifier (2.5.0)
execjs (>= 0.3.0)
database_cleaner
factory_girl_rails
faye-websocket
- google-api-client (~> 0.6.3)
jquery-rails
mocha
multi_json
uglifier (>= 1.0.3)
BUNDLED WITH
- 1.10.6
+ 1.12.1
return @attrs if @attrs
@attrs = params[resource_name]
if @attrs.is_a? String
- @attrs = Oj.load @attrs, symbol_keys: true
+ @attrs = Oj.strict_load @attrs, symbol_keys: true
end
unless @attrs.is_a? Hash
message = "No #{resource_name}"
def load_json_value(hash, key, must_be_class=nil)
if hash[key].is_a? String
- hash[key] = Oj.load(hash[key], symbol_keys: false)
+ hash[key] = Oj.strict_load(hash[key], symbol_keys: false)
if must_be_class and !hash[key].is_a? must_be_class
raise TypeError.new("parameter #{key.to_s} must be a #{must_be_class.to_s}")
end
class Arvados::V1::ApiClientAuthorizationsController < ApplicationController
accept_attribute_as_json :scopes, Array
- before_filter :current_api_client_is_trusted
+ before_filter :current_api_client_is_trusted, :except => [:current]
before_filter :admin_required, :only => :create_system_auth
- skip_before_filter :render_404_if_no_object, :only => :create_system_auth
+ skip_before_filter :render_404_if_no_object, :only => [:create_system_auth, :current]
+ skip_before_filter :find_object_by_uuid, :only => [:create_system_auth, :current]
def self._create_system_auth_requires_parameters
{
new(user_id: system_user.id,
api_client_id: params[:api_client_id] || current_api_client.andand.id,
created_by_ip_address: remote_ip,
- scopes: Oj.load(params[:scopes] || '["all"]'))
+ scopes: Oj.strict_load(params[:scopes] || '["all"]'))
@object.save!
show
end
super
end
+ def current
+ @object = Thread.current[:api_client_authorization]
+ show
+ end
+
protected
def default_orders
val.is_a?(String) && (attr == 'uuid' || attr == 'api_token')
}
end
- @objects = model_class.
- includes(:user, :api_client).
- where('user_id=?', current_user.id)
- super
- wanted_scopes.compact.each do |scope_list|
- sorted_scopes = scope_list.sort
- @objects = @objects.select { |auth| auth.scopes.sort == sorted_scopes }
+ @objects = model_class.where('user_id=?', current_user.id)
+ if wanted_scopes.compact.any?
+ # We can't filter on scopes effectively using AR/postgres.
+ # Instead we get the entire result set, do our own filtering on
+ # scopes to get a list of UUIDs, then start a new query
+ # (restricted to the selected UUIDs) so super can apply the
+ # offset/limit/order params in the usual way.
+ @request_limit = @limit
+ @request_offset = @offset
+ @limit = @objects.count
+ @offset = 0
+ super
+ wanted_scopes.compact.each do |scope_list|
+ sorted_scopes = scope_list.sort
+ @objects = @objects.select { |auth| auth.scopes.sort == sorted_scopes }
+ end
+ @limit = @request_limit
+ @offset = @request_offset
+ @objects = model_class.where('uuid in (?)', @objects.collect(&:uuid))
end
+ super
end
def find_object_by_uuid
# The @filters test here also prevents a non-trusted token from
# filtering on its own scopes, and discovering whether any _other_
# equally scoped tokens exist (403=yes, 200=no).
- if (@objects.andand.count == 1 and
- @objects.first.uuid == current_api_client_authorization.andand.uuid and
+ return forbidden if !@objects
+ full_set = @objects.except(:limit).except(:offset) if @objects
+ if (full_set.count == 1 and
+ full_set.first.uuid == current_api_client_authorization.andand.uuid and
(@filters.map(&:first) & %w(uuid api_token)).any?)
return true
end
accept_attribute_as_json :runtime_constraints, Hash
accept_attribute_as_json :command, Array
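+ # Return the ApiClientAuthorization record assigned to this container.
+ # Only the dispatcher token holding the lock may retrieve it.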
+ def auth
+ if @object.locked_by_uuid != Thread.current[:api_client_authorization].uuid
+ raise ArvadosModel::PermissionDeniedError.new("Not locked by your token")
+ end
+ @object = @object.auth
+ show
+ end
+
+ # Updates use row locking to resolve races between multiple
+ # dispatchers trying to lock the same container.
+ def update
+ @object.with_lock do
+ super
+ end
+ end
end
class Arvados::V1::JobsController < ApplicationController
+ accept_attribute_as_json :components, Hash
accept_attribute_as_json :script_parameters, Hash
accept_attribute_as_json :runtime_constraints, Hash
accept_attribute_as_json :tasks_summary, Hash
end
end
end
- job_queue = Job.queue
+ job_queue = Job.queue.select(:uuid)
n_queued_before_me = 0
job_queue.each do |j|
break if j.uuid == @job.uuid
yield "#{db_current_time}" \
" job #{@job.uuid}" \
" queue_position #{n_queued_before_me}" \
- " queue_size #{job_queue.size}" \
+ " queue_size #{job_queue.count}" \
" nodes_idle #{nodes_in_state[:idle]}" \
" nodes_alloc #{nodes_in_state[:alloc]}\n"
last_ack_at = db_current_time
supplied_token =
params["api_token"] ||
params["oauth_token"] ||
- env["HTTP_AUTHORIZATION"].andand.match(/OAuth2 ([a-z0-9]+)/).andand[1]
+ env["HTTP_AUTHORIZATION"].andand.match(/OAuth2 ([a-zA-Z0-9]+)/).andand[1]
if supplied_token
api_client_auth = ApiClientAuthorization.
includes(:api_client, :user).
end
timestamp_hex = timestamp.to_s(16)
# => "53163cb4"
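+ # The configured TTL (hex) is folded into the signed payload, so
+ # signatures only verify when the API server and keepstore agree on
+ # blob_signature_ttl.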
+ blob_signature_ttl = Rails.configuration.blob_signature_ttl.to_s(16)
# Generate a signature.
signature =
generate_signature((opts[:key] or Rails.configuration.blob_signing_key),
- blob_hash, opts[:api_token], timestamp_hex)
+ blob_hash, opts[:api_token], timestamp_hex, blob_signature_ttl)
blob_locator + '+A' + signature + '@' + timestamp_hex
end
if timestamp.to_i(16) < (opts[:now] or db_current_time.to_i)
raise Blob::InvalidSignatureError.new 'Signature expiry time has passed.'
end
+ blob_signature_ttl = Rails.configuration.blob_signature_ttl.to_s(16)
my_signature =
generate_signature((opts[:key] or Rails.configuration.blob_signing_key),
- blob_hash, opts[:api_token], timestamp)
+ blob_hash, opts[:api_token], timestamp, blob_signature_ttl)
if my_signature != given_signature
raise Blob::InvalidSignatureError.new 'Signature is invalid.'
true
end
- def self.generate_signature key, blob_hash, api_token, timestamp
+ def self.generate_signature key, blob_hash, api_token, timestamp, blob_signature_ttl
OpenSSL::HMAC.hexdigest('sha1', key,
[blob_hash,
api_token,
- timestamp].join('@'))
+ timestamp,
+ blob_signature_ttl].join('@'))
end
end
validates :command, :container_image, :output_path, :cwd, :priority, :presence => true
validate :validate_state_change
validate :validate_change
+ validate :validate_lock
+ after_validation :assign_auth
after_save :handle_completed
has_many :container_requests, :foreign_key => :container_uuid, :class_name => 'ContainerRequest', :primary_key => :uuid
+ belongs_to :auth, :class_name => 'ApiClientAuthorization', :foreign_key => :auth_uuid, :primary_key => :uuid
api_accessible :user, extend: :common do |t|
t.add :command
t.add :environment
t.add :exit_code
t.add :finished_at
+ t.add :locked_by_uuid
t.add :log
t.add :mounts
t.add :output
t.add :runtime_constraints
t.add :started_at
t.add :state
+ t.add :auth_uuid
end
# Supported states for a container
States =
[
(Queued = 'Queued'),
+ (Locked = 'Locked'),
(Running = 'Running'),
(Complete = 'Complete'),
(Cancelled = 'Cancelled')
State_transitions = {
nil => [Queued],
- Queued => [Running, Cancelled],
+ Queued => [Locked, Cancelled],
+ Locked => [Queued, Running, Cancelled],
Running => [Complete, Cancelled]
}
end
def update_priority!
- if [Queued, Running].include? self.state
+ if [Queued, Locked, Running].include? self.state
# Update the priority of this container to the maximum priority of any of
# its committed container requests and save the record.
- max = 0
- ContainerRequest.where(container_uuid: uuid).each do |cr|
- if cr.state == ContainerRequest::Committed and cr.priority > max
- max = cr.priority
- end
- end
- self.priority = max
+ self.priority = ContainerRequest.
+ where(container_uuid: uuid,
+ state: ContainerRequest::Committed).
+ maximum('priority') || 0 # maximum() is nil when no Committed requests remain
self.save!
end
end
end
def validate_change
- permitted = []
+ permitted = [:state]
if self.new_record?
- permitted.push :owner_uuid, :command, :container_image, :cwd, :environment,
- :mounts, :output_path, :priority, :runtime_constraints, :state
+ permitted.push(:owner_uuid, :command, :container_image, :cwd,
+ :environment, :mounts, :output_path, :priority,
+ :runtime_constraints)
end
case self.state
- when Queued
- # permit priority change only.
+ when Queued, Locked
permitted.push :priority
when Running
+ permitted.push :priority, :progress
if self.state_changed?
- # At point of state change, can set state and started_at
- permitted.push :state, :started_at
- else
- # While running, can update priority and progress.
- permitted.push :priority, :progress
+ permitted.push :started_at
end
when Complete
- if self.state_changed?
- permitted.push :state, :finished_at, :output, :log, :exit_code
- else
- errors.add :state, "cannot update record"
+ if self.state_was == Running
+ permitted.push :finished_at, :output, :log, :exit_code
end
when Cancelled
- if self.state_changed?
- if self.state_was == Running
- permitted.push :state, :finished_at, :output, :log
- elsif self.state_was == Queued
- permitted.push :state, :finished_at
- end
- else
- errors.add :state, "cannot update record"
+ case self.state_was
+ when Running
+ permitted.push :finished_at, :output, :log
+ when Queued, Locked
+ permitted.push :finished_at
end
else
- errors.add :state, "invalid state"
+ # The state_transitions check will add an error message for this
+ return false
end
check_update_whitelist permitted
end
+ def validate_lock
+ # If the Container is already locked by someone other than the
+ # current api_client_auth, disallow all changes -- except
+ # priority, which needs to change to reflect max(priority) of
+ # relevant ContainerRequests.
+ if locked_by_uuid_was
+ if locked_by_uuid_was != Thread.current[:api_client_authorization].uuid
+ check_update_whitelist [:priority]
+ end
+ end
+
+ if [Locked, Running].include? self.state
+ # If the Container was already locked, locked_by_uuid must not
+ # change. Otherwise, the current auth gets the lock.
+ need_lock = locked_by_uuid_was || Thread.current[:api_client_authorization].uuid
+ else
+ need_lock = nil
+ end
+
+ # The caller can provide a new value for locked_by_uuid, but only
+ # if it's exactly what we expect. This allows a caller to perform
+ # an update like {"state":"Queued","locked_by_uuid":null}.
+ if self.locked_by_uuid_changed?
+ if self.locked_by_uuid != need_lock
+ return errors.add :locked_by_uuid, "can only change to #{need_lock}"
+ end
+ end
+ self.locked_by_uuid = need_lock
+ end
+
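+ # Make sure a Locked or Running container has an ApiClientAuthorization
+ # (created for the user recorded on the highest-priority request), and
+ # expire that token once the container leaves those states.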
+ def assign_auth
+ if self.auth_uuid_changed?
+ return errors.add :auth_uuid, 'is readonly'
+ end
+ if not [Locked, Running].include? self.state
+ # don't need one
+ self.auth.andand.update_attributes(expires_at: db_current_time)
+ self.auth = nil
+ return
+ elsif self.auth
+ # already have one
+ return
+ end
+ cr = ContainerRequest.
+ where('container_uuid=? and priority>0', self.uuid).
+ order('priority desc').
+ first
+ if !cr
+ return errors.add :auth_uuid, "cannot be assigned because priority <= 0"
+ end
+ self.auth = ApiClientAuthorization.
+ create!(user_id: User.find_by_uuid(cr.modified_by_user_uuid).id,
+ api_client_id: 0)
+ end
+
def handle_completed
# This container is finished, so finalize any container requests
# associated with this container.
self.cwd ||= "."
end
- # Turn a container request into a container.
+ # Create a new container (or find an existing one) to satisfy this
+ # request.
def resolve
- # In the future this will do things like resolve symbolic git and keep
- # references to content addresses.
- Container.create!({ :command => self.command,
- :container_image => self.container_image,
- :cwd => self.cwd,
- :environment => self.environment,
- :mounts => self.mounts,
- :output_path => self.output_path,
- :runtime_constraints => self.runtime_constraints })
+ # TODO: resolve symbolic git and keep references to content
+ # addresses.
+ c = act_as_system_user do
+ Container.create!(command: self.command,
+ container_image: self.container_image,
+ cwd: self.cwd,
+ environment: self.environment,
+ mounts: self.mounts,
+ output_path: self.output_path,
+ runtime_constraints: self.runtime_constraints)
+ end
+ self.container_uuid = c.uuid
end
def set_container
- if self.container_uuid_changed?
- if not current_user.andand.is_admin and not self.container_uuid.nil?
- errors.add :container_uuid, "can only be updated to nil."
- end
- else
- if self.state_changed?
- if self.state == Committed and (self.state_was == Uncommitted or self.state_was.nil?)
- act_as_system_user do
- self.container_uuid = self.resolve.andand.uuid
- end
- end
- end
+ if (container_uuid_changed? and
+ not current_user.andand.is_admin and
+ not container_uuid.nil?)
+ errors.add :container_uuid, "can only be updated to nil."
+ return false
+ end
+ if state_changed? and state == Committed and container_uuid.nil?
+ resolve
end
end
end
def update_priority
- if [Committed, Final].include? self.state and (self.state_changed? or
- self.priority_changed? or
- self.container_uuid_changed?)
- [self.container_uuid_was, self.container_uuid].each do |cuuid|
- unless cuuid.nil?
- c = Container.find_by_uuid cuuid
- act_as_system_user do
- c.update_priority!
- end
- end
+ if self.state_changed? or
+ self.priority_changed? or
+ self.container_uuid_changed?
+ act_as_system_user do
+ Container.
+ where('uuid in (?)',
+ [self.container_uuid_was, self.container_uuid].compact).
+ map(&:update_priority!)
end
end
end
def invalidate_permissions_cache
# Ensure a new group can be accessed by the appropriate users
# immediately after being created.
- User.invalidate_permissions_cache
+ User.invalidate_permissions_cache db_current_time.to_i
end
def assign_name
include HasUuid
include KindAndEtag
include CommonApiTemplate
+ serialize :components, Hash
attr_protected :arvados_sdk_version, :docker_image_locator
serialize :script_parameters, Hash
serialize :runtime_constraints, Hash
t.add :queue_position
t.add :node_uuids
t.add :description
+ t.add :components
end
# Supported states for a job
end
def queue_position
- Job::queue.each_with_index do |job, index|
- if job[:uuid] == self.uuid
- return index
- end
- end
- nil
+ # We used to report this accurately, but the implementation made queue
+ # API requests O(n**2) in the size of the queue. See #8800.
+ # We've soft-disabled it because it's not clear we even want this
+ # functionality: now that we have Node Manager with support for multiple
+ # node sizes, "queue position" tells you very little about when a job will
+ # run.
+ state == Queued ? 0 : nil
end
def self.running
end
def lock locked_by_uuid
- transaction do
- self.reload
+ with_lock do
unless self.state == Queued and self.is_locked_by_uuid.nil?
raise AlreadyLockedError
end
output_changed? or
log_changed? or
tasks_summary_changed? or
- state_changed?
+ state_changed? or
+ components_changed?
logger.warn "User #{current_user.uuid if current_user} tried to change protected job attributes on locked #{self.class.to_s} #{uuid_was}"
return false
end
# permissions for head_uuid and tail_uuid, and invalidate the
# cache for only those users. (This would require a browseable
# cache.)
- User.invalidate_permissions_cache
+ User.invalidate_permissions_cache db_current_time.to_i
end
end
true
end
- def self.invalidate_permissions_cache
- Rails.cache.delete_matched(/^groups_for_user_/)
+ def self.invalidate_permissions_cache(timestamp=nil)
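+ # With async_permissions_update enabled, wake the background
+ # permission-updater process via Postgres NOTIFY instead of clearing
+ # the cache inline.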
+ if Rails.configuration.async_permissions_update
+ timestamp = DbCurrentTime::db_current_time.to_i if timestamp.nil?
+ connection.execute "NOTIFY invalidate_permissions_cache, '#{timestamp}'"
+ else
+ Rails.cache.delete_matched(/^groups_for_user_/)
+ end
end
# Return a hash of {group_uuid: perm_hash} where perm_hash[:read]
# The permission graph is built by repeatedly enumerating all
# permission links reachable from self.uuid, and then calling
# search_permissions
- def group_permissions
- Rails.cache.fetch "groups_for_user_#{self.uuid}" do
+ def calculate_group_permissions
permissions_from = {}
todo = {self.uuid => true}
done = {}
end
end
end
- search_permissions(self.uuid, permissions_from)
+ perms = search_permissions(self.uuid, permissions_from)
+ Rails.cache.write "groups_for_user_#{self.uuid}", perms
+ perms
+ end
+
+ # Return a hash of {group_uuid: perm_hash} where perm_hash[:read]
+ # and perm_hash[:write] are true if this user can read and write
+ # objects owned by group_uuid.
+ def group_permissions
+ r = Rails.cache.read "groups_for_user_#{self.uuid}"
+ if r.nil?
+ if Rails.configuration.async_permissions_update
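+ # Block until the background permission-updater has populated the cache.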
+ while r.nil?
+ sleep(0.1)
+ r = Rails.cache.read "groups_for_user_#{self.uuid}"
+ end
+ else
+ r = calculate_group_permissions
+ end
end
+ r
end
def self.setup(user, openid_prefix, repo_name=nil, vm_uuid=nil)
# generate permission signatures for Keep locators. It must be
# identical to the permission key given to Keep. IMPORTANT: This is
# a site secret. It should be at least 50 characters.
+ #
+ # Modifying blob_signing_key will invalidate all existing
+ # signatures, which can cause programs to fail (e.g., arv-put,
+ # arv-get, and Crunch jobs). To avoid errors, rotate keys only when
+ # no such processes are running.
blob_signing_key: ~
# These settings are provided by your OAuth2 provider (e.g.,
# still has permission) the client can retrieve the collection again
# to get fresh signatures.
#
- # Datamanager considers an unreferenced block older than this to be
- # eligible for garbage collection. Therefore, it should never be
- # smaller than the corresponding value used by any local keepstore
- # service (see keepstore -blob-signature-ttl flag). This rule
- # prevents datamanager from trying to garbage-collect recently
- # written blocks while clients are still holding valid signatures.
+ # This must be exactly equal to the -blob-signature-ttl flag used by
+ # keepstore servers. Otherwise, reading data blocks and saving
+ # collections will fail with HTTP 403 permission errors.
+ #
+ # Modifying blob_signature_ttl invalidates existing signatures; see
+ # blob_signing_key note above.
#
# The default is 2 weeks.
blob_signature_ttl: 1209600
crunch_log_partial_line_throttle_period: 5
+ # Enable asynchronous permission graph rebuild. Must run
+ # script/permission-updater.rb as a separate process. When the permission
+ # cache is invalidated, the background process will update the permission
+ # graph cache. This feature is experimental!
+ async_permissions_update: false
+
development:
force_ssl: false
cache_classes: false
--- /dev/null
+module URI
+ if Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.2')
+ # Rack uses the standard library method URI.decode_www_form_component to
+ # process parameters. This method first validates the string with a
+ # regular expression, and then decodes it using another regular expression.
+ # Ruby 2.1 and earlier have a bug in the validation; the regular
+ # expression that is used generates many backtracking points, which results
+ # in exponential memory growth when matching large strings. The fix is to
+ # monkey-patch the version of the method from Ruby 2.2 which checks that
+ # the string is not invalid instead of checking it is valid.
+ def self.decode_www_form_component(str, enc=Encoding::UTF_8)
+ raise ArgumentError, "invalid %-encoding (#{str})" if /%(?!\h\h)/ =~ str
+ str.b.gsub(/\+|%\h\h/, TBLDECWWWCOMP_).force_encoding(enc)
+ end
+ end
+end
namespace :v1 do
resources :api_client_authorizations do
post 'create_system_auth', on: :collection
+ get 'current', on: :collection
end
resources :api_clients
resources :authorized_keys
end
resources :humans
resources :job_tasks
- resources :containers
+ resources :containers do
+ get 'auth', on: :member
+ end
resources :container_requests
resources :jobs do
get 'queue', on: :collection
--- /dev/null
+class AddComponentsToJob < ActiveRecord::Migration
+ def up
+ add_column :jobs, :components, :text
+ end
+
+ def down
+ if column_exists?(:jobs, :components)
+ remove_column :jobs, :components
+ end
+ end
+end
--- /dev/null
+class AddAuthsToContainer < ActiveRecord::Migration
+ def change
+ add_column :containers, :auth_uuid, :string
+ add_column :containers, :locked_by_uuid, :string
+ end
+end
--- /dev/null
+class AddAuthAndLockToContainerIndex < ActiveRecord::Migration
+ Columns_were = ["uuid", "owner_uuid", "modified_by_client_uuid", "modified_by_user_uuid", "state", "log", "cwd", "output_path", "output", "container_image"]
+ Columns = Columns_were + ["auth_uuid", "locked_by_uuid"]
+ def up
+ begin
+ remove_index :containers, :name => 'containers_search_index'
+ rescue
+ end
+ add_index(:containers, Columns, name: "containers_search_index")
+ end
+
+ def down
+ begin
+ remove_index :containers, :name => 'containers_search_index'
+ rescue
+ end
+ add_index(:containers, Columns_were, name: "containers_search_index")
+ end
+end
progress double precision,
priority integer,
updated_at timestamp without time zone NOT NULL,
- exit_code integer
+ exit_code integer,
+ auth_uuid character varying(255),
+ locked_by_uuid character varying(255)
);
priority integer DEFAULT 0 NOT NULL,
description character varying(524288),
state character varying(255),
- arvados_sdk_version character varying(255)
+ arvados_sdk_version character varying(255),
+ components text
);
-- Name: containers_search_index; Type: INDEX; Schema: public; Owner: -; Tablespace:
--
-CREATE INDEX containers_search_index ON containers USING btree (uuid, owner_uuid, modified_by_client_uuid, modified_by_user_uuid, state, log, cwd, output_path, output, container_image);
+CREATE INDEX containers_search_index ON containers USING btree (uuid, owner_uuid, modified_by_client_uuid, modified_by_user_uuid, state, log, cwd, output_path, output, container_image, auth_uuid, locked_by_uuid);
--
INSERT INTO schema_migrations (version) VALUES ('20160208210629');
-INSERT INTO schema_migrations (version) VALUES ('20160209155729');
\ No newline at end of file
+INSERT INTO schema_migrations (version) VALUES ('20160209155729');
+
+INSERT INTO schema_migrations (version) VALUES ('20160324144017');
+
+INSERT INTO schema_migrations (version) VALUES ('20160506175108');
+
+INSERT INTO schema_migrations (version) VALUES ('20160509143250');
\ No newline at end of file
end
def act_as_user user
+ #auth_was = Thread.current[:api_client_authorization]
user_was = Thread.current[:user]
Thread.current[:user] = user
+ #Thread.current[:api_client_authorization] = ApiClientAuthorization.
+ # where('user_id=? and scopes is null', user.id).
+ # order('expires_at desc').
+ # first
begin
yield
ensure
Thread.current[:user] = user_was
+ #Thread.current[:api_client_authorization] = auth_was
end
end
begin
begin
# Parse event data as JSON
- p = (Oj.load event.data).symbolize_keys
+ p = (Oj.strict_load event.data).symbolize_keys
filter = Filter.new(p)
rescue Oj::Error => e
ws.send ({status: 400, message: "malformed request"}.to_json)
@where = params[:where]
elsif params[:where].is_a? String
begin
- @where = Oj.load(params[:where])
+ @where = Oj.strict_load(params[:where])
raise unless @where.is_a? Hash
rescue
raise ArgumentError.new("Could not parse \"where\" param as an object")
@filters += params[:filters]
elsif params[:filters].is_a? String and !params[:filters].empty?
begin
- f = Oj.load params[:filters]
+ f = Oj.strict_load params[:filters]
if not f.nil?
raise unless f.is_a? Array
@filters += f
(case params[:order]
when String
if params[:order].starts_with? '['
- od = Oj.load(params[:order])
+ od = Oj.strict_load(params[:order])
raise unless od.is_a? Array
od
else
@select = params[:select]
when String
begin
- @select = Oj.load params[:select]
+ @select = Oj.strict_load params[:select]
raise unless @select.is_a? Array or @select.nil?
rescue
raise ArgumentError.new("Could not parse \"select\" param as an array")
end
end
cond_out << cond.join(' OR ')
+ else
+ raise ArgumentError.new("Invalid operator '#{operator}'")
end
end
conds_out << cond_out.join(' OR ') if cond_out.any?
def check_update_whitelist permitted_fields
attribute_names.each do |field|
if not permitted_fields.include? field.to_sym and self.send((field.to_s + "_changed?").to_sym)
- errors.add field, "illegal update of field"
+ errors.add field, "cannot be modified in this state"
end
end
end
def validate_state_change
if self.state_changed?
unless state_transitions[self.state_was].andand.include? self.state
- errors.add :state, "invalid state change from #{self.state_was} to #{self.state}"
+ errors.add :state, "cannot change from #{self.state_was} to #{self.state}"
return false
end
end
--- /dev/null
+#!/usr/bin/env ruby
+
+ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
+require File.dirname(__FILE__) + '/../config/boot'
+require File.dirname(__FILE__) + '/../config/environment'
+include DbCurrentTime
+
+def update_permissions
+ timestamp = DbCurrentTime::db_current_time.to_i
+ Rails.logger.info "Begin updating permission cache"
+ User.all.each do |u|
+ u.calculate_group_permissions
+ end
+ Rails.cache.write "last_updated_permissions", timestamp
+ Rails.logger.info "Permission cache updated"
+end
+
+ActiveRecord::Base.connection_pool.with_connection do |connection|
+ conn = connection.instance_variable_get(:@connection)
+ begin
+ conn.async_exec "LISTEN invalidate_permissions_cache"
+
+ # Initial refresh of permissions graph
+ update_permissions
+
+ while true
+ # wait_for_notify will block until there is a change
+ # notification from Postgres about the permission cache,
+ # and then rebuild the permission cache.
+ conn.wait_for_notify do |channel, pid, payload|
+ last_updated = Rails.cache.read("last_updated_permissions")
+ Rails.logger.info "Got notify #{payload} last update #{last_updated}"
+ if last_updated.nil? || last_updated.to_i <= payload.to_i
+ update_permissions
+ end
+ end
+ end
+ ensure
+ # Don't want the connection to still be listening once we return
+ # it to the pool - could result in weird behavior for the next
+ # thread to check it out.
+ conn.async_exec "UNLISTEN *"
+ end
+end
api_token: e99512cdc0f3415c2428b9758f33bdfb07bc3561b00e86e7e6
expires_at: 2038-01-01 00:00:00
+job_reader2:
+ uuid: zzzzz-gj3su-jobreader2auth1
+ api_client: untrusted
+ user: job_reader2
+ api_token: jobreader2415c2428b9758f33bdfb07bc3561b0jobreader2
+ expires_at: 2038-01-01 00:00:00
+
active_no_prefs:
uuid: zzzzz-gj3su-307z32aux8dg2s1
api_client: untrusted
api_token: 4nagbkv8eap0uok7pxm72nossq5asihls3yn5p4xmvqx5t5e7p
expires_at: 2038-01-01 00:00:00
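+# Token used by functional tests that act as a container dispatcher.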
+dispatch1:
+ uuid: zzzzz-gj3su-k9dvestay1plssr
+ api_client: untrusted
+ user: system_user
+ api_token: kwi8oowusvbutahacwk2geulqewy5oaqmpalczfna4b6bb0hfw
+ expires_at: 2038-01-01 00:00:00
--- /dev/null
+queued:
+ owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+ state: Committed
+ priority: 1
+ created_at: 2016-01-11 11:11:11.111111111 Z
+ updated_at: 2016-01-11 11:11:11.111111111 Z
+ modified_at: 2016-01-11 11:11:11.111111111 Z
+ modified_by_user_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+ container_image: test
+ cwd: test
+ output_path: test
+ command: ["echo", "hello"]
+ container_uuid: zzzzz-dz642-queuedcontainer
queued:
uuid: zzzzz-dz642-queuedcontainer
- owner_uuid: zzzzz-tpzed-d9tiejq69daie8f
+ owner_uuid: zzzzz-tpzed-000000000000000
state: Queued
priority: 1
created_at: 2016-01-11 11:11:11.111111111 Z
output: test
output_path: test
command: ["echo", "hello"]
+ runtime_constraints:
+ ram: 12000000000
+ vcpus: 4
completed:
uuid: zzzzz-dz642-compltcontainer
- owner_uuid: zzzzz-tpzed-d9tiejq69daie8f
+ owner_uuid: zzzzz-tpzed-000000000000000
state: Complete
priority: 1
created_at: 2016-01-11 11:11:11.111111111 Z
output: test
output_path: test
command: ["echo", "hello"]
+ runtime_constraints:
+ ram: 12000000000
+ vcpus: 4
--- /dev/null
+running_job_task_1:
+ uuid: zzzzz-ot0gb-runningjobtask1
+ owner_uuid: zzzzz-j7d0g-v955i6s2oi1cbso
+ created_at: <%= 3.minute.ago.to_s(:db) %>
+ job_uuid: zzzzz-8i9sb-with2components
+
+running_job_task_2:
+ uuid: zzzzz-ot0gb-runningjobtask2
+ owner_uuid: zzzzz-j7d0g-v955i6s2oi1cbso
+ created_at: <%= 3.minute.ago.to_s(:db) %>
+ job_uuid: zzzzz-8i9sb-with2components
log: zzzzz-4zz18-fy296fx3hot09f7
output: zzzzz-4zz18-bv31uwvy3neko21
+running_job_with_components:
+ uuid: zzzzz-8i9sb-with2components
+ owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+ cancelled_at: ~
+ cancelled_by_user_uuid: ~
+ cancelled_by_client_uuid: ~
+ created_at: <%= 3.minute.ago.to_s(:db) %>
+ started_at: <%= 3.minute.ago.to_s(:db) %>
+ finished_at: ~
+ script: hash
+ repository: active/foo
+ script_version: 1de84a854e2b440dc53bf42f8548afa4c17da332
+ running: true
+ success: ~
+ output: ~
+ priority: 0
+ log: ~
+ is_locked_by_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+ tasks_summary:
+ failed: 0
+ todo: 3
+ running: 1
+ done: 1
+ runtime_constraints: {}
+ state: Running
+ components:
+ component1: zzzzz-8i9sb-jyq01m7in1jlofj
+ component2: zzzzz-d1hrv-partdonepipelin
+
# Test Helper trims the rest of the file
# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
tail_uuid: zzzzz-tpzed-905b42d1dd4a354
head_uuid: zzzzz-s0uqq-382brsig8rp3666
+job_reader2_can_read_job_with_components:
+ # Permission link giving job_reader2 permission
+ # to read running_job_with_components
+ uuid: zzzzz-o0j2j-jobcomps4jobrdr
+ owner_uuid: zzzzz-tpzed-000000000000000
+ created_at: 2014-06-13 20:42:26 -0800
+ modified_by_client_uuid: zzzzz-tpzed-000000000000000
+ modified_by_user_uuid: zzzzz-tpzed-000000000000000
+ modified_at: 2014-06-13 20:42:26 -0800
+ updated_at: 2014-06-13 20:42:26 -0800
+ link_class: permission
+ name: can_read
+ tail_uuid: zzzzz-tpzed-readjobwithcomp
+ head_uuid: zzzzz-8i9sb-with2components
+
+job_reader2_can_read_pipeline_from_job_with_components:
+ # Permission link giving job_reader2 permission to read the pipeline
+ # referenced as a component of running_job_with_components
+ uuid: zzzzz-o0j2j-pi4comps4jobrdr
+ owner_uuid: zzzzz-tpzed-000000000000000
+ created_at: 2014-06-13 20:42:26 -0800
+ modified_by_client_uuid: zzzzz-tpzed-000000000000000
+ modified_by_user_uuid: zzzzz-tpzed-000000000000000
+ modified_at: 2014-06-13 20:42:26 -0800
+ updated_at: 2014-06-13 20:42:26 -0800
+ link_class: permission
+ name: can_read
+ tail_uuid: zzzzz-tpzed-readjobwithcomp
+ head_uuid: zzzzz-d1hrv-partdonepipelin
+
+job_reader2_can_read_first_job_from_pipeline_from_job_with_components:
+ # Permission link giving job_reader2 permission to read the first job
+ # in the pipeline referenced by running_job_with_components
+ uuid: zzzzz-o0j2j-job4pi4j4jobrdr
+ owner_uuid: zzzzz-tpzed-000000000000000
+ created_at: 2014-06-13 20:42:26 -0800
+ modified_by_client_uuid: zzzzz-tpzed-000000000000000
+ modified_by_user_uuid: zzzzz-tpzed-000000000000000
+ modified_at: 2014-06-13 20:42:26 -0800
+ updated_at: 2014-06-13 20:42:26 -0800
+ link_class: permission
+ name: can_read
+ tail_uuid: zzzzz-tpzed-readjobwithcomp
+ head_uuid: zzzzz-8i9sb-cjs4pklxxjykqqq
+
baz_collection_name_in_asubproject:
uuid: zzzzz-o0j2j-bazprojectname2
owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
state: Complete
uuid: zzzzz-d1hrv-i3e77t9z5y8j9cc
owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+ started_at: <%= 10.minute.ago.to_s(:db) %>
+ finished_at: <%= 9.minute.ago.to_s(:db) %>
components:
foo:
script: foo
role: Computational biologist
getting_started_shown: 2015-03-26 12:34:56.789000000 Z
+job_reader2:
+ owner_uuid: zzzzz-tpzed-000000000000000
+ uuid: zzzzz-tpzed-readjobwithcomp
+ email: job_reader2@arvados.local
+ first_name: Job
+ last_name: Reader2
+ identity_url: https://job_reader2.openid.local
+ is_active: true
+ is_admin: false
+ username: jobreader2
+ prefs:
+ profile:
+ organization: example.com
+ role: Computational biologist
+ getting_started_shown: 2015-03-26 12:34:56.789000000 Z
+
active_no_prefs:
owner_uuid: zzzzz-tpzed-000000000000000
uuid: zzzzz-tpzed-a46c42d1td4aoj4
assert_response 403
end
- def assert_found_tokens(auth, search_params, *expected_tokens)
+ def assert_found_tokens(auth, search_params, expected)
authorize_with auth
- expected_tokens.map! { |name| api_client_authorizations(name).api_token }
+ expected_tokens = expected.map do |name|
+ api_client_authorizations(name).api_token
+ end
get :index, search_params
assert_response :success
got_tokens = JSON.parse(@response.body)['items']
# Three-tuples with auth to use, scopes to find, and expected tokens.
# Make two tests for each tuple, one searching with where and the other
# with filter.
- [[:admin_trustedclient, [], :admin_noscope],
- [:active_trustedclient, ["GET /arvados/v1/users"], :active_userlist],
+ [[:admin_trustedclient, [], [:admin_noscope]],
+ [:active_trustedclient, ["GET /arvados/v1/users"], [:active_userlist]],
[:active_trustedclient,
["POST /arvados/v1/api_client_authorizations",
"GET /arvados/v1/api_client_authorizations"],
- :active_apitokens],
- ].each do |auth, scopes, *expected|
+ [:active_apitokens]],
+ ].each do |auth, scopes, expected|
test "#{auth.to_s} can find auths where scopes=#{scopes.inspect}" do
- assert_found_tokens(auth, {where: {scopes: scopes}}, *expected)
+ assert_found_tokens(auth, {where: {scopes: scopes}}, expected)
end
test "#{auth.to_s} can find auths filtered with scopes=#{scopes.inspect}" do
- assert_found_tokens(auth, {filters: [['scopes', '=', scopes]]}, *expected)
+ assert_found_tokens(auth, {filters: [['scopes', '=', scopes]]}, expected)
+ end
+
+ test "#{auth.to_s} offset works with filter scopes=#{scopes.inspect}" do
+ assert_found_tokens(auth, {
+ offset: expected.length,
+ filters: [['scopes', '=', scopes]]
+ }, [])
end
end
assert_response expect_list_response
if expect_list_items
assert_equal assigns(:objects).length, expect_list_items
+ assert_equal json_response['items_available'], expect_list_items
+ end
+ end
+
+ if expect_list_items
+ test "using '#{user}', list '#{token}' by uuid with offset" do
+ authorize_with user
+ get :index, {
+ filters: [['uuid','=',api_client_authorizations(token).uuid]],
+ offset: expect_list_items,
+ }
+ assert_response expect_list_response
+ assert_equal json_response['items_available'], expect_list_items
+ assert_equal json_response['items'].length, 0
end
end
assert_response expect_list_response
if expect_list_items
assert_equal assigns(:objects).length, expect_list_items
+ assert_equal json_response['items_available'], expect_list_items
end
end
end
}
assert_response 403
end
+
+ test "get current token" do
+ authorize_with :active
+ get :current
+ assert_response :success
+ assert_equal(json_response['api_token'],
+ api_client_authorizations(:active).api_token)
+ end
+
+ test "get current token, no auth" do
+ get :current
+ assert_response 401
+ end
end
--- /dev/null
+require 'test_helper'
+
+class Arvados::V1::ContainersControllerTest < ActionController::TestCase
+ test 'create' do
+ authorize_with :system_user
+ post :create, {
+ container: {
+ command: ['echo', 'hello'],
+ container_image: 'test',
+ output_path: 'test',
+ },
+ }
+ assert_response :success
+ end
+
+ [[Container::Queued, :queued],
+ [Container::Complete, :completed]].each do |state, fixture|
+ test "cannot get auth in #{state} state" do
+ authorize_with :dispatch1
+ get :auth, id: containers(fixture).uuid
+ assert_response 403
+ end
+ end
+
+ test 'cannot get auth with wrong token' do
+ authorize_with :dispatch1
+ c = containers(:queued)
+ assert c.update_attributes(state: Container::Locked), show_errors(c)
+
+ authorize_with :system_user
+ get :auth, id: c.uuid
+ assert_response 403
+ end
+
+ test 'get auth' do
+ authorize_with :dispatch1
+ c = containers(:queued)
+ assert c.update_attributes(state: Container::Locked), show_errors(c)
+ get :auth, id: c.uuid
+ assert_response :success
+ assert_operator 32, :<, json_response['api_token'].length
+ assert_equal 'arvados#apiClientAuthorization', json_response['kind']
+ end
+
+ test 'no auth in container response' do
+ authorize_with :dispatch1
+ c = containers(:queued)
+ assert c.update_attributes(state: Container::Locked), show_errors(c)
+ get :show, id: c.uuid
+ assert_response :success
+ assert_nil json_response['auth']
+ end
+end
assert_response :success
# verify that the user can no longer see the project
- @counter = 0 # Reset executed action counter
+ @test_counter = 0 # Reset executed action counter
@controller = Arvados::V1::GroupsController.new
authorize_with :project_viewer
get :index, filters: [['group_class', '=', 'project']], format: :json
assert_equal false, found_projects.include?(groups(:starred_and_shared_active_user_project).uuid)
# share the project
- @counter = 0
+ @test_counter = 0
@controller = Arvados::V1::LinksController.new
authorize_with :system_user
post :create, link: {
}
# verify that project_viewer user can now see shared project again
- @counter = 0
+ @test_counter = 0
@controller = Arvados::V1::GroupsController.new
authorize_with :project_viewer
get :index, filters: [['group_class', '=', 'project']], format: :json
assert_equal('077ba2ad3ea24a929091a9e6ce545c93199b8e57',
internal_tag(json_response['uuid']))
end
+
+ test 'get job with components' do
+ authorize_with :active
+ get :show, {id: jobs(:running_job_with_components).uuid}
+ assert_response :success
+ assert_not_nil json_response["components"]
+ assert_equal ["component1", "component2"], json_response["components"].keys
+ end
+
+ [
+ [:active, :success],
+ [:system_user, :success],
+ [:admin, 403],
+ ].each do |user, expected|
+ test "add components to job locked by active user as #{user} user and expect #{expected}" do
+ authorize_with user
+ put :update, {
+ id: jobs(:running).uuid,
+ job: {
+ components: {"component1" => "value1", "component2" => "value2"}
+ }
+ }
+ assert_response expected
+ if expected == :success
+ assert_not_nil json_response["components"]
+ keys = json_response["components"].keys
+ assert_equal ["component1", "component2"], keys
+ assert_equal "value1", json_response["components"][keys[0]]
+ end
+ end
+ end
+
+ test 'get_delete components_get again for job with components' do
+ authorize_with :active
+ get :show, {id: jobs(:running_job_with_components).uuid}
+ assert_response :success
+ assert_not_nil json_response["components"]
+ assert_equal ["component1", "component2"], json_response["components"].keys
+
+ # delete second component
+ @test_counter = 0 # Reset executed action counter
+ @controller = Arvados::V1::JobsController.new
+ put :update, {
+ id: jobs(:running_job_with_components).uuid,
+ job: {
+ components: {"component1" => "zzzzz-8i9sb-jobuuid00000001"}
+ }
+ }
+ assert_response :success
+
+ @test_counter = 0 # Reset executed action counter
+ @controller = Arvados::V1::JobsController.new
+ get :show, {id: jobs(:running_job_with_components).uuid}
+ assert_response :success
+ assert_not_nil json_response["components"]
+ assert_equal ["component1"], json_response["components"].keys
+
+ # delete all components
+ @test_counter = 0 # Reset executed action counter
+ @controller = Arvados::V1::JobsController.new
+ put :update, {
+ id: jobs(:running_job_with_components).uuid,
+ job: {
+ components: {}
+ }
+ }
+ assert_response :success
+
+ @test_counter = 0 # Reset executed action counter
+ @controller = Arvados::V1::JobsController.new
+ get :show, {id: jobs(:running_job_with_components).uuid}
+ assert_response :success
+ assert_not_nil json_response["components"]
+ assert_equal [], json_response["components"].keys
+ end
end
$stderr.puts "#{t1 - t0}s #{label}"
end
end
+
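+  # Print the VmHWM (peak resident set size) line from /proc/self/status
+  # before and after yielding, so memory-usage tests can compare the two.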
+ def vmpeak c
+ open("/proc/self/status").each_line do |line|
+ print "Begin #{c} #{line}" if (line =~ /^VmHWM:/)
+ end
+ n = yield
+ open("/proc/self/status").each_line do |line|
+ print "End #{c} #{line}" if (line =~ /^VmHWM:/)
+ end
+ n
+ end
+
end
include ManifestExamples
test "crud cycle for a collection with a big manifest" do
+ slow_test
bigmanifest = time_block 'make example' do
make_manifest(streams: 100,
files_per_stream: 100,
api_token: api_token(:active))
end
json = time_block "JSON encode #{bigmanifest.length>>20}MiB manifest" do
- Oj.dump({manifest_text: bigmanifest})
+ Oj.dump({"manifest_text" => bigmanifest})
end
time_block 'create' do
post '/arvados/v1/collections', {collection: json}, auth(:active)
delete '/arvados/v1/collections/' + uuid, {}, auth(:active)
end
end
+
+ test "memory usage" do
+ slow_test
+ hugemanifest = make_manifest(streams: 1,
+ files_per_stream: 2000,
+ blocks_per_file: 200,
+ bytes_per_block: 2**26,
+ api_token: api_token(:active))
+ json = time_block "JSON encode #{hugemanifest.length>>20}MiB manifest" do
+ Oj.dump({manifest_text: hugemanifest})
+ end
+ vmpeak "post" do
+ post '/arvados/v1/collections', {collection: json}, auth(:active)
+ end
+ end
end
self.use_transactional_fixtures = false
test "reset fails when Rails.env != 'test'" do
+ slow_test
rails_env_was = Rails.env
begin
Rails.env = 'production'
end
test "database reset doesn't break basic CRUD operations" do
+ slow_test
active_auth = auth(:active)
admin_auth = auth(:admin)
end
test "roll back database change" do
+ slow_test
active_auth = auth(:active)
admin_auth = auth(:admin)
ws_helper do |ws|
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
status = d["status"]
ws.close
end
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
status = d["status"]
ws.close
end
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
end
test "connect, subscribe, get event, unsubscribe" do
+ slow_test
state = 1
spec = nil
spec_ev_uuid = nil
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
end
test "connect, subscribe, get event, unsubscribe with filter" do
+ slow_test
state = 1
spec = nil
spec_ev_uuid = nil
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
test "connect, subscribe, get event, try to unsubscribe with bogus filter" do
+ slow_test
state = 1
spec = nil
spec_ev_uuid = nil
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
test "connected, not subscribed, no event" do
+ slow_test
authorize_with :admin
ws_helper :admin, false do |ws|
end
test "connected, not authorized to see event" do
+ slow_test
state = 1
authorize_with :admin
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
status = d["status"]
ws.close
end
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
status = d["status"]
ws.close
end
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
status = d["status"]
ws.close
end
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when (1..EventBus::MAX_FILTERS)
assert_equal 200, d["status"]
end
test "connect, subscribe, lots of events" do
+ slow_test
state = 1
event_count = 0
log_start = Log.order(:id).last.id
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
end
ws.on :message do |event|
- d = Oj.load event.data
+ d = Oj.strict_load event.data
case state
when 1
assert_equal 200, d["status"]
module ArvadosTestSupport
def json_response
- Oj.load response.body
+ Oj.strict_load response.body
end
def api_token(api_client_auth_name)
def auth(api_client_auth_name)
{'HTTP_AUTHORIZATION' => "OAuth2 #{api_token(api_client_auth_name)}"}
end
+
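+  # Return a lambda rather than a string so the error report is only built
+  # if the enclosing assertion actually fails.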
+ def show_errors model
+ return lambda { model.errors.full_messages.inspect }
+ end
end
class ActiveSupport::TestCase
include ArvadosTestSupport
+ setup do
+ Rails.logger.warn "\n\n#{'=' * 70}\n#{self.class}\##{method_name}\n#{'-' * 70}\n\n"
+ end
+
teardown do
Thread.current[:api_client_ip_address] = nil
Thread.current[:api_client_authorization] = nil
ArvadosApiToken.new.call("rack.input" => "",
"HTTP_AUTHORIZATION" => "OAuth2 #{t}")
end
+
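+  # Call at the top of a test to skip it when RAILS_TEST_SHORT is set to a
+  # non-empty value.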
+ def slow_test
+ skip "RAILS_TEST_SHORT is set" unless (ENV['RAILS_TEST_SHORT'] || '').empty?
+ end
end
class ActionController::TestCase
setup do
- @counter = 0
+ @test_counter = 0
end
def check_counter action
- @counter += 1
- if @counter == 2
+ @test_counter += 1
+ if @test_counter == 2
assert_equal 1, 2, "Multiple actions in functional test"
end
end
'vu5wm9fpnwjyxfldw3vbo01mgjs75rgo7qioh8z8ij7jpyp8508okhgbbex3ceei' +
'786u5rw2a9gx743dj3fgq2irk'
@@known_signed_locator = 'acbd18db4cc2f85cedef654fccc4a4d8+3' +
- '+A257f3f5f5f0a4e4626a18fc74bd42ec34dcb228a@7fffffff'
+ '+A89118b78732c33104a4d6231e8b5a5fa1e4301e3@7fffffff'
test 'generate predictable invincible signature' do
signed = Blob.sign_locator @@known_locator, {
Blob.verify_signature!(@@blob_locator, api_token: @@api_token, key: @@key)
end
end
+
+ test 'signature changes when ttl changes' do
+ signed = Blob.sign_locator @@known_locator, {
+ api_token: @@known_token,
+ key: @@known_key,
+ expire: 0x7fffffff,
+ }
+
+ original_ttl = Rails.configuration.blob_signature_ttl
+ Rails.configuration.blob_signature_ttl = original_ttl*2
+ signed2 = Blob.sign_locator @@known_locator, {
+ api_token: @@known_token,
+ key: @@known_key,
+ expire: 0x7fffffff,
+ }
+ Rails.configuration.blob_signature_ttl = original_ttl
+
+ assert_not_equal signed, signed2
+ end
end
# "crrud" == "create read render update delete", not a typo
test "crrud cycle for a collection with a big manifest)" do
+ slow_test
bigmanifest = time_block 'make example' do
make_manifest(streams: 100,
files_per_stream: 100,
assert_equal "Committed", cr.state
c = Container.find_by_uuid cr.container_uuid
- assert_equal "Queued", c.state
+ assert_equal Container::Queued, c.state
act_as_system_user do
- c.state = "Running"
- c.save!
+ c.update_attributes! state: Container::Locked
+ c.update_attributes! state: Container::Running
end
cr.reload
assert_equal "Committed", cr.state
act_as_system_user do
- c.state = "Complete"
+ c.update_attributes! state: Container::Complete
c.save!
end
require 'test_helper'
class ContainerTest < ActiveSupport::TestCase
- def check_illegal_modify c
- c.reload
- c.command = ["echo", "bar"]
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
-
- c.reload
- c.container_image = "img2"
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
+ include DbCurrentTime
- c.reload
- c.cwd = "/tmp2"
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
+ DEFAULT_ATTRS = {
+ command: ['echo', 'foo'],
+ container_image: 'img',
+ output_path: '/tmp',
+ priority: 1,
+ }
- c.reload
- c.environment = {"FOO" => "BAR"}
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
+ def minimal_new attrs={}
+ cr = ContainerRequest.new DEFAULT_ATTRS.merge(attrs)
+ act_as_user users(:active) do
+ cr.save!
end
-
- c.reload
- c.mounts = {"FOO" => "BAR"}
- assert_raises(ActiveRecord::RecordInvalid) do
+ c = Container.new DEFAULT_ATTRS.merge(attrs)
+ act_as_system_user do
c.save!
+ assert cr.update_attributes(container_uuid: c.uuid,
+ state: ContainerRequest::Committed,
+ ), show_errors(cr)
end
+ return c, cr
+ end
- c.reload
- c.output_path = "/tmp3"
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
+ def check_illegal_updates c, bad_updates
+ bad_updates.each do |u|
+ refute c.update_attributes(u), u.inspect
+ refute c.valid?, u.inspect
+ c.reload
end
+ end
- c.reload
- c.runtime_constraints = {"FOO" => "BAR"}
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
+ def check_illegal_modify c
+ check_illegal_updates c, [{command: ["echo", "bar"]},
+ {container_image: "img2"},
+ {cwd: "/tmp2"},
+ {environment: {"FOO" => "BAR"}},
+ {mounts: {"FOO" => "BAR"}},
+ {output_path: "/tmp3"},
+ {locked_by_uuid: "zzzzz-gj3su-027z32aux8dg2s1"},
+ {auth_uuid: "zzzzz-gj3su-017z32aux8dg2s1"},
+ {runtime_constraints: {"FOO" => "BAR"}}]
end
def check_bogus_states c
- c.reload
- c.state = nil
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
-
- c.reload
- c.state = "Flubber"
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
+ check_illegal_updates c, [{state: nil},
+ {state: "Flubber"}]
end
- def check_no_change_from_complete c
+ def check_no_change_from_cancelled c
check_illegal_modify c
check_bogus_states c
-
- c.reload
- c.priority = 3
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
-
- c.reload
- c.state = "Queued"
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
-
- c.reload
- c.state = "Running"
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
-
- c.reload
- c.state = "Complete"
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
+ check_illegal_updates c, [{ priority: 3 },
+ { state: Container::Queued },
+ { state: Container::Locked },
+ { state: Container::Running },
+ { state: Container::Complete }]
end
test "Container create" do
act_as_system_user do
- c = Container.new
- c.command = ["echo", "foo"]
- c.container_image = "img"
- c.cwd = "/tmp"
- c.environment = {}
- c.mounts = {"BAR" => "FOO"}
- c.output_path = "/tmp"
- c.priority = 1
- c.runtime_constraints = {}
- c.save!
+ c, _ = minimal_new(environment: {},
+ mounts: {"BAR" => "FOO"},
+ output_path: "/tmp",
+ priority: 1,
+ runtime_constraints: {})
check_illegal_modify c
check_bogus_states c
end
test "Container running" do
- act_as_system_user do
- c = Container.new
- c.command = ["echo", "foo"]
- c.container_image = "img"
- c.output_path = "/tmp"
- c.save!
+ c, _ = minimal_new priority: 1
- c.reload
- c.state = "Complete"
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
+ set_user_from_auth :dispatch1
+ check_illegal_updates c, [{state: Container::Running},
+ {state: Container::Complete}]
- c.reload
- c.state = "Running"
- c.save!
+ c.update_attributes! state: Container::Locked
+ c.update_attributes! state: Container::Running
- check_illegal_modify c
- check_bogus_states c
+ check_illegal_modify c
+ check_bogus_states c
- c.reload
- c.state = "Queued"
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
+ check_illegal_updates c, [{state: Container::Queued}]
+ c.reload
- c.reload
- c.priority = 3
- c.save!
- end
+ c.update_attributes! priority: 3
end
- test "Container queued cancel" do
- act_as_system_user do
- c = Container.new
- c.command = ["echo", "foo"]
- c.container_image = "img"
- c.output_path = "/tmp"
- c.save!
+ test "Lock and unlock" do
+ c, cr = minimal_new priority: 0
- c.reload
- c.state = "Cancelled"
- c.save!
+ set_user_from_auth :dispatch1
+ assert_equal Container::Queued, c.state
- check_no_change_from_complete c
- end
- end
+ refute c.update_attributes(state: Container::Locked), "no priority"
+ c.reload
+ assert cr.update_attributes priority: 1
- test "Container running cancel" do
- act_as_system_user do
- c = Container.new
- c.command = ["echo", "foo"]
- c.container_image = "img"
- c.output_path = "/tmp"
- c.save!
+ refute c.update_attributes(state: Container::Running), "not locked"
+ c.reload
+ refute c.update_attributes(state: Container::Complete), "not locked"
+ c.reload
- c.reload
- c.state = "Running"
- c.save!
+ assert c.update_attributes(state: Container::Locked), show_errors(c)
+ assert c.locked_by_uuid
+ assert c.auth_uuid
- c.reload
- c.state = "Cancelled"
- c.save!
+ assert c.update_attributes(state: Container::Queued), show_errors(c)
+ refute c.locked_by_uuid
+ refute c.auth_uuid
- check_no_change_from_complete c
- end
+ refute c.update_attributes(state: Container::Running), "not locked"
+ c.reload
+ refute c.locked_by_uuid
+ refute c.auth_uuid
+
+ assert c.update_attributes(state: Container::Locked), show_errors(c)
+ assert c.update_attributes(state: Container::Running), show_errors(c)
+ assert c.locked_by_uuid
+ assert c.auth_uuid
+
+ auth_uuid_was = c.auth_uuid
+
+ refute c.update_attributes(state: Container::Locked), "already running"
+ c.reload
+ refute c.update_attributes(state: Container::Queued), "already running"
+ c.reload
+
+ assert c.update_attributes(state: Container::Complete), show_errors(c)
+ refute c.locked_by_uuid
+ refute c.auth_uuid
+
+ auth_exp = ApiClientAuthorization.find_by_uuid(auth_uuid_was).expires_at
+ assert_operator auth_exp, :<, db_current_time
+ end
+
+ test "Container queued cancel" do
+ c, _ = minimal_new
+ set_user_from_auth :dispatch1
+ assert c.update_attributes(state: Container::Cancelled), show_errors(c)
+ check_no_change_from_cancelled c
+ end
+
+ test "Container locked cancel" do
+ c, _ = minimal_new
+ set_user_from_auth :dispatch1
+ assert c.update_attributes(state: Container::Locked), show_errors(c)
+ assert c.update_attributes(state: Container::Cancelled), show_errors(c)
+ check_no_change_from_cancelled c
+ end
+
+ test "Container running cancel" do
+ c, _ = minimal_new
+ set_user_from_auth :dispatch1
+ c.update_attributes! state: Container::Queued
+ c.update_attributes! state: Container::Locked
+ c.update_attributes! state: Container::Running
+ c.update_attributes! state: Container::Cancelled
+ check_no_change_from_cancelled c
end
test "Container create forbidden for non-admin" do
set_user_from_auth :active_trustedclient
- c = Container.new
- c.command = ["echo", "foo"]
- c.container_image = "img"
- c.cwd = "/tmp"
+ c = Container.new DEFAULT_ATTRS
c.environment = {}
c.mounts = {"BAR" => "FOO"}
c.output_path = "/tmp"
end
test "Container only set exit code on complete" do
- act_as_system_user do
- c = Container.new
- c.command = ["echo", "foo"]
- c.container_image = "img"
- c.output_path = "/tmp"
- c.save!
+ c, _ = minimal_new
+ set_user_from_auth :dispatch1
+ c.update_attributes! state: Container::Locked
+ c.update_attributes! state: Container::Running
- c.reload
- c.state = "Running"
- c.save!
+ check_illegal_updates c, [{exit_code: 1},
+ {exit_code: 1, state: Container::Cancelled}]
- c.reload
- c.exit_code = 1
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
-
- c.reload
- c.exit_code = 1
- c.state = "Cancelled"
- assert_raises(ActiveRecord::RecordInvalid) do
- c.save!
- end
-
- c.reload
- c.exit_code = 1
- c.state = "Complete"
- c.save!
- end
+ assert c.update_attributes(exit_code: 1, state: Container::Complete)
end
end
assert_not_nil job1.queue_position, "Expected non-nil queue position for job1"
assert_not_nil job2.queue_position, "Expected non-nil queue position for job2"
- assert_not_equal job1.queue_position, job2.queue_position
end
SDK_MASTER = "ca68b24e51992e790f29df5cc4bc54ce1da4a1c2"
package main
+// Dispatcher service for Crunch that runs containers locally.
+
import (
"flag"
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+ "git.curoverse.com/arvados.git/sdk/go/dispatch"
"log"
"os"
"os/exec"
- "os/signal"
"sync"
- "syscall"
"time"
)
}
var (
- arv arvadosclient.ArvadosClient
runningCmds map[string]*exec.Cmd
runningCmdsMutex sync.Mutex
waitGroup sync.WaitGroup
- doneProcessing chan bool
- sigChan chan os.Signal
+ crunchRunCommand *string
)
func doMain() error {
10,
"Interval in seconds to poll for queued containers")
- priorityPollInterval := flags.Int(
- "container-priority-poll-interval",
- 60,
- "Interval in seconds to check priority of a dispatched container")
-
- crunchRunCommand := flags.String(
+ crunchRunCommand = flags.String(
"crunch-run-command",
"/usr/bin/crunch-run",
"Crunch command to run container")
// Parse args; omit the first arg which is the command name
flags.Parse(os.Args[1:])
- var err error
- arv, err = arvadosclient.MakeArvadosClient()
+ runningCmds = make(map[string]*exec.Cmd)
+
+ arv, err := arvadosclient.MakeArvadosClient()
if err != nil {
+ log.Printf("Error making Arvados client: %v", err)
return err
}
+ arv.Retries = 25
- // Channel to terminate
- doneProcessing = make(chan bool)
-
- // Map of running crunch jobs
- runningCmds = make(map[string]*exec.Cmd)
-
- // Graceful shutdown
- sigChan = make(chan os.Signal, 1)
- signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
- go func(sig <-chan os.Signal) {
- for sig := range sig {
- log.Printf("Caught signal: %v", sig)
- doneProcessing <- true
- }
- }(sigChan)
+ dispatcher := dispatch.Dispatcher{
+ Arv: arv,
+ RunContainer: run,
+ PollInterval: time.Duration(*pollInterval) * time.Second,
+ DoneProcessing: make(chan struct{})}
- // Run all queued containers
- runQueuedContainers(*pollInterval, *priorityPollInterval, *crunchRunCommand)
+ err = dispatcher.RunDispatcher()
+ if err != nil {
+ return err
+ }
+ runningCmdsMutex.Lock()
// Finished dispatching; interrupt any crunch jobs that are still running
for _, cmd := range runningCmds {
cmd.Process.Signal(os.Interrupt)
}
+ runningCmdsMutex.Unlock()
// Wait for all running crunch jobs to complete / terminate
waitGroup.Wait()
return nil
}
-// Poll for queued containers using pollInterval.
-// Invoke dispatchLocal for each ticker cycle, which will run all the queued containers.
-//
-// Any errors encountered are logged but the program would continue to run (not exit).
-// This is because, once one or more crunch jobs are running,
-// we would need to wait for them complete.
-func runQueuedContainers(pollInterval, priorityPollInterval int, crunchRunCommand string) {
- ticker := time.NewTicker(time.Duration(pollInterval) * time.Second)
-
- for {
- select {
- case <-ticker.C:
- dispatchLocal(priorityPollInterval, crunchRunCommand)
- case <-doneProcessing:
- ticker.Stop()
- return
- }
- }
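+// startFunc is the default implementation behind startCmd; tests override
+// startCmd (below) to hook container state updates around starting crunch-run.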
+func startFunc(container dispatch.Container, cmd *exec.Cmd) error {
+ return cmd.Start()
}
-// Container data
-type Container struct {
- UUID string `json:"uuid"`
- State string `json:"state"`
- Priority int `json:"priority"`
-}
+var startCmd = startFunc
-// ContainerList is a list of the containers from api
-type ContainerList struct {
- Items []Container `json:"items"`
-}
-
-// Get the list of queued containers from API server and invoke run for each container.
-func dispatchLocal(priorityPollInterval int, crunchRunCommand string) {
- params := arvadosclient.Dict{
- "filters": [][]string{[]string{"state", "=", "Queued"}},
- }
-
- var containers ContainerList
- err := arv.List("containers", params, &containers)
- if err != nil {
- log.Printf("Error getting list of queued containers: %q", err)
- return
- }
+// Run a container.
+//
+// If the container is Locked, start a new crunch-run process and wait until
+// crunch-run completes. If the priority changes to zero, send an interrupt
+// signal to the crunch-run process.
+//
+// If the container is in any other state, or is not Complete/Cancelled after
+// crunch-run terminates, mark the container as Cancelled.
+func run(dispatcher *dispatch.Dispatcher,
+ container dispatch.Container,
+ status chan dispatch.Container) {
+
+ uuid := container.UUID
+
+ if container.State == dispatch.Locked {
+ waitGroup.Add(1)
+
+ cmd := exec.Command(*crunchRunCommand, uuid)
+ cmd.Stdin = nil
+ cmd.Stderr = os.Stderr
+ cmd.Stdout = os.Stderr
+
+ log.Printf("Starting container %v", uuid)
+
+ // Add this crunch job to the list of runningCmds only if we
+ // succeed in starting crunch-run.
+
+ runningCmdsMutex.Lock()
+ if err := startCmd(container, cmd); err != nil {
+ runningCmdsMutex.Unlock()
+ log.Printf("Error starting %v for %v: %q", *crunchRunCommand, uuid, err)
+ dispatcher.UpdateState(uuid, dispatch.Cancelled)
+ } else {
+ runningCmds[uuid] = cmd
+ runningCmdsMutex.Unlock()
+
+ // Need to wait for crunch-run to exit
+ done := make(chan struct{})
+
+ go func() {
+ if _, err := cmd.Process.Wait(); err != nil {
+ log.Printf("Error while waiting for crunch job to finish for %v: %q", uuid, err)
+ }
+				log.Printf("Process exited for container %v", uuid)
+ done <- struct{}{}
+ }()
+
+ Loop:
+ for {
+ select {
+ case <-done:
+ break Loop
+ case c := <-status:
+ // Interrupt the child process if priority changes to 0
+ if (c.State == dispatch.Locked || c.State == dispatch.Running) && c.Priority == 0 {
+ log.Printf("Sending SIGINT to pid %d to cancel container %v", cmd.Process.Pid, uuid)
+ cmd.Process.Signal(os.Interrupt)
+ }
+ }
+ }
+ close(done)
- for i := 0; i < len(containers.Items); i++ {
- log.Printf("About to run queued container %v", containers.Items[i].UUID)
- // Run the container
- go run(containers.Items[i].UUID, crunchRunCommand, priorityPollInterval)
- }
-}
+ log.Printf("Finished container run for %v", uuid)
-// Run queued container:
-// Set container state to locked (TBD)
-// Run container using the given crunch-run command
-// Set the container state to Running
-// If the container priority becomes zero while crunch job is still running, terminate it.
-func run(uuid string, crunchRunCommand string, priorityPollInterval int) {
- cmd := exec.Command(crunchRunCommand, uuid)
-
- cmd.Stdin = nil
- cmd.Stderr = os.Stderr
- cmd.Stdout = os.Stderr
- if err := cmd.Start(); err != nil {
- log.Printf("Error running container for %v: %q", uuid, err)
- return
+ // Remove the crunch job from runningCmds
+ runningCmdsMutex.Lock()
+ delete(runningCmds, uuid)
+ runningCmdsMutex.Unlock()
+ }
+ waitGroup.Done()
}
- // Add this crunch job to the list of runningCmds
- runningCmdsMutex.Lock()
- runningCmds[uuid] = cmd
- runningCmdsMutex.Unlock()
-
- log.Printf("Started container run for %v", uuid)
-
- // Add this crunch job to waitGroup
- waitGroup.Add(1)
- defer waitGroup.Done()
-
- // Update container status to Running
- err := arv.Update("containers", uuid,
- arvadosclient.Dict{
- "container": arvadosclient.Dict{"state": "Running"}},
- nil)
+ // If the container is not finalized, then change it to "Cancelled".
+ err := dispatcher.Arv.Get("containers", uuid, nil, &container)
if err != nil {
- log.Printf("Error updating container state to 'Running' for %v: %q", uuid, err)
+ log.Printf("Error getting final container state: %v", err)
}
-
- // A goroutine to terminate the runner if container priority becomes zero
- priorityTicker := time.NewTicker(time.Duration(priorityPollInterval) * time.Second)
- go func() {
- for _ = range priorityTicker.C {
- var container Container
- err := arv.Get("containers", uuid, nil, &container)
- if err != nil {
- log.Printf("Error getting container info for %v: %q", uuid, err)
- } else {
- if container.Priority == 0 {
- priorityTicker.Stop()
- cmd.Process.Signal(os.Interrupt)
- }
- }
- }
- }()
-
- // Wait for the crunch job to exit
- if _, err := cmd.Process.Wait(); err != nil {
- log.Printf("Error while waiting for crunch job to finish for %v: %q", uuid, err)
+ if container.LockedByUUID == dispatcher.Auth.UUID &&
+ (container.State == dispatch.Locked || container.State == dispatch.Running) {
+ log.Printf("After %s process termination, container state for %v is %q. Updating it to %q",
+			*crunchRunCommand, uuid, container.State, dispatch.Cancelled)
+ dispatcher.UpdateState(uuid, dispatch.Cancelled)
}
- // Remove the crunch job to runningCmds
- runningCmdsMutex.Lock()
- delete(runningCmds, uuid)
- runningCmdsMutex.Unlock()
-
- priorityTicker.Stop()
-
- // The container state should be 'Complete'
- var container Container
- err = arv.Get("containers", uuid, nil, &container)
- if container.State == "Running" {
- log.Printf("After crunch-run process termination, the state is still 'Running' for %v. Updating it to 'Complete'", uuid)
- err = arv.Update("containers", uuid,
- arvadosclient.Dict{
- "container": arvadosclient.Dict{"state": "Complete"}},
- nil)
- if err != nil {
- log.Printf("Error updating container state to Complete for %v: %q", uuid, err)
- }
+ // drain any subsequent status changes
+	for range status {
}
- log.Printf("Finished container run for %v", uuid)
+ log.Printf("Finalized container %v", uuid)
}
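For reference, the RunContainer callback contract that doMain relies on above can be summarized with a minimal sketch (illustrative only, not part of this patch; exampleRunContainer is a made-up name, and the dispatch.Dispatcher, dispatch.Container, and UpdateState API it assumes is the one exercised elsewhere in this diff):

func exampleRunContainer(dispatcher *dispatch.Dispatcher,
	container dispatch.Container,
	status chan dispatch.Container) {
	if container.State == dispatch.Locked {
		// A real dispatcher does its work here (e.g. starting crunch-run),
		// then records the outcome on the container record.
		dispatcher.UpdateState(container.UUID, dispatch.Running)
		dispatcher.UpdateState(container.UUID, dispatch.Complete)
	}
	// Drain any remaining status updates, as the dispatchers in this patch do,
	// so the dispatch machinery can shut down cleanly.
	for range status {
	}
}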
package main
import (
+ "bytes"
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
"git.curoverse.com/arvados.git/sdk/go/arvadostest"
-
- "io/ioutil"
+ "git.curoverse.com/arvados.git/sdk/go/dispatch"
+ . "gopkg.in/check.v1"
+ "io"
"log"
"net/http"
"net/http/httptest"
"os"
+ "os/exec"
"strings"
- "syscall"
"testing"
"time"
-
- . "gopkg.in/check.v1"
)
// Gocheck boilerplate
func (s *TestSuite) SetUpSuite(c *C) {
initialArgs = os.Args
arvadostest.StartAPI()
+ runningCmds = make(map[string]*exec.Cmd)
}
func (s *TestSuite) TearDownSuite(c *C) {
func (s *TestSuite) SetUpTest(c *C) {
args := []string{"crunch-dispatch-local"}
os.Args = args
-
- var err error
- arv, err = arvadosclient.MakeArvadosClient()
- if err != nil {
- c.Fatalf("Error making arvados client: %s", err)
- }
}
func (s *TestSuite) TearDownTest(c *C) {
arvadostest.ResetEnv()
}
-func (s *TestSuite) Test_doMain(c *C) {
- args := []string{"-poll-interval", "2", "-container-priority-poll-interval", "1", "-crunch-run-command", "echo"}
- os.Args = append(os.Args, args...)
+func (s *TestSuite) TestIntegration(c *C) {
+ arv, err := arvadosclient.MakeArvadosClient()
+ c.Assert(err, IsNil)
+
+ echo := "echo"
+ crunchRunCommand = &echo
+
+ doneProcessing := make(chan struct{})
+ dispatcher := dispatch.Dispatcher{
+ Arv: arv,
+ PollInterval: time.Duration(1) * time.Second,
+ RunContainer: func(dispatcher *dispatch.Dispatcher,
+ container dispatch.Container,
+ status chan dispatch.Container) {
+ run(dispatcher, container, status)
+ doneProcessing <- struct{}{}
+ },
+ DoneProcessing: doneProcessing}
+
+ startCmd = func(container dispatch.Container, cmd *exec.Cmd) error {
+ dispatcher.UpdateState(container.UUID, "Running")
+ dispatcher.UpdateState(container.UUID, "Complete")
+ return cmd.Start()
+ }
- go func() {
- time.Sleep(5 * time.Second)
- sigChan <- syscall.SIGINT
- }()
+ err = dispatcher.RunDispatcher()
+ c.Assert(err, IsNil)
- err := doMain()
- c.Check(err, IsNil)
+ // Wait for all running crunch jobs to complete / terminate
+ waitGroup.Wait()
// There should be no queued containers now
params := arvadosclient.Dict{
"filters": [][]string{[]string{"state", "=", "Queued"}},
}
- var containers ContainerList
+ var containers dispatch.ContainerList
err = arv.List("containers", params, &containers)
c.Check(err, IsNil)
c.Assert(len(containers.Items), Equals, 0)
// Previously "Queued" container should now be in "Complete" state
- var container Container
+ var container dispatch.Container
err = arv.Get("containers", "zzzzz-dz642-queuedcontainer", nil, &container)
c.Check(err, IsNil)
c.Check(container.State, Equals, "Complete")
apiStubResponses := make(map[string]arvadostest.StubResponse)
apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
- testWithServerStub(c, apiStubResponses, "echo", "Error getting list of queued containers")
+ testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
}
func (s *MockArvadosServerSuite) Test_APIErrorUpdatingContainerState(c *C) {
apiStubResponses := make(map[string]arvadostest.StubResponse)
apiStubResponses["/arvados/v1/containers"] =
- arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx1"}]}`)}
+ arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx1","State":"Queued"}]}`)}
apiStubResponses["/arvados/v1/containers/zzzzz-dz642-xxxxxxxxxxxxxx1"] =
arvadostest.StubResponse{500, string(`{}`)}
- testWithServerStub(c, apiStubResponses, "echo", "Error updating container state")
+ testWithServerStub(c, apiStubResponses, "echo", "Error updating container zzzzz-dz642-xxxxxxxxxxxxxx1 to state \"Locked\"")
}
func (s *MockArvadosServerSuite) Test_ContainerStillInRunningAfterRun(c *C) {
apiStubResponses := make(map[string]arvadostest.StubResponse)
apiStubResponses["/arvados/v1/containers"] =
- arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx2"}]}`)}
+ arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx2","State":"Queued"}]}`)}
apiStubResponses["/arvados/v1/containers/zzzzz-dz642-xxxxxxxxxxxxxx2"] =
- arvadostest.StubResponse{200, string(`{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx2", "state":"Running", "priority":1}`)}
+ arvadostest.StubResponse{200, string(`{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx2", "state":"Running", "priority":1, "locked_by_uuid": "` + arvadostest.Dispatch1AuthUUID + `"}`)}
testWithServerStub(c, apiStubResponses, "echo",
- "After crunch-run process termination, the state is still 'Running' for zzzzz-dz642-xxxxxxxxxxxxxx2")
+		`After echo process termination, container state for zzzzz-dz642-xxxxxxxxxxxxxx2 is "Running". Updating it to "Cancelled"`)
}
func (s *MockArvadosServerSuite) Test_ErrorRunningContainer(c *C) {
apiStubResponses := make(map[string]arvadostest.StubResponse)
apiStubResponses["/arvados/v1/containers"] =
- arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx3"}]}`)}
+ arvadostest.StubResponse{200, string(`{"items_available":1, "items":[{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx3","State":"Queued"}]}`)}
+
apiStubResponses["/arvados/v1/containers/zzzzz-dz642-xxxxxxxxxxxxxx3"] =
arvadostest.StubResponse{200, string(`{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx3", "state":"Running", "priority":1}`)}
- testWithServerStub(c, apiStubResponses, "nosuchcommand", "Error running container for zzzzz-dz642-xxxxxxxxxxxxxx3")
+ testWithServerStub(c, apiStubResponses, "nosuchcommand", "Error starting nosuchcommand for zzzzz-dz642-xxxxxxxxxxxxxx3")
}
func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
+ apiStubResponses["/arvados/v1/api_client_authorizations/current"] =
+ arvadostest.StubResponse{200, string(`{"uuid": "` + arvadostest.Dispatch1AuthUUID + `", "api_token": "xyz"}`)}
+
apiStub := arvadostest.ServerStub{apiStubResponses}
api := httptest.NewServer(&apiStub)
defer api.Close()
- arv = arvadosclient.ArvadosClient{
+ arv := arvadosclient.ArvadosClient{
Scheme: "http",
ApiServer: api.URL[7:],
ApiToken: "abc123",
Retries: 0,
}
- tempfile, err := ioutil.TempFile(os.TempDir(), "temp-log-file")
- c.Check(err, IsNil)
- defer os.Remove(tempfile.Name())
- log.SetOutput(tempfile)
+ buf := bytes.NewBuffer(nil)
+ log.SetOutput(io.MultiWriter(buf, os.Stderr))
+ defer log.SetOutput(os.Stderr)
+
+ *crunchRunCommand = crunchCmd
+
+ doneProcessing := make(chan struct{})
+ dispatcher := dispatch.Dispatcher{
+ Arv: arv,
+ PollInterval: time.Duration(1) * time.Second,
+ RunContainer: func(dispatcher *dispatch.Dispatcher,
+ container dispatch.Container,
+ status chan dispatch.Container) {
+ run(dispatcher, container, status)
+ doneProcessing <- struct{}{}
+ },
+ DoneProcessing: doneProcessing}
+
+ startCmd = func(container dispatch.Container, cmd *exec.Cmd) error {
+ dispatcher.UpdateState(container.UUID, "Running")
+ dispatcher.UpdateState(container.UUID, "Complete")
+ return cmd.Start()
+ }
go func() {
- time.Sleep(2 * time.Second)
- sigChan <- syscall.SIGTERM
+ for i := 0; i < 80 && !strings.Contains(buf.String(), expected); i++ {
+ time.Sleep(100 * time.Millisecond)
+ }
+ dispatcher.DoneProcessing <- struct{}{}
}()
- runQueuedContainers(1, 1, crunchCmd)
+ err := dispatcher.RunDispatcher()
+ c.Assert(err, IsNil)
// Wait for all running crunch jobs to complete / terminate
waitGroup.Wait()
- buf, _ := ioutil.ReadFile(tempfile.Name())
- c.Check(strings.Contains(string(buf), expected), Equals, true)
+ c.Check(buf.String(), Matches, `(?ms).*`+expected+`.*`)
}
package main
+// Dispatcher service for Crunch that submits containers to the slurm queue.
+
import (
"flag"
"fmt"
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+ "git.curoverse.com/arvados.git/sdk/go/dispatch"
"io/ioutil"
"log"
+ "math"
"os"
"os/exec"
- "os/signal"
- "sync"
- "syscall"
+ "strings"
"time"
)
}
var (
- arv arvadosclient.ArvadosClient
- runningCmds map[string]*exec.Cmd
- runningCmdsMutex sync.Mutex
- waitGroup sync.WaitGroup
- doneProcessing chan bool
- sigChan chan os.Signal
+ crunchRunCommand *string
+ squeueUpdater Squeue
)
func doMain() error {
10,
"Interval in seconds to poll for queued containers")
- priorityPollInterval := flags.Int(
- "container-priority-poll-interval",
- 60,
- "Interval in seconds to check priority of a dispatched container")
-
- crunchRunCommand := flags.String(
+ crunchRunCommand = flags.String(
"crunch-run-command",
"/usr/bin/crunch-run",
"Crunch command to run container")
- finishCommand := flags.String(
- "finish-command",
- "/usr/bin/crunch-finish-slurm.sh",
- "Command to run from strigger when job is finished")
-
// Parse args; omit the first arg which is the command name
flags.Parse(os.Args[1:])
- var err error
- arv, err = arvadosclient.MakeArvadosClient()
+ arv, err := arvadosclient.MakeArvadosClient()
if err != nil {
+ log.Printf("Error making Arvados client: %v", err)
return err
}
+ arv.Retries = 25
- // Channel to terminate
- doneProcessing = make(chan bool)
-
- // Graceful shutdown
- sigChan = make(chan os.Signal, 1)
- signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
- go func(sig <-chan os.Signal) {
- for sig := range sig {
- log.Printf("Caught signal: %v", sig)
- doneProcessing <- true
- }
- }(sigChan)
-
- // Run all queued containers
- runQueuedContainers(*pollInterval, *priorityPollInterval, *crunchRunCommand, *finishCommand)
-
- // Wait for all running crunch jobs to complete / terminate
- waitGroup.Wait()
-
- return nil
-}
-
-// Poll for queued containers using pollInterval.
-// Invoke dispatchSlurm for each ticker cycle, which will run all the queued containers.
-//
-// Any errors encountered are logged but the program would continue to run (not exit).
-// This is because, once one or more crunch jobs are running,
-// we would need to wait for them complete.
-func runQueuedContainers(pollInterval, priorityPollInterval int, crunchRunCommand, finishCommand string) {
- ticker := time.NewTicker(time.Duration(pollInterval) * time.Second)
-
- for {
- select {
- case <-ticker.C:
- dispatchSlurm(priorityPollInterval, crunchRunCommand, finishCommand)
- case <-doneProcessing:
- ticker.Stop()
- return
- }
- }
-}
-
-// Container data
-type Container struct {
- UUID string `json:"uuid"`
- State string `json:"state"`
- Priority int `json:"priority"`
-}
-
-// ContainerList is a list of the containers from api
-type ContainerList struct {
- Items []Container `json:"items"`
-}
+ squeueUpdater.StartMonitor(time.Duration(*pollInterval) * time.Second)
+ defer squeueUpdater.Done()
-// Get the list of queued containers from API server and invoke run for each container.
-func dispatchSlurm(priorityPollInterval int, crunchRunCommand, finishCommand string) {
- params := arvadosclient.Dict{
- "filters": [][]string{[]string{"state", "=", "Queued"}},
- }
+ dispatcher := dispatch.Dispatcher{
+ Arv: arv,
+ RunContainer: run,
+ PollInterval: time.Duration(*pollInterval) * time.Second,
+ DoneProcessing: make(chan struct{})}
- var containers ContainerList
- err := arv.List("containers", params, &containers)
+ err = dispatcher.RunDispatcher()
if err != nil {
- log.Printf("Error getting list of queued containers: %q", err)
- return
+ return err
}
- for i := 0; i < len(containers.Items); i++ {
- log.Printf("About to submit queued container %v", containers.Items[i].UUID)
- // Run the container
- go run(containers.Items[i], crunchRunCommand, finishCommand, priorityPollInterval)
- }
+ return nil
}
// sbatchCmd
-func sbatchFunc(uuid string) *exec.Cmd {
- return exec.Command("sbatch", "--job-name="+uuid, "--share", "--parsable")
+func sbatchFunc(container dispatch.Container) *exec.Cmd {
+ memPerCPU := math.Ceil((float64(container.RuntimeConstraints["ram"])) / (float64(container.RuntimeConstraints["vcpus"] * 1048576)))
+ return exec.Command("sbatch", "--share", "--parsable",
+ fmt.Sprintf("--job-name=%s", container.UUID),
+ fmt.Sprintf("--mem-per-cpu=%d", int(memPerCPU)),
+ fmt.Sprintf("--cpus-per-task=%d", int(container.RuntimeConstraints["vcpus"])),
+ fmt.Sprintf("--priority=%d", container.Priority))
}
-var sbatchCmd = sbatchFunc
-
-// striggerCmd
-func striggerFunc(jobid, containerUUID, finishCommand, apiHost, apiToken, apiInsecure string) *exec.Cmd {
- return exec.Command("strigger", "--set", "--jobid="+jobid, "--fini",
- fmt.Sprintf("--program=%s %s %s %s %s", finishCommand, apiHost, apiToken, apiInsecure, containerUUID))
+// scancelCmd
+func scancelFunc(container dispatch.Container) *exec.Cmd {
+ return exec.Command("scancel", "--name="+container.UUID)
}
-var striggerCmd = striggerFunc
+// Wrap these so that they can be overridden by tests
+var sbatchCmd = sbatchFunc
+var scancelCmd = scancelFunc
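As a worked example of the --mem-per-cpu arithmetic in sbatchFunc above (the numbers are chosen to match the sbatch command line the slurm dispatcher tests below expect, not quoted from any particular fixture):

// ram = 12000000000 bytes, vcpus = 4
// memPerCPU = ceil(12000000000 / (4 * 1048576)) = ceil(2861.02...) = 2862 MiB per CPU
// resulting command:
//   sbatch --share --parsable --job-name=<container uuid> --mem-per-cpu=2862 --cpus-per-task=4 --priority=1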
// Submit job to slurm using sbatch.
-func submit(container Container, crunchRunCommand string) (jobid string, submitErr error) {
+func submit(dispatcher *dispatch.Dispatcher,
+ container dispatch.Container, crunchRunCommand string) (jobid string, submitErr error) {
submitErr = nil
- // Mark record as complete if anything errors out.
defer func() {
- if submitErr != nil {
- // This really should be an "Error" state, see #8018
- updateErr := arv.Update("containers", container.UUID,
- arvadosclient.Dict{
- "container": arvadosclient.Dict{"state": "Complete"}},
- nil)
- if updateErr != nil {
- log.Printf("Error updating container state to 'Complete' for %v: %q", container.UUID, updateErr)
- }
+ // If we didn't get as far as submitting a slurm job,
+ // unlock the container and return it to the queue.
+ if submitErr == nil {
+ // OK, no cleanup needed
+ return
+ }
+ err := dispatcher.Arv.Update("containers", container.UUID,
+ arvadosclient.Dict{
+ "container": arvadosclient.Dict{"state": "Queued"}},
+ nil)
+ if err != nil {
+ log.Printf("Error unlocking container %s: %v", container.UUID, err)
}
}()
// Create the command and attach to stdin/stdout
- cmd := sbatchCmd(container.UUID)
+ cmd := sbatchCmd(container)
stdinWriter, stdinerr := cmd.StdinPipe()
if stdinerr != nil {
submitErr = fmt.Errorf("Error creating stdin pipe %v: %q", container.UUID, stdinerr)
return
}
+ // Mutex between squeue sync and running sbatch or scancel.
+ squeueUpdater.SlurmLock.Lock()
+ defer squeueUpdater.SlurmLock.Unlock()
+
err := cmd.Start()
if err != nil {
submitErr = fmt.Errorf("Error starting %v: %v", cmd.Args, err)
stdoutChan := make(chan []byte)
go func() {
b, _ := ioutil.ReadAll(stdoutReader)
+ stdoutReader.Close()
stdoutChan <- b
- close(stdoutChan)
}()
stderrChan := make(chan []byte)
go func() {
b, _ := ioutil.ReadAll(stderrReader)
+ stderrReader.Close()
stderrChan <- b
- close(stderrChan)
}()
// Send a tiny script on stdin to execute the crunch-run command
stdoutMsg := <-stdoutChan
stderrmsg := <-stderrChan
+ close(stdoutChan)
+ close(stderrChan)
+
if err != nil {
submitErr = fmt.Errorf("Container submission failed %v: %v %v", cmd.Args, err, stderrmsg)
return
}
// If everything worked out, got the jobid on stdout
- jobid = string(stdoutMsg)
+ jobid = strings.TrimSpace(string(stdoutMsg))
return
}
-// finalizeRecordOnFinish uses 'strigger' command to register a script that will run on
-// the slurm controller when the job finishes.
-func finalizeRecordOnFinish(jobid, containerUUID, finishCommand, apiHost, apiToken, apiInsecure string) {
- cmd := striggerCmd(jobid, containerUUID, finishCommand, apiHost, apiToken, apiInsecure)
- cmd.Stdout = os.Stdout
- cmd.Stderr = os.Stderr
- err := cmd.Run()
- if err != nil {
- log.Printf("While setting up strigger: %v", err)
- }
-}
-
-// Run a queued container.
-// Set container state to locked (TBD)
-// Submit job to slurm to execute crunch-run command for the container
-// If the container priority becomes zero while crunch job is still running, cancel the job.
-func run(container Container, crunchRunCommand, finishCommand string, priorityPollInterval int) {
+// If the container is marked as Locked, check if it is already in the slurm
+// queue. If not, submit it.
+//
+// If the container is marked as Running, check if it is in the slurm queue.
+// If not, mark it as Cancelled.
+func monitorSubmitOrCancel(dispatcher *dispatch.Dispatcher, container dispatch.Container, monitorDone *bool) {
+ submitted := false
+ for !*monitorDone {
+ if squeueUpdater.CheckSqueue(container.UUID) {
+ // Found in the queue, so continue monitoring
+ submitted = true
+ } else if container.State == dispatch.Locked && !submitted {
+ // Not in queue but in Locked state and we haven't
+ // submitted it yet, so submit it.
+
+ log.Printf("About to submit queued container %v", container.UUID)
+
+ if _, err := submit(dispatcher, container, *crunchRunCommand); err != nil {
+ log.Printf("Error submitting container %s to slurm: %v",
+ container.UUID, err)
+ // maybe sbatch is broken, put it back to queued
+ dispatcher.UpdateState(container.UUID, dispatch.Queued)
+ }
+ submitted = true
+ } else {
+ // Not in queue and we are not going to submit it.
+ // Refresh the container state. If it is
+ // Complete/Cancelled, do nothing, if it is Locked then
+ // release it back to the Queue, if it is Running then
+ // clean up the record.
+
+ var con dispatch.Container
+ err := dispatcher.Arv.Get("containers", container.UUID, nil, &con)
+ if err != nil {
+ log.Printf("Error getting final container state: %v", err)
+ }
- jobid, err := submit(container, crunchRunCommand)
- if err != nil {
- log.Printf("Error queuing container run: %v", err)
- return
- }
+ var st string
+ switch con.State {
+ case dispatch.Locked:
+ st = dispatch.Queued
+ case dispatch.Running:
+ st = dispatch.Cancelled
+ default:
+ // Container state is Queued, Complete or Cancelled so stop monitoring it.
+ return
+ }
- insecure := "0"
- if arv.ApiInsecure {
- insecure = "1"
- }
- finalizeRecordOnFinish(jobid, container.UUID, finishCommand, arv.ApiServer, arv.ApiToken, insecure)
-
- // Update container status to Running, this is a temporary workaround
- // to avoid resubmitting queued containers because record locking isn't
- // implemented yet.
- err = arv.Update("containers", container.UUID,
- arvadosclient.Dict{
- "container": arvadosclient.Dict{"state": "Running"}},
- nil)
- if err != nil {
- log.Printf("Error updating container state to 'Running' for %v: %q", container.UUID, err)
+ log.Printf("Container %s in state %v but missing from slurm queue, changing to %v.",
+ container.UUID, con.State, st)
+ dispatcher.UpdateState(container.UUID, st)
+ }
}
+}
- log.Printf("Submitted container run for %v", container.UUID)
-
- containerUUID := container.UUID
-
- // A goroutine to terminate the runner if container priority becomes zero
- priorityTicker := time.NewTicker(time.Duration(priorityPollInterval) * time.Second)
- go func() {
- for _ = range priorityTicker.C {
- var container Container
- err := arv.Get("containers", containerUUID, nil, &container)
- if err != nil {
- log.Printf("Error getting container info for %v: %q", container.UUID, err)
- } else {
- if container.Priority == 0 {
- log.Printf("Canceling container %v", container.UUID)
- priorityTicker.Stop()
- cancelcmd := exec.Command("scancel", "--name="+container.UUID)
- cancelcmd.Run()
- }
- if container.State == "Complete" {
- priorityTicker.Stop()
+// Run or monitor a container.
+//
+// Monitor status updates. If the priority changes to zero, cancel the
+// container using scancel.
+func run(dispatcher *dispatch.Dispatcher,
+ container dispatch.Container,
+ status chan dispatch.Container) {
+
+ log.Printf("Monitoring container %v started", container.UUID)
+ defer log.Printf("Monitoring container %v finished", container.UUID)
+
+ monitorDone := false
+ go monitorSubmitOrCancel(dispatcher, container, &monitorDone)
+
+ for container = range status {
+ if container.State == dispatch.Locked || container.State == dispatch.Running {
+ if container.Priority == 0 {
+ log.Printf("Canceling container %s", container.UUID)
+
+ // Mutex between squeue sync and running sbatch or scancel.
+ squeueUpdater.SlurmLock.Lock()
+ err := scancelCmd(container).Run()
+ squeueUpdater.SlurmLock.Unlock()
+
+ if err != nil {
+ log.Printf("Error stopping container %s with scancel: %v",
+ container.UUID, err)
+ if squeueUpdater.CheckSqueue(container.UUID) {
+ log.Printf("Container %s is still in squeue after scancel.",
+ container.UUID)
+ continue
+ }
}
+
+ err = dispatcher.UpdateState(container.UUID, dispatch.Cancelled)
}
}
- }()
-
+ }
+ monitorDone = true
}
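Put differently, when a container owned by this dispatcher disappears from squeue, monitorSubmitOrCancel applies the following recovery mapping (an illustrative restatement, assuming the string state constants from the dispatch package used throughout this diff; squeueRecoveryState is a made-up name):

var squeueRecoveryState = map[string]string{
	dispatch.Locked:  dispatch.Queued,    // never reached slurm; release back to the queue
	dispatch.Running: dispatch.Cancelled, // slurm job vanished; finalize the record
}
// Containers already in Queued, Complete or Cancelled are left alone and monitoring stops.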
package main
import (
+ "bytes"
+ "fmt"
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
"git.curoverse.com/arvados.git/sdk/go/arvadostest"
-
- "io/ioutil"
+ "git.curoverse.com/arvados.git/sdk/go/dispatch"
+ "io"
"log"
"net/http"
"net/http/httptest"
"os"
"os/exec"
"strings"
- "syscall"
"testing"
"time"
func (s *TestSuite) SetUpSuite(c *C) {
initialArgs = os.Args
- arvadostest.StartAPI()
}
func (s *TestSuite) TearDownSuite(c *C) {
- arvadostest.StopAPI()
}
func (s *TestSuite) SetUpTest(c *C) {
args := []string{"crunch-dispatch-slurm"}
os.Args = args
- var err error
- arv, err = arvadosclient.MakeArvadosClient()
- if err != nil {
- c.Fatalf("Error making arvados client: %s", err)
- }
+ arvadostest.StartAPI()
+ os.Setenv("ARVADOS_API_TOKEN", arvadostest.Dispatch1Token)
}
func (s *TestSuite) TearDownTest(c *C) {
- arvadostest.ResetEnv()
os.Args = initialArgs
+ arvadostest.StopAPI()
}
func (s *MockArvadosServerSuite) TearDownTest(c *C) {
arvadostest.ResetEnv()
}
-func (s *TestSuite) Test_doMain(c *C) {
- args := []string{"-poll-interval", "2", "-container-priority-poll-interval", "1", "-crunch-run-command", "echo"}
- os.Args = append(os.Args, args...)
+func (s *TestSuite) TestIntegrationNormal(c *C) {
+ container := s.integrationTest(c, func() *exec.Cmd { return exec.Command("echo", "zzzzz-dz642-queuedcontainer") },
+ []string(nil),
+ func(dispatcher *dispatch.Dispatcher, container dispatch.Container) {
+ dispatcher.UpdateState(container.UUID, dispatch.Running)
+ time.Sleep(3 * time.Second)
+ dispatcher.UpdateState(container.UUID, dispatch.Complete)
+ })
+ c.Check(container.State, Equals, "Complete")
+}
- var sbatchCmdLine []string
- var striggerCmdLine []string
+func (s *TestSuite) TestIntegrationCancel(c *C) {
// Override sbatchCmd
- defer func(orig func(string) *exec.Cmd) {
- sbatchCmd = orig
- }(sbatchCmd)
- sbatchCmd = func(uuid string) *exec.Cmd {
- sbatchCmdLine = sbatchFunc(uuid).Args
- return exec.Command("echo", uuid)
+ var scancelCmdLine []string
+ defer func(orig func(dispatch.Container) *exec.Cmd) {
+ scancelCmd = orig
+ }(scancelCmd)
+ scancelCmd = func(container dispatch.Container) *exec.Cmd {
+ scancelCmdLine = scancelFunc(container).Args
+ return exec.Command("echo")
}
- // Override striggerCmd
- defer func(orig func(jobid, containerUUID, finishCommand,
- apiHost, apiToken, apiInsecure string) *exec.Cmd) {
- striggerCmd = orig
- }(striggerCmd)
- striggerCmd = func(jobid, containerUUID, finishCommand, apiHost, apiToken, apiInsecure string) *exec.Cmd {
- striggerCmdLine = striggerFunc(jobid, containerUUID, finishCommand,
- apiHost, apiToken, apiInsecure).Args
- go func() {
- time.Sleep(5 * time.Second)
- arv.Update("containers", containerUUID,
+ container := s.integrationTest(c, func() *exec.Cmd { return exec.Command("echo", "zzzzz-dz642-queuedcontainer") },
+ []string(nil),
+ func(dispatcher *dispatch.Dispatcher, container dispatch.Container) {
+ dispatcher.UpdateState(container.UUID, dispatch.Running)
+ time.Sleep(1 * time.Second)
+ dispatcher.Arv.Update("containers", container.UUID,
arvadosclient.Dict{
- "container": arvadosclient.Dict{"state": "Complete"}},
+ "container": arvadosclient.Dict{"priority": 0}},
nil)
- }()
- return exec.Command("echo", "strigger")
+ })
+ c.Check(container.State, Equals, "Cancelled")
+ c.Check(scancelCmdLine, DeepEquals, []string{"scancel", "--name=zzzzz-dz642-queuedcontainer"})
+}
+
+func (s *TestSuite) TestIntegrationMissingFromSqueue(c *C) {
+ container := s.integrationTest(c, func() *exec.Cmd { return exec.Command("echo") }, []string{"sbatch", "--share", "--parsable",
+ fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
+ fmt.Sprintf("--mem-per-cpu=%d", 2862),
+ fmt.Sprintf("--cpus-per-task=%d", 4),
+ fmt.Sprintf("--priority=%d", 1)},
+ func(dispatcher *dispatch.Dispatcher, container dispatch.Container) {
+ dispatcher.UpdateState(container.UUID, dispatch.Running)
+ time.Sleep(3 * time.Second)
+ dispatcher.UpdateState(container.UUID, dispatch.Complete)
+ })
+ c.Check(container.State, Equals, "Cancelled")
+}
+
+func (s *TestSuite) integrationTest(c *C,
+ newSqueueCmd func() *exec.Cmd,
+ sbatchCmdComps []string,
+ runContainer func(*dispatch.Dispatcher, dispatch.Container)) dispatch.Container {
+ arvadostest.ResetEnv()
+
+ arv, err := arvadosclient.MakeArvadosClient()
+ c.Assert(err, IsNil)
+
+ var sbatchCmdLine []string
+
+ // Override sbatchCmd
+ defer func(orig func(dispatch.Container) *exec.Cmd) {
+ sbatchCmd = orig
+ }(sbatchCmd)
+ sbatchCmd = func(container dispatch.Container) *exec.Cmd {
+ sbatchCmdLine = sbatchFunc(container).Args
+ return exec.Command("sh")
}
- go func() {
- time.Sleep(8 * time.Second)
- sigChan <- syscall.SIGINT
- }()
+ // Override squeueCmd
+ defer func(orig func() *exec.Cmd) {
+ squeueCmd = orig
+ }(squeueCmd)
+ squeueCmd = newSqueueCmd
// There should be no queued containers now
params := arvadosclient.Dict{
"filters": [][]string{[]string{"state", "=", "Queued"}},
}
- var containers ContainerList
- err := arv.List("containers", params, &containers)
+ var containers dispatch.ContainerList
+ err = arv.List("containers", params, &containers)
c.Check(err, IsNil)
c.Check(len(containers.Items), Equals, 1)
- err = doMain()
- c.Check(err, IsNil)
+ echo := "echo"
+ crunchRunCommand = &echo
+
+ doneProcessing := make(chan struct{})
+ dispatcher := dispatch.Dispatcher{
+ Arv: arv,
+ PollInterval: time.Duration(1) * time.Second,
+ RunContainer: func(dispatcher *dispatch.Dispatcher,
+ container dispatch.Container,
+ status chan dispatch.Container) {
+ go runContainer(dispatcher, container)
+ run(dispatcher, container, status)
+ doneProcessing <- struct{}{}
+ },
+ DoneProcessing: doneProcessing}
+
+ squeueUpdater.StartMonitor(time.Duration(500) * time.Millisecond)
- c.Check(sbatchCmdLine, DeepEquals, []string{"sbatch", "--job-name=zzzzz-dz642-queuedcontainer", "--share", "--parsable"})
- c.Check(striggerCmdLine, DeepEquals, []string{"strigger", "--set", "--jobid=zzzzz-dz642-queuedcontainer\n", "--fini",
- "--program=/usr/bin/crunch-finish-slurm.sh " + os.Getenv("ARVADOS_API_HOST") + " 4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h 1 zzzzz-dz642-queuedcontainer"})
+ err = dispatcher.RunDispatcher()
+ c.Assert(err, IsNil)
+
+ squeueUpdater.Done()
+
+ c.Check(sbatchCmdLine, DeepEquals, sbatchCmdComps)
// There should be no queued containers now
err = arv.List("containers", params, &containers)
c.Check(len(containers.Items), Equals, 0)
// Previously "Queued" container should now be in "Complete" state
- var container Container
+ var container dispatch.Container
err = arv.Get("containers", "zzzzz-dz642-queuedcontainer", nil, &container)
c.Check(err, IsNil)
- c.Check(container.State, Equals, "Complete")
+ return container
}
func (s *MockArvadosServerSuite) Test_APIErrorGettingContainers(c *C) {
apiStubResponses := make(map[string]arvadostest.StubResponse)
+ apiStubResponses["/arvados/v1/api_client_authorizations/current"] = arvadostest.StubResponse{200, `{"uuid":"` + arvadostest.Dispatch1AuthUUID + `"}`}
apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
- testWithServerStub(c, apiStubResponses, "echo", "Error getting list of queued containers")
+ testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
}
func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
api := httptest.NewServer(&apiStub)
defer api.Close()
- arv = arvadosclient.ArvadosClient{
+ arv := arvadosclient.ArvadosClient{
Scheme: "http",
ApiServer: api.URL[7:],
ApiToken: "abc123",
Retries: 0,
}
- tempfile, err := ioutil.TempFile(os.TempDir(), "temp-log-file")
- c.Check(err, IsNil)
- defer os.Remove(tempfile.Name())
- log.SetOutput(tempfile)
+ buf := bytes.NewBuffer(nil)
+ log.SetOutput(io.MultiWriter(buf, os.Stderr))
+ defer log.SetOutput(os.Stderr)
+
+ crunchRunCommand = &crunchCmd
+
+ doneProcessing := make(chan struct{})
+ dispatcher := dispatch.Dispatcher{
+ Arv: arv,
+ PollInterval: time.Duration(1) * time.Second,
+ RunContainer: func(dispatcher *dispatch.Dispatcher,
+ container dispatch.Container,
+ status chan dispatch.Container) {
+ go func() {
+ time.Sleep(1 * time.Second)
+ dispatcher.UpdateState(container.UUID, dispatch.Running)
+ dispatcher.UpdateState(container.UUID, dispatch.Complete)
+ }()
+ run(dispatcher, container, status)
+ doneProcessing <- struct{}{}
+ },
+ DoneProcessing: doneProcessing}
go func() {
- time.Sleep(2 * time.Second)
- sigChan <- syscall.SIGTERM
+ for i := 0; i < 80 && !strings.Contains(buf.String(), expected); i++ {
+ time.Sleep(100 * time.Millisecond)
+ }
+ dispatcher.DoneProcessing <- struct{}{}
}()
- runQueuedContainers(2, 1, crunchCmd, crunchCmd)
+ err := dispatcher.RunDispatcher()
+ c.Assert(err, IsNil)
- buf, _ := ioutil.ReadFile(tempfile.Name())
- c.Check(strings.Contains(string(buf), expected), Equals, true)
+ c.Check(buf.String(), Matches, `(?ms).*`+expected+`.*`)
}
+++ /dev/null
-#!/bin/sh
-
-# Script to be called by strigger when a job finishes. This ensures the job
-# record has the correct state "Complete" even if the node running the job
-# failed.
-
-ARVADOS_API_HOST=$1
-ARVADOS_API_TOKEN=$2
-ARVADOS_API_HOST_INSECURE=$3
-uuid=$4
-jobid=$5
-
-# If it is possible to attach metadata to job records we could look up the
-# above information instead of getting it on the command line. For example,
-# this is the recipe for getting the job name (container uuid) from the job id.
-#uuid=$(squeue --jobs=$jobid --states=all --format=%j --noheader)
-
-export ARVADOS_API_HOST ARVADOS_API_TOKEN ARVADOS_API_HOST_INSECURE
-
-exec arv container update --uuid $uuid --container '{"state": "Complete"}'
--- /dev/null
+package main
+
+import (
+ "bufio"
+ "log"
+ "os/exec"
+ "sync"
+ "time"
+)
+
+// Squeue implements asynchronous polling monitor of the SLURM queue using the
+// command 'squeue'.
+type Squeue struct {
+ squeueContents []string
+ squeueDone chan struct{}
+ squeueCond *sync.Cond
+ SlurmLock sync.Mutex
+}
+
+// squeueFunc is the default implementation of squeueCmd: it builds the
+// command that lists the job names currently in the SLURM queue.
+func squeueFunc() *exec.Cmd {
+ return exec.Command("squeue", "--format=%j")
+}
+
+var squeueCmd = squeueFunc
+
+// RunSqueue runs squeue once and captures the output. If it succeeds, it
+// sets "squeueContents" and wakes up any goroutines waiting on squeueCond in
+// CheckSqueue(). If there was an error, it logs the error and leaves the
+// waiting goroutines blocked.
+func (squeue *Squeue) RunSqueue() {
+ var newSqueueContents []string
+
+ // Mutex between squeue sync and running sbatch or scancel. This
+ // establishes a sequence so that squeue doesn't run concurrently with
+ // sbatch or scancel; the next update of squeue will occur only after
+ // sbatch or scancel has completed.
+ squeue.SlurmLock.Lock()
+ defer squeue.SlurmLock.Unlock()
+
+ // Also ensure unlock on all return paths
+
+ cmd := squeueCmd()
+ sq, err := cmd.StdoutPipe()
+ if err != nil {
+ log.Printf("Error creating stdout pipe for squeue: %v", err)
+ return
+ }
+	if err := cmd.Start(); err != nil {
+		log.Printf("Error starting squeue: %v", err)
+		return
+	}
+ scanner := bufio.NewScanner(sq)
+ for scanner.Scan() {
+ newSqueueContents = append(newSqueueContents, scanner.Text())
+ }
+ if err := scanner.Err(); err != nil {
+ cmd.Wait()
+ log.Printf("Error reading from squeue pipe: %v", err)
+ return
+ }
+
+ err = cmd.Wait()
+ if err != nil {
+ log.Printf("Error running squeue: %v", err)
+ return
+ }
+
+ squeue.squeueCond.L.Lock()
+ squeue.squeueContents = newSqueueContents
+ squeue.squeueCond.Broadcast()
+ squeue.squeueCond.L.Unlock()
+}
+
+// CheckSqueue checks whether a given container UUID is in the SLURM queue.
+// This does not run squeue directly; it blocks until woken up by the next
+// successful update of squeue.
+func (squeue *Squeue) CheckSqueue(uuid string) bool {
+ squeue.squeueCond.L.Lock()
+ // block until next squeue broadcast signaling an update.
+ squeue.squeueCond.Wait()
+ contents := squeue.squeueContents
+ squeue.squeueCond.L.Unlock()
+
+ for _, k := range contents {
+ if k == uuid {
+ return true
+ }
+ }
+ return false
+}
+
+// StartMonitor starts the squeue monitoring goroutine.
+func (squeue *Squeue) StartMonitor(pollInterval time.Duration) {
+ squeue.squeueCond = sync.NewCond(&sync.Mutex{})
+ squeue.squeueDone = make(chan struct{})
+ go squeue.SyncSqueue(pollInterval)
+}
+
+// Done stops the squeue monitoring goroutine.
+func (squeue *Squeue) Done() {
+ squeue.squeueDone <- struct{}{}
+ close(squeue.squeueDone)
+}
+
+// SyncSqueue calls RunSqueue() at the given polling interval until
+// terminated by calling Done().
+func (squeue *Squeue) SyncSqueue(pollInterval time.Duration) {
+ ticker := time.NewTicker(pollInterval)
+ for {
+ select {
+ case <-squeue.squeueDone:
+ return
+ case <-ticker.C:
+ squeue.RunSqueue()
+ }
+ }
+}
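
Not part of this patch, but for illustration: a caller is expected to use the monitor above roughly as in the sketch below (the poll interval and container UUID are made up).

	// Hypothetical usage of the Squeue monitor defined above.
	var squeueUpdater Squeue

	func exampleSqueueUsage(uuid string) {
		// Refresh the queue contents every half second in the background.
		squeueUpdater.StartMonitor(500 * time.Millisecond)
		defer squeueUpdater.Done()

		// Block until the next successful squeue refresh, then check
		// whether the container's job name is still listed.
		if squeueUpdater.CheckSqueue(uuid) {
			log.Printf("%s is still in the SLURM queue", uuid)
		} else {
			log.Printf("%s is no longer in the SLURM queue", uuid)
		}
	}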
"os"
"os/exec"
"os/signal"
+ "path"
"strings"
"sync"
"syscall"
Create(resourceType string, parameters arvadosclient.Dict, output interface{}) error
Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) (err error)
+ Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) (err error)
}
// ErrCancelled is the error returned when the container is cancelled.
PortableDataHash string `json:"portable_data_hash"`
UUID string `json:"uuid"`
DeviceType string `json:"device_type"`
+ Path string `json:"path"`
}
// Collection record returned by the API server.
PortableDataHash string `json:"portable_data_hash"`
}
+type RuntimeConstraints struct {
+ API *bool
+}
+
// ContainerRecord is the container record returned by the API server.
type ContainerRecord struct {
- UUID string `json:"uuid"`
- Command []string `json:"command"`
- ContainerImage string `json:"container_image"`
- Cwd string `json:"cwd"`
- Environment map[string]string `json:"environment"`
- Mounts map[string]Mount `json:"mounts"`
- OutputPath string `json:"output_path"`
- Priority int `json:"priority"`
- RuntimeConstraints map[string]interface{} `json:"runtime_constraints"`
- State string `json:"state"`
- Output string `json:"output"`
+ UUID string `json:"uuid"`
+ Command []string `json:"command"`
+ ContainerImage string `json:"container_image"`
+ Cwd string `json:"cwd"`
+ Environment map[string]string `json:"environment"`
+ Mounts map[string]Mount `json:"mounts"`
+ OutputPath string `json:"output_path"`
+ Priority int `json:"priority"`
+ RuntimeConstraints RuntimeConstraints `json:"runtime_constraints"`
+ State string `json:"state"`
+ Output string `json:"output"`
+}
+
+// APIClientAuthorization is an arvados#api_client_authorization resource.
+type APIClientAuthorization struct {
+ UUID string `json:"uuid"`
+ APIToken string `json:"api_token"`
}
// NewLogWriter is a factory function to create a new log writer.
type NewLogWriter func(name string) io.WriteCloser
-type RunArvMount func([]string) (*exec.Cmd, error)
+type RunArvMount func(args []string, tok string) (*exec.Cmd, error)
type MkTempDir func(string, string) (string, error)
Kc IKeepClient
ContainerRecord
dockerclient.ContainerConfig
+ dockerclient.HostConfig
+ token string
ContainerID string
ExitCode *int
NewLogWriter
loggingDone chan bool
CrunchLog *ThrottledLogger
- Stdout *ThrottledLogger
+ Stdout io.WriteCloser
Stderr *ThrottledLogger
LogCollection *CollectionWriter
LogsPDH *string
return nil
}
-func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string) (c *exec.Cmd, err error) {
+func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string, token string) (c *exec.Cmd, err error) {
c = exec.Command("arv-mount", arvMountCmd...)
+
+ // Copy our environment, but override ARVADOS_API_TOKEN with
+ // the container auth token.
+ c.Env = nil
+ for _, s := range os.Environ() {
+ if !strings.HasPrefix(s, "ARVADOS_API_TOKEN=") {
+ c.Env = append(c.Env, s)
+ }
+ }
+ c.Env = append(c.Env, "ARVADOS_API_TOKEN="+token)
+
nt := NewThrottledLogger(runner.NewLogWriter("arv-mount"))
c.Stdout = nt
c.Stderr = nt
runner.Binds = nil
for bind, mnt := range runner.ContainerRecord.Mounts {
+ if bind == "stdout" {
+ // Is it a "file" mount kind?
+ if mnt.Kind != "file" {
+ return fmt.Errorf("Unsupported mount kind '%s' for stdout. Only 'file' is supported.", mnt.Kind)
+ }
+
+ // Does path start with OutputPath?
+ prefix := runner.ContainerRecord.OutputPath
+ if !strings.HasSuffix(prefix, "/") {
+ prefix += "/"
+ }
+ if !strings.HasPrefix(mnt.Path, prefix) {
+ return fmt.Errorf("Stdout path does not start with OutputPath: %s, %s", mnt.Path, prefix)
+ }
+ }
+
if mnt.Kind == "collection" {
var src string
if mnt.UUID != "" && mnt.PortableDataHash != "" {
} else {
runner.Binds = append(runner.Binds, bind)
}
- } else {
- return fmt.Errorf("Unknown mount kind '%s'", mnt.Kind)
}
}
}
arvMountCmd = append(arvMountCmd, runner.ArvMountPoint)
- runner.ArvMount, err = runner.RunArvMount(arvMountCmd)
+ token, err := runner.ContainerToken()
+ if err != nil {
+ return fmt.Errorf("could not get container token: %s", err)
+ }
+
+ runner.ArvMount, err = runner.RunArvMount(arvMountCmd, token)
if err != nil {
return fmt.Errorf("While trying to start arv-mount: %v", err)
}
runner.loggingDone = make(chan bool)
- runner.Stdout = NewThrottledLogger(runner.NewLogWriter("stdout"))
+ if stdoutMnt, ok := runner.ContainerRecord.Mounts["stdout"]; ok {
+ stdoutPath := stdoutMnt.Path[len(runner.ContainerRecord.OutputPath):]
+ index := strings.LastIndex(stdoutPath, "/")
+ if index > 0 {
+ subdirs := stdoutPath[:index]
+ if subdirs != "" {
+ st, err := os.Stat(runner.HostOutputDir)
+ if err != nil {
+ return fmt.Errorf("While Stat on temp dir: %v", err)
+ }
+ stdoutPath := path.Join(runner.HostOutputDir, subdirs)
+ err = os.MkdirAll(stdoutPath, st.Mode()|os.ModeSetgid|0777)
+ if err != nil {
+ return fmt.Errorf("While MkdirAll %q: %v", stdoutPath, err)
+ }
+ }
+ }
+ stdoutFile, err := os.Create(path.Join(runner.HostOutputDir, stdoutPath))
+ if err != nil {
+ return fmt.Errorf("While creating stdout file: %v", err)
+ }
+ runner.Stdout = stdoutFile
+ } else {
+ runner.Stdout = NewThrottledLogger(runner.NewLogWriter("stdout"))
+ }
runner.Stderr = NewThrottledLogger(runner.NewLogWriter("stderr"))
go runner.ProcessDockerAttach(containerReader)
return nil
}
-// StartContainer creates the container and runs it.
-func (runner *ContainerRunner) StartContainer() (err error) {
+// CreateContainer creates the docker container.
+func (runner *ContainerRunner) CreateContainer() error {
runner.CrunchLog.Print("Creating Docker container")
- runner.CancelLock.Lock()
- defer runner.CancelLock.Unlock()
-
- if runner.Cancelled {
- return ErrCancelled
- }
-
runner.ContainerConfig.Cmd = runner.ContainerRecord.Command
if runner.ContainerRecord.Cwd != "." {
runner.ContainerConfig.WorkingDir = runner.ContainerRecord.Cwd
}
+
for k, v := range runner.ContainerRecord.Environment {
runner.ContainerConfig.Env = append(runner.ContainerConfig.Env, k+"="+v)
}
+ if wantAPI := runner.ContainerRecord.RuntimeConstraints.API; wantAPI != nil && *wantAPI {
+ tok, err := runner.ContainerToken()
+ if err != nil {
+ return err
+ }
+ runner.ContainerConfig.Env = append(runner.ContainerConfig.Env,
+ "ARVADOS_API_TOKEN="+tok,
+ "ARVADOS_API_HOST="+os.Getenv("ARVADOS_API_HOST"),
+ "ARVADOS_API_HOST_INSECURE="+os.Getenv("ARVADOS_API_HOST_INSECURE"),
+ )
+ }
+
runner.ContainerConfig.NetworkDisabled = true
+
+ var err error
runner.ContainerID, err = runner.Docker.CreateContainer(&runner.ContainerConfig, "", nil)
if err != nil {
return fmt.Errorf("While creating container: %v", err)
}
- hostConfig := &dockerclient.HostConfig{Binds: runner.Binds,
+
+ runner.HostConfig = dockerclient.HostConfig{Binds: runner.Binds,
LogConfig: dockerclient.LogConfig{Type: "none"}}
- err = runner.AttachStreams()
- if err != nil {
- return err
- }
+ return runner.AttachStreams()
+}
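
For example, a hypothetical container record fragment like the one below (field name taken from the RuntimeConstraints struct above; other values made up) makes CreateContainer pass the Arvados API credentials into the container's environment; without it, none of the ARVADOS_* variables are injected.

	// Illustrative record fragment; only runtime_constraints matters here.
	const exampleAPIRecord = `{
		"command": ["arv-ls"],
		"output_path": "/tmp",
		"runtime_constraints": {"API": true}
	}`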
+// StartContainer starts the docker container created by CreateContainer.
+func (runner *ContainerRunner) StartContainer() error {
runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
- err = runner.Docker.StartContainer(runner.ContainerID, hostConfig)
+ err := runner.Docker.StartContainer(runner.ContainerID, &runner.HostConfig)
if err != nil {
- return fmt.Errorf("While starting container: %v", err)
+ return fmt.Errorf("could not start container: %v", err)
}
-
return nil
}
runner.CrunchLog = NewThrottledLogger(&ArvLogWriter{runner.ArvClient, runner.ContainerRecord.UUID,
"crunch-run", nil})
+ if runner.LogsPDH != nil {
+ // If we have already assigned something to LogsPDH,
+ // we must be closing the re-opened log, which won't
+ // end up getting attached to the container record and
+ // therefore doesn't need to be saved as a collection
+ // -- it exists only to send logs to other channels.
+ return nil
+ }
+
mt, err := runner.LogCollection.ManifestText()
if err != nil {
return fmt.Errorf("While creating log manifest: %v", err)
return fmt.Errorf("While creating log collection: %v", err)
}
- runner.LogsPDH = new(string)
- *runner.LogsPDH = response.PortableDataHash
+ runner.LogsPDH = &response.PortableDataHash
return nil
}
// UpdateContainerRecordRunning updates the container state to "Running"
func (runner *ContainerRunner) UpdateContainerRecordRunning() error {
+ runner.CancelLock.Lock()
+ defer runner.CancelLock.Unlock()
+ if runner.Cancelled {
+ return ErrCancelled
+ }
return runner.ArvClient.Update("containers", runner.ContainerRecord.UUID,
arvadosclient.Dict{"container": arvadosclient.Dict{"state": "Running"}}, nil)
}
-// UpdateContainerRecordComplete updates the container record state on API
-// server to "Complete" or "Cancelled"
-func (runner *ContainerRunner) UpdateContainerRecordComplete() error {
- update := arvadosclient.Dict{}
- if runner.LogsPDH != nil {
- update["log"] = *runner.LogsPDH
+// ContainerToken returns the api_token the container (and any
+// arv-mount processes) are allowed to use.
+func (runner *ContainerRunner) ContainerToken() (string, error) {
+ if runner.token != "" {
+ return runner.token, nil
}
- if runner.ExitCode != nil {
- update["exit_code"] = *runner.ExitCode
- }
- if runner.OutputPDH != nil {
- update["output"] = runner.OutputPDH
+
+ var auth APIClientAuthorization
+ err := runner.ArvClient.Call("GET", "containers", runner.ContainerRecord.UUID, "auth", nil, &auth)
+ if err != nil {
+ return "", err
}
+ runner.token = auth.APIToken
+ return runner.token, nil
+}
+// UpdateContainerRecordFinal updates the container record state on the API
+// server to "Complete" or "Cancelled".
+func (runner *ContainerRunner) UpdateContainerRecordFinal() error {
+ update := arvadosclient.Dict{}
update["state"] = runner.finalState
-
+ if runner.finalState == "Complete" {
+ if runner.LogsPDH != nil {
+ update["log"] = *runner.LogsPDH
+ }
+ if runner.ExitCode != nil {
+ update["exit_code"] = *runner.ExitCode
+ }
+ if runner.OutputPDH != nil {
+ update["output"] = *runner.OutputPDH
+ }
+ }
return runner.ArvClient.Update("containers", runner.ContainerRecord.UUID, arvadosclient.Dict{"container": update}, nil)
}
+// IsCancelled returns the value of Cancelled, with goroutine safety.
+func (runner *ContainerRunner) IsCancelled() bool {
+ runner.CancelLock.Lock()
+ defer runner.CancelLock.Unlock()
+ return runner.Cancelled
+}
+
// NewArvLogWriter creates an ArvLogWriter
func (runner *ContainerRunner) NewArvLogWriter(name string) io.WriteCloser {
return &ArvLogWriter{runner.ArvClient, runner.ContainerRecord.UUID, name, runner.LogCollection.Open(name + ".txt")}
runner.CrunchLog.Printf("Executing on host '%s'", hostname)
}
- var runerr, waiterr error
+ // Clean up temporary directories _after_ finalizing
+ // everything (if we've made any by then)
+ defer runner.CleanupDirs()
+
+ runner.finalState = "Queued"
defer func() {
- if err != nil {
- runner.CrunchLog.Print(err)
+ // checkErr prints e (unless it's nil) and sets err to
+ // e (unless err is already non-nil). Thus, if err
+ // hasn't already been assigned when Run() returns,
+ // this cleanup func will cause Run() to return the
+ // first non-nil error that is passed to checkErr().
+ checkErr := func(e error) {
+ if e == nil {
+ return
+ }
+ runner.CrunchLog.Print(e)
+ if err == nil {
+ err = e
+ }
}
- if runner.Cancelled {
- runner.finalState = "Cancelled"
- } else {
- runner.finalState = "Complete"
- }
+ // Log the error encountered in Run(), if any
+ checkErr(err)
- // (6) capture output
- outputerr := runner.CaptureOutput()
- if outputerr != nil {
- runner.CrunchLog.Print(outputerr)
+ if runner.finalState == "Queued" {
+ runner.UpdateContainerRecordFinal()
+ return
}
- // (7) clean up temporary directories
- runner.CleanupDirs()
-
- // (8) write logs
- logerr := runner.CommitLogs()
- if logerr != nil {
- runner.CrunchLog.Print(logerr)
+ if runner.IsCancelled() {
+ runner.finalState = "Cancelled"
+ // but don't return yet -- we still want to
+ // capture partial output and write logs
}
- // (9) update container record with results
- updateerr := runner.UpdateContainerRecordComplete()
- if updateerr != nil {
- runner.CrunchLog.Print(updateerr)
- }
+ checkErr(runner.CaptureOutput())
+ checkErr(runner.CommitLogs())
+ checkErr(runner.UpdateContainerRecordFinal())
+ // The real log is already closed, but then we opened
+ // a new one in case we needed to log anything while
+ // finalizing.
runner.CrunchLog.Close()
-
- if err == nil {
- if runerr != nil {
- err = runerr
- } else if waiterr != nil {
- err = waiterr
- } else if logerr != nil {
- err = logerr
- } else if updateerr != nil {
- err = updateerr
- }
- }
}()
err = runner.ArvClient.Get("containers", runner.ContainerRecord.UUID, nil, &runner.ContainerRecord)
if err != nil {
- return fmt.Errorf("While getting container record: %v", err)
+ err = fmt.Errorf("While getting container record: %v", err)
+ return
}
- // (1) setup signal handling
+ // setup signal handling
runner.SetupSignals()
- // (2) check for and/or load image
+ // check for and/or load image
err = runner.LoadImage()
if err != nil {
- return fmt.Errorf("While loading container image: %v", err)
+ err = fmt.Errorf("While loading container image: %v", err)
+ return
}
- // (3) set up FUSE mount and binds
+ // set up FUSE mount and binds
err = runner.SetupMounts()
if err != nil {
- return fmt.Errorf("While setting up mounts: %v", err)
+ err = fmt.Errorf("While setting up mounts: %v", err)
+ return
}
- // (3) create and start container
- err = runner.StartContainer()
+ err = runner.CreateContainer()
if err != nil {
- if err == ErrCancelled {
- err = nil
- }
return
}
- // (4) update container record state
+ if runner.IsCancelled() {
+ return
+ }
+
err = runner.UpdateContainerRecordRunning()
if err != nil {
- runner.CrunchLog.Print(err)
+ return
}
+ runner.finalState = "Cancelled"
- // (5) wait for container to finish
- waiterr = runner.WaitFinish()
+ err = runner.StartContainer()
+ if err != nil {
+ return
+ }
+ err = runner.WaitFinish()
+ if err == nil {
+ runner.finalState = "Complete"
+ }
return
}
"os/exec"
"sort"
"strings"
+ "sync"
"syscall"
"testing"
"time"
ContainerRecord
Logs map[string]*bytes.Buffer
WasSetRunning bool
+ sync.Mutex
}
type KeepTestClient struct {
var otherManifest = ". 68a84f561b1d1708c6baff5e019a9ab3+46+Ae5d0af96944a3690becb1decdf60cc1c937f556d@5693216f 0:46:md5sum.txt\n"
var otherPDH = "a3e8f74c6f101eae01fa08bfb4e49b3a+54"
+var fakeAuthUUID = "zzzzz-gj3su-55pqoyepgi2glem"
+var fakeAuthToken = "a3ltuwzqcu2u4sc0q7yhpc2w7s00fdcqecg5d6e0u3pfohmbjt"
+
type TestDockerClient struct {
imageLoaded string
logReader io.ReadCloser
parameters arvadosclient.Dict,
output interface{}) error {
+ this.Mutex.Lock()
+ defer this.Mutex.Unlock()
+
this.Calls += 1
this.Content = append(this.Content, parameters)
return nil
}
+func (this *ArvTestClient) Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) error {
+ switch {
+ case method == "GET" && resourceType == "containers" && action == "auth":
+ return json.Unmarshal([]byte(`{
+ "kind": "arvados#api_client_authorization",
+ "uuid": "`+fakeAuthUUID+`",
+ "api_token": "`+fakeAuthToken+`"
+ }`), output)
+ default:
+ return fmt.Errorf("Not found")
+ }
+}
+
func (this *ArvTestClient) Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error {
if resourceType == "collections" {
if uuid == hwPDH {
}
func (this *ArvTestClient) Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) (err error) {
+ this.Mutex.Lock()
+ defer this.Mutex.Unlock()
this.Calls += 1
this.Content = append(this.Content, parameters)
if resourceType == "containers" {
if parameters["container"].(arvadosclient.Dict)["state"] == "Running" {
this.WasSetRunning = true
}
+ }
+ return nil
+}
+// CalledWith returns the parameters from the first API call whose
+// parameters match the given jpath/value pair. E.g.,
+// CalledWith("foo.bar", "baz") returns the parameters in which
+// parameters["foo"]["bar"]=="baz". If no call matches, it returns nil.
+func (this *ArvTestClient) CalledWith(jpath, expect string) arvadosclient.Dict {
+call:
+	for _, content := range this.Content {
+ var v interface{} = content
+ for _, k := range strings.Split(jpath, ".") {
+ if dict, ok := v.(arvadosclient.Dict); !ok {
+ continue call
+ } else {
+ v = dict[k]
+ }
+ }
+ if v, ok := v.(string); ok && v == expect {
+ return content
+ }
}
return nil
}
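
For example (an illustrative assertion, not taken from this patch), a test that drives a container to completion could verify the final state update with:

	// Hypothetical use of the CalledWith helper defined above.
	c.Check(api.CalledWith("container.state", "Complete"), Not(IsNil))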
return nil
}
+func (this ArvErrorTestClient) Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) error {
+ return errors.New("ArvError")
+}
+
func (this ArvErrorTestClient) Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error {
return errors.New("ArvError")
}
err := cr.LoadImage()
c.Check(err, IsNil)
+ err = cr.CreateContainer()
+ c.Check(err, IsNil)
+
err = cr.StartContainer()
c.Check(err, IsNil)
*cr.ExitCode = 42
cr.finalState = "Complete"
- err := cr.UpdateContainerRecordComplete()
+ err := cr.UpdateContainerRecordFinal()
c.Check(err, IsNil)
c.Check(api.Content[0]["container"].(arvadosclient.Dict)["log"], Equals, *cr.LogsPDH)
cr.Cancelled = true
cr.finalState = "Cancelled"
- err := cr.UpdateContainerRecordComplete()
+ err := cr.UpdateContainerRecordFinal()
c.Check(err, IsNil)
c.Check(api.Content[0]["container"].(arvadosclient.Dict)["log"], IsNil)
}
// Used by the TestFullRun*() tests below to DRY up the boilerplate needed for a full
-// dress rehersal of the Run() function, starting from a JSON container record.
+// dress rehearsal of the Run() function, starting from a JSON container record.
func FullRunHelper(c *C, record string, fn func(t *TestDockerClient)) (api *ArvTestClient, cr *ContainerRunner) {
rec := ContainerRecord{}
- err := json.NewDecoder(strings.NewReader(record)).Decode(&rec)
+ err := json.Unmarshal([]byte(record), &rec)
c.Check(err, IsNil)
docker := NewTestDockerClient()
t.finish <- dockerclient.WaitResult{ExitCode: 1}
})
- c.Check(api.Calls, Equals, 8)
+ c.Assert(api.Calls, Equals, 8)
c.Check(api.Content[7]["container"].(arvadosclient.Dict)["log"], NotNil)
c.Check(api.Content[7]["container"].(arvadosclient.Dict)["exit_code"], Equals, 1)
c.Check(api.Content[7]["container"].(arvadosclient.Dict)["state"], Equals, "Complete")
}`
rec := ContainerRecord{}
- err := json.NewDecoder(strings.NewReader(record)).Decode(&rec)
+ err := json.Unmarshal([]byte(record), &rec)
c.Check(err, IsNil)
docker := NewTestDockerClient()
go func() {
for cr.ContainerID == "" {
- time.Sleep(1 * time.Second)
+ time.Sleep(time.Millisecond)
}
cr.SigChan <- syscall.SIGINT
}()
err = cr.Run()
c.Check(err, IsNil)
-
- c.Check(api.Calls, Equals, 6)
- c.Check(api.Content[5]["container"].(arvadosclient.Dict)["log"], NotNil)
-
if err != nil {
for k, v := range api.Logs {
			c.Log(k)
			c.Log(v.String())
}
}
+ c.Assert(api.Calls, Equals, 6)
+ c.Check(api.Content[5]["container"].(arvadosclient.Dict)["log"], IsNil)
c.Check(api.Content[5]["container"].(arvadosclient.Dict)["state"], Equals, "Cancelled")
-
c.Check(strings.HasSuffix(api.Logs["stdout"].String(), "foo\n"), Equals, true)
}
}
type ArvMountCmdLine struct {
- Cmd []string
+ Cmd []string
+ token string
}
-func (am *ArvMountCmdLine) ArvMountTest(c []string) (*exec.Cmd, error) {
+func (am *ArvMountCmdLine) ArvMountTest(c []string, token string) (*exec.Cmd, error) {
am.Cmd = c
+ am.token = token
return nil, nil
}
cr.CleanupDirs()
}
}
+
+func (s *TestSuite) TestStdout(c *C) {
+ helperRecord := `{`
+ helperRecord += `"command": ["/bin/sh", "-c", "echo $FROBIZ"],`
+ helperRecord += `"container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",`
+ helperRecord += `"cwd": "/bin",`
+ helperRecord += `"environment": {"FROBIZ": "bilbo"},`
+ helperRecord += `"mounts": {"/tmp": {"kind": "tmp"}, "stdout": {"kind": "file", "path": "/tmp/a/b/c.out"} },`
+ helperRecord += `"output_path": "/tmp",`
+ helperRecord += `"priority": 1,`
+ helperRecord += `"runtime_constraints": {}`
+ helperRecord += `}`
+
+ api, _ := FullRunHelper(c, helperRecord, func(t *TestDockerClient) {
+ t.logWriter.Write(dockerLog(1, t.env[0][7:]+"\n"))
+ t.logWriter.Close()
+ t.finish <- dockerclient.WaitResult{ExitCode: 0}
+ })
+
+ c.Assert(api.Calls, Equals, 6)
+ c.Check(api.Content[5]["container"].(arvadosclient.Dict)["exit_code"], Equals, 0)
+ c.Check(api.Content[5]["container"].(arvadosclient.Dict)["state"], Equals, "Complete")
+ c.Check(api.CalledWith("collection.manifest_text", "./a/b 307372fa8fd5c146b22ae7a45b49bc31+6 0:6:c.out\n"), Not(IsNil))
+}
+
+// StdoutErrorRunHelper is used by the TestStdoutWithWrong*() tests below.
+func StdoutErrorRunHelper(c *C, record string, fn func(t *TestDockerClient)) (api *ArvTestClient, cr *ContainerRunner, err error) {
+ rec := ContainerRecord{}
+ err = json.Unmarshal([]byte(record), &rec)
+ c.Check(err, IsNil)
+
+ docker := NewTestDockerClient()
+ docker.fn = fn
+ docker.RemoveImage(hwImageId, true)
+
+ api = &ArvTestClient{ContainerRecord: rec}
+ cr = NewContainerRunner(api, &KeepTestClient{}, docker, "zzzzz-zzzzz-zzzzzzzzzzzzzzz")
+ am := &ArvMountCmdLine{}
+ cr.RunArvMount = am.ArvMountTest
+
+ err = cr.Run()
+ return
+}
+
+func (s *TestSuite) TestStdoutWithWrongPath(c *C) {
+ _, _, err := StdoutErrorRunHelper(c, `{
+ "mounts": {"/tmp": {"kind": "tmp"}, "stdout": {"kind": "file", "path":"/tmpa.out"} },
+ "output_path": "/tmp"
+}`, func(t *TestDockerClient) {})
+
+ c.Check(err, NotNil)
+ c.Check(strings.Contains(err.Error(), "Stdout path does not start with OutputPath"), Equals, true)
+}
+
+func (s *TestSuite) TestStdoutWithWrongKindTmp(c *C) {
+ _, _, err := StdoutErrorRunHelper(c, `{
+ "mounts": {"/tmp": {"kind": "tmp"}, "stdout": {"kind": "tmp", "path":"/tmp/a.out"} },
+ "output_path": "/tmp"
+}`, func(t *TestDockerClient) {})
+
+ c.Check(err, NotNil)
+ c.Check(strings.Contains(err.Error(), "Unsupported mount kind 'tmp' for stdout"), Equals, true)
+}
+
+func (s *TestSuite) TestStdoutWithWrongKindCollection(c *C) {
+ _, _, err := StdoutErrorRunHelper(c, `{
+ "mounts": {"/tmp": {"kind": "tmp"}, "stdout": {"kind": "collection", "path":"/tmp/a.out"} },
+ "output_path": "/tmp"
+}`, func(t *TestDockerClient) {})
+
+ c.Check(err, NotNil)
+ c.Check(strings.Contains(err.Error(), "Unsupported mount kind 'collection' for stdout"), Equals, true)
+}
"fmt"
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
. "gopkg.in/check.v1"
+ "testing"
"time"
)
}
func (s *LoggingTestSuite) TestWriteLogsLarge(c *C) {
+ if testing.Short() {
+ return
+ }
api := &ArvTestClient{}
kc := &KeepTestClient{}
cr := NewContainerRunner(api, kc, nil, "zzzzz-zzzzzzzzzzzzzzz")
cr.CrunchLog.Timestamper = (&TestTimestamper{}).Timestamp
cr.CrunchLog.Immediate = nil
- for i := 0; i < 2000000; i += 1 {
+ for i := 0; i < 2000000; i++ {
cr.CrunchLog.Printf("Hello %d", i)
}
cr.CrunchLog.Print("Goodbye")
stdout.Print("Doing stuff")
cr.CrunchLog.Print("Goodbye")
stdout.Print("Blurb")
-
cr.CrunchLog.Close()
- logtext1 := "2015-12-29T15:51:45.000000001Z Hello world!\n" +
- "2015-12-29T15:51:45.000000003Z Goodbye\n"
- c.Check(api.Content[0]["log"].(arvadosclient.Dict)["event_type"], Equals, "crunch-run")
- c.Check(api.Content[0]["log"].(arvadosclient.Dict)["properties"].(map[string]string)["text"], Equals, logtext1)
-
stdout.Close()
- logtext2 := "2015-12-29T15:51:45.000000002Z Doing stuff\n" +
- "2015-12-29T15:51:45.000000004Z Blurb\n"
- c.Check(api.Content[1]["log"].(arvadosclient.Dict)["event_type"], Equals, "stdout")
- c.Check(api.Content[1]["log"].(arvadosclient.Dict)["properties"].(map[string]string)["text"], Equals, logtext2)
+
+ logText := make(map[string]string)
+ for _, content := range api.Content {
+ log := content["log"].(arvadosclient.Dict)
+ logText[log["event_type"].(string)] += log["properties"].(map[string]string)["text"]
+ }
+
+ c.Check(logText["crunch-run"], Equals, `2015-12-29T15:51:45.000000001Z Hello world!
+2015-12-29T15:51:45.000000003Z Goodbye
+`)
+ c.Check(logText["stdout"], Equals, `2015-12-29T15:51:45.000000002Z Doing stuff
+2015-12-29T15:51:45.000000004Z Blurb
+`)
mt, err := cr.LogCollection.ManifestText()
c.Check(err, IsNil)
// SdkCollectionInfo holds collection info from api
type SdkCollectionInfo struct {
- UUID string `json:"uuid"`
- OwnerUUID string `json:"owner_uuid"`
- Redundancy int `json:"redundancy"`
- ModifiedAt time.Time `json:"modified_at"`
- ManifestText string `json:"manifest_text"`
+ UUID string `json:"uuid"`
+ OwnerUUID string `json:"owner_uuid"`
+ ReplicationDesired int `json:"replication_desired"`
+ ModifiedAt time.Time `json:"modified_at"`
+ ManifestText string `json:"manifest_text"`
}
// SdkCollectionList lists collections from api
fieldsWanted := []string{"manifest_text",
"owner_uuid",
"uuid",
- "redundancy",
+ "replication_desired",
"modified_at"}
sdkParams := arvadosclient.Dict{
"select": fieldsWanted,
"order": []string{"modified_at ASC", "uuid ASC"},
"filters": [][]string{[]string{"modified_at", ">=", "1900-01-01T00:00:00Z"}},
- "offset": 0}
+ "offset": 0}
if params.BatchSize > 0 {
sdkParams["limit"] = params.BatchSize
}
if totalCollections < finalNumberOfCollectionsAvailable {
err = fmt.Errorf("API server indicates a total of %d collections "+
- "available up to %v, but we only retrieved %d. "+
- "Refusing to continue as this could indicate an "+
- "otherwise undetected failure.",
- finalNumberOfCollectionsAvailable,
- sdkParams["filters"].([][]string)[0][2],
- totalCollections)
+ "available up to %v, but we only retrieved %d. "+
+ "Refusing to continue as this could indicate an "+
+ "otherwise undetected failure.",
+ finalNumberOfCollectionsAvailable,
+ sdkParams["filters"].([][]string)[0][2],
+ totalCollections)
return
}
for _, sdkCollection := range receivedCollections {
collection := Collection{UUID: StrCopy(sdkCollection.UUID),
OwnerUUID: StrCopy(sdkCollection.OwnerUUID),
- ReplicationLevel: sdkCollection.Redundancy,
+ ReplicationLevel: sdkCollection.ReplicationDesired,
BlockDigestToSize: make(map[blockdigest.BlockDigest]int)}
if sdkCollection.ModifiedAt.IsZero() {
"time"
)
-// Useful to call at the begining of execution to log info about the
+// Useful to call at the beginning of execution to log info about the
// current run.
func LogRunInfo(arvLogger *logger.Logger) {
if arvLogger != nil {
blockToDesiredReplication map[blockdigest.DigestWithSize]int,
underReplicated BlockSet) (m map[Locator]PullServers) {
m = map[Locator]PullServers{}
- // We use CanonicalString to avoid filling memory with dupicate
+ // We use CanonicalString to avoid filling memory with duplicate
// copies of the same string.
var cs CanonicalString
('share/doc/arvados-docker-cleaner', ['agpl-3.0.txt']),
],
install_requires=[
- 'docker-py',
+ 'docker-py==1.7.2',
],
tests_require=[
'pbr<1.7.0',
self.flush()
src.flush()
+ def clear(self, force=False):
+ r = super(CollectionDirectoryBase, self).clear(force)
+ self.collection = None
+ return r
+
class CollectionDirectory(CollectionDirectoryBase):
"""Represents the root of a directory tree representing a collection."""
return self.object_uuid
def update(self, obj=None):
+ if obj is None:
+ # TODO: retrieve the current record for self.object_uuid
+ # from the server. For now, at least don't crash when
+ # someone tells us it's a good time to update but doesn't
+ # pass us a fresh obj. See #8345
+ return
self._mtime = convertTime(obj['modified_at']) if 'modified_at' in obj else 0
self.contents = json.dumps(obj, indent=4, sort_keys=True) + "\n"
except ImportError:
tagger = egg_info_cmd.egg_info
+short_tests_only = False
+if '--short-tests-only' in sys.argv:
+ short_tests_only = True
+ sys.argv.remove('--short-tests-only')
+
setup(name='arvados_fuse',
version='0.1',
description='Arvados FUSE driver',
run_test_server.run()
run_test_server.authorize_with("admin")
self.api = api if api else arvados.safeapi.ThreadSafeApiCache(arvados.config.settings())
+ self.llfuse_thread = None
# This is a copy of Mount's method. TODO: Refactor MountTestBase
# to use a Mount instead of copying its code.
self.pool.join()
del self.pool
- subprocess.call(["fusermount", "-u", "-z", self.mounttmp])
- self.llfuse_thread.join(timeout=1)
- if self.llfuse_thread.is_alive():
- logger.warning("MountTestBase.tearDown():"
- " llfuse thread still alive 1s after umount"
- " -- abandoning and exiting anyway")
+ if self.llfuse_thread:
+ subprocess.call(["fusermount", "-u", "-z", self.mounttmp])
+ self.llfuse_thread.join(timeout=1)
+ if self.llfuse_thread.is_alive():
+ logger.warning("MountTestBase.tearDown():"
+ " llfuse thread still alive 1s after umount"
+ " -- abandoning and exiting anyway")
os.rmdir(self.mounttmp)
if self.keeptmp:
import unittest
from .. import run_test_server
from ..mount_test_base import MountTestBase
+from ..slow_test import slow_test
logger = logging.getLogger('arvados.arv-mount')
def setUp(self):
super(CreateCollectionWithMultipleBlocksAndMoveAndDeleteFile, self).setUp()
+ @slow_test
def test_CreateCollectionWithManyBlocksAndMoveAndDeleteFile(self):
collection = arvados.collection.Collection(api_client=self.api)
collection.save_new()
def setUp(self):
super(CreateCollectionWithManyFilesAndMoveAndDeleteFile, self).setUp()
+ @slow_test
def test_CreateCollectionWithManyFilesAndMoveAndDeleteFile(self):
collection = arvados.collection.Collection(api_client=self.api)
collection.save_new()
with open(os.path.join(self.mounttmp, collection, k)) as f:
self.assertEqual(v, f.read())
+ @slow_test
def test_UsingMagicDirCreateCollectionWithManyFilesAndMoveAndDeleteFile(self):
streams = 2
files_per_stream = 200
collection.save_new()
return collection
+ @slow_test
def test_UsingMagicDirCreateCollectionWithManyFilesAndMoveAllFilesIntoAnother(self):
streams = 2
files_per_stream = 200
self.pool.apply(magicDirTest_MoveFileFromCollection, (self.mounttmp, from_collection.manifest_locator(),
to_collection.manifest_locator(), 'stream0', 'file'+str(j)+'.txt',))
+ @slow_test
def test_UsingMagicDirCreateCollectionWithManyFilesAndMoveEachFileIntoAnother(self):
streams = 2
files_per_stream = 200
collection_contents = llfuse.listdir(os.path.join(self.mounttmp, collection_name))
self.assertIn('baz', collection_contents)
+ @slow_test
def test_listLargeProjectContents(self):
self.make_mount(fuse.ProjectDirectory,
project_object=run_test_server.fixture('groups')['project_with_201_collections'])
--- /dev/null
+../../../sdk/python/tests/slow_test.py
\ No newline at end of file
--- /dev/null
+import arvados
+import arvados.collection
+import arvados_fuse
+import arvados_fuse.command
+import json
+import logging
+import os
+import tempfile
+import unittest
+
+from .integration_test import IntegrationTest
+from .mount_test_base import MountTestBase
+
+class TmpCollectionTest(IntegrationTest):
+ mnt_args = ["--directory-cache=0"]
+
+ @IntegrationTest.mount(argv=mnt_args)
+ def test_cache_spill(self):
+ pdh = []
+ for i in range(0, 8):
+ cw = arvados.collection.Collection()
+ f = cw.open("blurg%i" % i, "w")
+ f.write("bloop%i" % i)
+
+ cw.mkdirs("dir%i" % i)
+ f = cw.open("dir%i/blurg" % i, "w")
+ f.write("dirbloop%i" % i)
+
+ cw.save_new()
+ pdh.append(cw.portable_data_hash())
+ self.pool_test(self.mnt, pdh)
+
+ @staticmethod
+ def _test_cache_spill(self, mnt, pdh):
+ for i,v in enumerate(pdh):
+ j = os.path.join(mnt, "by_id", v, "blurg%i" % i)
+ self.assertTrue(os.path.exists(j))
+ j = os.path.join(mnt, "by_id", v, "dir%i/blurg" % i)
+ self.assertTrue(os.path.exists(j))
+
+ for i,v in enumerate(pdh):
+ j = os.path.join(mnt, "by_id", v, "blurg%i" % i)
+ self.assertTrue(os.path.exists(j))
+ j = os.path.join(mnt, "by_id", v, "dir%i/blurg" % i)
+ self.assertTrue(os.path.exists(j))
@mock.patch('arvados.keep.KeepClient.get')
def runTest(self, mocked_get):
- logging.getLogger('arvados.arvados_fuse').setLevel(logging.DEBUG)
self.api._rootDesc = {"blobSignatureTtl": 2}
mnt = self.make_mount(fuse.CollectionDirectory, collection_record='zzzzz-4zz18-op4e2lbej01tcvu')
mocked_get.return_value = 'fake data'
--- /dev/null
+package main
+
+import (
+ "fmt"
+ "log"
+ "math"
+ "os"
+ "runtime"
+ "strings"
+ "sync"
+ "time"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "git.curoverse.com/arvados.git/sdk/go/keepclient"
+)
+
+// CheckConfig returns an error if anything is wrong with the given
+// config and runOptions.
+func CheckConfig(config Config, runOptions RunOptions) error {
+ if len(config.KeepServiceList.Items) > 0 && config.KeepServiceTypes != nil {
+ return fmt.Errorf("cannot specify both KeepServiceList and KeepServiceTypes in config")
+ }
+ if !runOptions.Once && config.RunPeriod == arvados.Duration(0) {
+ return fmt.Errorf("you must either use the -once flag, or specify RunPeriod in config")
+ }
+ return nil
+}
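
As an illustration (not part of this patch; the field values are made up), a config that lists explicit keep services and also asks for service-type discovery is rejected:

	// Hypothetical sketch of CheckConfig rejecting a conflicting config.
	func exampleCheckConfig() error {
		var conf Config
		conf.KeepServiceTypes = []string{"disk"}
		conf.KeepServiceList.Items = []arvados.KeepService{{UUID: "zzzzz-bi6l4-000000000000000"}}
		// Returns "cannot specify both KeepServiceList and KeepServiceTypes in config".
		return CheckConfig(conf, RunOptions{Once: true})
	}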
+
+// Balancer compares the contents of keepstore servers with the
+// collections stored in Arvados, and issues pull/trash requests
+// needed to get (closer to) the optimal data layout.
+//
+// In the optimal data layout: every data block referenced by a
+// collection is replicated at least as many times as desired by the
+// collection; there are no unreferenced data blocks older than
+// BlobSignatureTTL; and all N existing replicas of a given data block
+// are in the N best positions in rendezvous probe order.
+type Balancer struct {
+ *BlockStateMap
+ KeepServices map[string]*KeepService
+ DefaultReplication int
+ Logger *log.Logger
+ Dumper *log.Logger
+ MinMtime int64
+
+ collScanned int
+ serviceRoots map[string]string
+ errors []error
+ mutex sync.Mutex
+}
+
+// Run performs a balance operation using the given config and
+// runOptions. It should only be called once on a given Balancer
+// object. Typical usage:
+//
+// err = (&Balancer{}).Run(config, runOptions)
+func (bal *Balancer) Run(config Config, runOptions RunOptions) (err error) {
+ bal.Dumper = runOptions.Dumper
+ bal.Logger = runOptions.Logger
+ if bal.Logger == nil {
+ bal.Logger = log.New(os.Stderr, "", log.LstdFlags)
+ }
+
+ defer timeMe(bal.Logger, "Run")()
+
+ if len(config.KeepServiceList.Items) > 0 {
+ err = bal.SetKeepServices(config.KeepServiceList)
+ } else {
+ err = bal.DiscoverKeepServices(&config.Client, config.KeepServiceTypes)
+ }
+ if err != nil {
+ return
+ }
+
+ if err = bal.CheckSanityEarly(&config.Client); err != nil {
+ return
+ }
+ if runOptions.CommitTrash {
+ if err = bal.ClearTrashLists(&config.Client); err != nil {
+ return
+ }
+ }
+ if err = bal.GetCurrentState(&config.Client); err != nil {
+ return
+ }
+ bal.ComputeChangeSets()
+ bal.PrintStatistics()
+ if err = bal.CheckSanityLate(); err != nil {
+ return
+ }
+ if runOptions.CommitPulls {
+ err = bal.CommitPulls(&config.Client)
+ if err != nil {
+ // Skip trash if we can't pull. (Too cautious?)
+ return
+ }
+ }
+ if runOptions.CommitTrash {
+ err = bal.CommitTrash(&config.Client)
+ }
+ return
+}
+
+// SetKeepServices sets the list of KeepServices to operate on.
+func (bal *Balancer) SetKeepServices(srvList arvados.KeepServiceList) error {
+ bal.KeepServices = make(map[string]*KeepService)
+ for _, srv := range srvList.Items {
+ bal.KeepServices[srv.UUID] = &KeepService{
+ KeepService: srv,
+ ChangeSet: &ChangeSet{},
+ }
+ }
+ return nil
+}
+
+// DiscoverKeepServices sets the list of KeepServices by calling the
+// API to get a list of all services, and selecting the ones whose
+// ServiceType is in okTypes.
+func (bal *Balancer) DiscoverKeepServices(c *arvados.Client, okTypes []string) error {
+ bal.KeepServices = make(map[string]*KeepService)
+ ok := make(map[string]bool)
+ for _, t := range okTypes {
+ ok[t] = true
+ }
+ return c.EachKeepService(func(srv arvados.KeepService) error {
+ if ok[srv.ServiceType] {
+ bal.KeepServices[srv.UUID] = &KeepService{
+ KeepService: srv,
+ ChangeSet: &ChangeSet{},
+ }
+ } else {
+ bal.logf("skipping %v with service type %q", srv.UUID, srv.ServiceType)
+ }
+ return nil
+ })
+}
+
+// CheckSanityEarly checks for configuration and runtime errors that
+// can be detected before GetCurrentState() and ComputeChangeSets()
+// are called.
+//
+// If it returns an error, it is pointless to run GetCurrentState or
+// ComputeChangeSets: after doing so, the statistics would be
+// meaningless and it would be dangerous to run any Commit methods.
+func (bal *Balancer) CheckSanityEarly(c *arvados.Client) error {
+ u, err := c.CurrentUser()
+ if err != nil {
+ return fmt.Errorf("CurrentUser(): %v", err)
+ }
+ if !u.IsActive || !u.IsAdmin {
+ return fmt.Errorf("current user (%s) is not an active admin user", u.UUID)
+ }
+ for _, srv := range bal.KeepServices {
+ if srv.ServiceType == "proxy" {
+ return fmt.Errorf("config error: %s: proxy servers cannot be balanced", srv)
+ }
+ }
+ return nil
+}
+
+// ClearTrashLists sends an empty trash list to each keep
+// service. Calling this before GetCurrentState avoids races.
+//
+// When a block appears in an index, we assume that replica will still
+// exist after we delete other replicas on other servers. However,
+// it's possible that a previous rebalancing operation made different
+// decisions (e.g., servers were added/removed, and rendezvous order
+// changed). In this case, the replica might already be on that
+// server's trash list, and it might be deleted before we send a
+// replacement trash list.
+//
+// We avoid this problem if we clear all trash lists before getting
+// indexes. (We also assume there is only one rebalancing process
+// running at a time.)
+func (bal *Balancer) ClearTrashLists(c *arvados.Client) error {
+ for _, srv := range bal.KeepServices {
+ srv.ChangeSet = &ChangeSet{}
+ }
+ return bal.CommitTrash(c)
+}
+
+// GetCurrentState determines the current replication state, and the
+// desired replication level, for every block that is either
+// retrievable or referenced.
+//
+// It determines the current replication state by reading the block index
+// from every known Keep service.
+//
+// It determines the desired replication level by retrieving all
+// collection manifests in the database (API server).
+//
+// It encodes the resulting information in BlockStateMap.
+func (bal *Balancer) GetCurrentState(c *arvados.Client) error {
+ defer timeMe(bal.Logger, "GetCurrentState")()
+ bal.BlockStateMap = NewBlockStateMap()
+
+ dd, err := c.DiscoveryDocument()
+ if err != nil {
+ return err
+ }
+ bal.DefaultReplication = dd.DefaultCollectionReplication
+ bal.MinMtime = time.Now().Unix() - dd.BlobSignatureTTL
+
+ errs := make(chan error, 2+len(bal.KeepServices))
+ wg := sync.WaitGroup{}
+
+ // Start one goroutine for each KeepService: retrieve the
+ // index, and add the returned blocks to BlockStateMap.
+ for _, srv := range bal.KeepServices {
+ wg.Add(1)
+ go func(srv *KeepService) {
+ defer wg.Done()
+ bal.logf("%s: retrieve index", srv)
+ idx, err := srv.Index(c, "")
+ if err != nil {
+ errs <- fmt.Errorf("%s: %v", srv, err)
+ return
+ }
+ bal.logf("%s: add %d replicas to map", srv, len(idx))
+ bal.BlockStateMap.AddReplicas(srv, idx)
+ bal.logf("%s: done", srv)
+ }(srv)
+ }
+
+ // collQ buffers incoming collections so we can start fetching
+ // the next page without waiting for the current page to
+ // finish processing. (1000 happens to match the page size
+	// used by (*arvados.Client).EachCollection(), but it's OK if
+ // they don't match.)
+ collQ := make(chan arvados.Collection, 1000)
+
+ // Start a goroutine to process collections. (We could use a
+ // worker pool here, but even with a single worker we already
+ // process collections much faster than we can retrieve them.)
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ for coll := range collQ {
+ err := bal.addCollection(coll)
+ if err != nil {
+ errs <- err
+ for range collQ {
+ }
+ return
+ }
+ bal.collScanned++
+ }
+ }()
+
+ // Start a goroutine to retrieve all collections from the
+ // Arvados database and send them to collQ for processing.
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ err = EachCollection(c,
+ func(coll arvados.Collection) error {
+ collQ <- coll
+ if len(errs) > 0 {
+ // some other GetCurrentState
+ // error happened: no point
+ // getting any more
+ // collections.
+ return fmt.Errorf("")
+ }
+ return nil
+ }, func(done, total int) {
+ bal.logf("collections: %d/%d", done, total)
+ })
+ close(collQ)
+ if err != nil {
+ errs <- err
+ }
+ }()
+
+ go func() {
+ // Send a nil error when all goroutines finish. If
+ // this is the first error sent to errs, then
+ // everything worked.
+ wg.Wait()
+ errs <- nil
+ }()
+ return <-errs
+}
+
+func (bal *Balancer) addCollection(coll arvados.Collection) error {
+ blkids, err := coll.SizedDigests()
+ if err != nil {
+ bal.mutex.Lock()
+ bal.errors = append(bal.errors, fmt.Errorf("%v: %v", coll.UUID, err))
+ bal.mutex.Unlock()
+ return nil
+ }
+ repl := bal.DefaultReplication
+ if coll.ReplicationDesired != nil {
+ repl = *coll.ReplicationDesired
+ }
+ debugf("%v: %d block x%d", coll.UUID, len(blkids), repl)
+ bal.BlockStateMap.IncreaseDesired(repl, blkids)
+ return nil
+}
+
+// ComputeChangeSets compares, for each known block, the current and
+// desired replication states. If it is possible to get closer to the
+// desired state by copying or deleting blocks, it adds those changes
+// to the relevant KeepServices' ChangeSets.
+//
+// It does not actually apply any of the computed changes.
+func (bal *Balancer) ComputeChangeSets() {
+ // This just calls balanceBlock() once for each block, using a
+ // pool of worker goroutines.
+ defer timeMe(bal.Logger, "ComputeChangeSets")()
+ bal.setupServiceRoots()
+
+ type balanceTask struct {
+ blkid arvados.SizedDigest
+ blk *BlockState
+ }
+ nWorkers := 1 + runtime.NumCPU()
+ todo := make(chan balanceTask, nWorkers)
+ var wg sync.WaitGroup
+ for i := 0; i < nWorkers; i++ {
+ wg.Add(1)
+ go func() {
+ for work := range todo {
+ bal.balanceBlock(work.blkid, work.blk)
+ }
+ wg.Done()
+ }()
+ }
+ bal.BlockStateMap.Apply(func(blkid arvados.SizedDigest, blk *BlockState) {
+ todo <- balanceTask{
+ blkid: blkid,
+ blk: blk,
+ }
+ })
+ close(todo)
+ wg.Wait()
+}
+
+func (bal *Balancer) setupServiceRoots() {
+ bal.serviceRoots = make(map[string]string)
+ for _, srv := range bal.KeepServices {
+ bal.serviceRoots[srv.UUID] = srv.UUID
+ }
+}
+
+const (
+ changeStay = iota
+ changePull
+ changeTrash
+ changeNone
+)
+
+var changeName = map[int]string{
+ changeStay: "stay",
+ changePull: "pull",
+ changeTrash: "trash",
+ changeNone: "none",
+}
+
+// balanceBlock compares current state to desired state for a single
+// block, and makes the appropriate ChangeSet calls.
+func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) {
+ debugf("balanceBlock: %v %+v", blkid, blk)
+ uuids := keepclient.NewRootSorter(bal.serviceRoots, string(blkid[:32])).GetSortedRoots()
+ hasRepl := make(map[string]Replica, len(bal.serviceRoots))
+ for _, repl := range blk.Replicas {
+ hasRepl[repl.UUID] = repl
+ // TODO: when multiple copies are on one server, use
+ // the oldest one that doesn't have a timestamp
+ // collision with other replicas.
+ }
+ // number of replicas already found in positions better than
+ // the position we're contemplating now.
+ reportedBestRepl := 0
+ // To be safe we assume two replicas with the same Mtime are
+ // in fact the same replica being reported more than
+ // once. len(uniqueBestRepl) is the number of distinct
+ // replicas in the best rendezvous positions we've considered
+ // so far.
+ uniqueBestRepl := make(map[int64]bool, len(bal.serviceRoots))
+ // pulls is the number of Pull changes we have already
+ // requested. (For purposes of deciding whether to Pull to
+ // rendezvous position N, we should assume all pulls we have
+ // requested on rendezvous positions M<N will be successful.)
+ pulls := 0
+ var changes []string
+ for _, uuid := range uuids {
+ change := changeNone
+ srv := bal.KeepServices[uuid]
+ // TODO: request a Touch if Mtime is duplicated.
+ repl, ok := hasRepl[srv.UUID]
+ if ok {
+ // This service has a replica. We should
+ // delete it if [1] we already have enough
+ // distinct replicas in better rendezvous
+ // positions and [2] this replica's Mtime is
+ // distinct from all of the better replicas'
+ // Mtimes.
+ if !srv.ReadOnly &&
+ repl.Mtime < bal.MinMtime &&
+ len(uniqueBestRepl) >= blk.Desired &&
+ !uniqueBestRepl[repl.Mtime] {
+ srv.AddTrash(Trash{
+ SizedDigest: blkid,
+ Mtime: repl.Mtime,
+ })
+ change = changeTrash
+ } else {
+ change = changeStay
+ }
+ uniqueBestRepl[repl.Mtime] = true
+ reportedBestRepl++
+ } else if pulls+reportedBestRepl < blk.Desired &&
+ len(blk.Replicas) > 0 &&
+ !srv.ReadOnly {
+ // This service doesn't have a replica. We
+ // should pull one to this server if we don't
+ // already have enough (existing+requested)
+ // replicas in better rendezvous positions.
+ srv.AddPull(Pull{
+ SizedDigest: blkid,
+ Source: blk.Replicas[0].KeepService,
+ })
+ pulls++
+ change = changePull
+ }
+ if bal.Dumper != nil {
+ changes = append(changes, fmt.Sprintf("%s:%d=%s,%d", srv.ServiceHost, srv.ServicePort, changeName[change], repl.Mtime))
+ }
+ }
+ if bal.Dumper != nil {
+ bal.Dumper.Printf("%s have=%d want=%d %s", blkid, len(blk.Replicas), blk.Desired, strings.Join(changes, " "))
+ }
+}
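
	// Illustrative walk-through (not part of this patch): with Desired=2 and
	// existing replicas only at rendezvous positions 3 and 5, the loop above
	// requests pulls to positions 1 and 2 (pulls+reportedBestRepl < Desired
	// there) and leaves both existing replicas alone on this run, since fewer
	// than Desired distinct replicas have yet been seen in better positions;
	// they become trash candidates only on a later run, once the pulled
	// copies exist and the old replicas' Mtimes are older than MinMtime.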
+
+type blocksNBytes struct {
+ replicas int
+ blocks int
+ bytes int64
+}
+
+func (bb blocksNBytes) String() string {
+ return fmt.Sprintf("%d replicas (%d blocks, %d bytes)", bb.replicas, bb.blocks, bb.bytes)
+}
+
+type balancerStats struct {
+ lost, overrep, unref, garbage, underrep, justright blocksNBytes
+ desired, current blocksNBytes
+ pulls, trashes int
+ replHistogram []int
+}
+
+func (bal *Balancer) getStatistics() (s balancerStats) {
+ s.replHistogram = make([]int, 2)
+ bal.BlockStateMap.Apply(func(blkid arvados.SizedDigest, blk *BlockState) {
+ surplus := len(blk.Replicas) - blk.Desired
+ bytes := blkid.Size()
+ switch {
+ case len(blk.Replicas) == 0 && blk.Desired > 0:
+ s.lost.replicas -= surplus
+ s.lost.blocks++
+ s.lost.bytes += bytes * int64(-surplus)
+ case len(blk.Replicas) < blk.Desired:
+ s.underrep.replicas -= surplus
+ s.underrep.blocks++
+ s.underrep.bytes += bytes * int64(-surplus)
+ case len(blk.Replicas) > 0 && blk.Desired == 0:
+ counter := &s.garbage
+ for _, r := range blk.Replicas {
+ if r.Mtime >= bal.MinMtime {
+ counter = &s.unref
+ break
+ }
+ }
+ counter.replicas += surplus
+ counter.blocks++
+ counter.bytes += bytes * int64(surplus)
+ case len(blk.Replicas) > blk.Desired:
+ s.overrep.replicas += surplus
+ s.overrep.blocks++
+ s.overrep.bytes += bytes * int64(len(blk.Replicas)-blk.Desired)
+ default:
+ s.justright.replicas += blk.Desired
+ s.justright.blocks++
+ s.justright.bytes += bytes * int64(blk.Desired)
+ }
+
+ if blk.Desired > 0 {
+ s.desired.replicas += blk.Desired
+ s.desired.blocks++
+ s.desired.bytes += bytes * int64(blk.Desired)
+ }
+ if len(blk.Replicas) > 0 {
+ s.current.replicas += len(blk.Replicas)
+ s.current.blocks++
+ s.current.bytes += bytes * int64(len(blk.Replicas))
+ }
+
+ for len(s.replHistogram) <= len(blk.Replicas) {
+ s.replHistogram = append(s.replHistogram, 0)
+ }
+ s.replHistogram[len(blk.Replicas)]++
+ })
+ for _, srv := range bal.KeepServices {
+ s.pulls += len(srv.ChangeSet.Pulls)
+ s.trashes += len(srv.ChangeSet.Trashes)
+ }
+ return
+}
+
+// PrintStatistics writes statistics about the computed changes to
+// bal.Logger. It should not be called until ComputeChangeSets has
+// finished.
+func (bal *Balancer) PrintStatistics() {
+ s := bal.getStatistics()
+ bal.logf("===")
+ bal.logf("%s lost (0=have<want)", s.lost)
+ bal.logf("%s underreplicated (0<have<want)", s.underrep)
+ bal.logf("%s just right (have=want)", s.justright)
+ bal.logf("%s overreplicated (have>want>0)", s.overrep)
+ bal.logf("%s unreferenced (have>want=0, new)", s.unref)
+ bal.logf("%s garbage (have>want=0, old)", s.garbage)
+ bal.logf("===")
+ bal.logf("%s total commitment (excluding unreferenced)", s.desired)
+ bal.logf("%s total usage", s.current)
+ bal.logf("===")
+ for _, srv := range bal.KeepServices {
+ bal.logf("%s: %v\n", srv, srv.ChangeSet)
+ }
+ bal.logf("===")
+ bal.printHistogram(s, 60)
+ bal.logf("===")
+}
+
+func (bal *Balancer) printHistogram(s balancerStats, hashColumns int) {
+ bal.logf("Replication level distribution (counting N replicas on a single server as N):")
+ maxCount := 0
+ for _, count := range s.replHistogram {
+ if maxCount < count {
+ maxCount = count
+ }
+ }
+ hashes := strings.Repeat("#", hashColumns)
+ countWidth := 1 + int(math.Log10(float64(maxCount+1)))
+ scaleCount := 10 * float64(hashColumns) / math.Floor(1+10*math.Log10(float64(maxCount+1)))
+ for repl, count := range s.replHistogram {
+ nHashes := int(scaleCount * math.Log10(float64(count+1)))
+ bal.logf("%2d: %*d %s", repl, countWidth, count, hashes[:nHashes])
+ }
+}
+
+// CheckSanityLate checks for configuration and runtime errors after
+// GetCurrentState() and ComputeChangeSets() have finished.
+//
+// If it returns an error, it is dangerous to run any Commit methods.
+func (bal *Balancer) CheckSanityLate() error {
+ if bal.errors != nil {
+ for _, err := range bal.errors {
+ bal.logf("deferred error: %v", err)
+ }
+ return fmt.Errorf("cannot proceed safely after deferred errors")
+ }
+
+ if bal.collScanned == 0 {
+ return fmt.Errorf("received zero collections")
+ }
+
+ anyDesired := false
+ bal.BlockStateMap.Apply(func(_ arvados.SizedDigest, blk *BlockState) {
+ if blk.Desired > 0 {
+ anyDesired = true
+ }
+ })
+ if !anyDesired {
+ return fmt.Errorf("zero blocks have desired replication>0")
+ }
+
+ if dr := bal.DefaultReplication; dr < 1 {
+ return fmt.Errorf("Default replication (%d) is less than 1", dr)
+ }
+
+ // TODO: no two services have identical indexes
+ // TODO: no collisions (same md5, different size)
+ return nil
+}
+
+// CommitPulls sends the computed lists of pull requests to the
+// keepstore servers. This has the effect of increasing replication of
+// existing blocks that are either underreplicated or poorly
+// distributed according to rendezvous hashing.
+func (bal *Balancer) CommitPulls(c *arvados.Client) error {
+ return bal.commitAsync(c, "send pull list",
+ func(srv *KeepService) error {
+ return srv.CommitPulls(c)
+ })
+}
+
+// CommitTrash sends the computed lists of trash requests to the
+// keepstore servers. This has the effect of deleting blocks that are
+// overreplicated or unreferenced.
+func (bal *Balancer) CommitTrash(c *arvados.Client) error {
+ return bal.commitAsync(c, "send trash list",
+ func(srv *KeepService) error {
+ return srv.CommitTrash(c)
+ })
+}
+
+func (bal *Balancer) commitAsync(c *arvados.Client, label string, f func(srv *KeepService) error) error {
+ errs := make(chan error)
+ for _, srv := range bal.KeepServices {
+ go func(srv *KeepService) {
+ var err error
+ defer func() { errs <- err }()
+ label := fmt.Sprintf("%s: %v", srv, label)
+ defer timeMe(bal.Logger, label)()
+ err = f(srv)
+ if err != nil {
+ err = fmt.Errorf("%s: %v", label, err)
+ }
+ }(srv)
+ }
+ var lastErr error
+	for range bal.KeepServices {
+ if err := <-errs; err != nil {
+ bal.logf("%v", err)
+ lastErr = err
+ }
+ }
+ close(errs)
+ return lastErr
+}
+
+func (bal *Balancer) logf(f string, args ...interface{}) {
+ if bal.Logger != nil {
+ bal.Logger.Printf(f, args...)
+ }
+}
--- /dev/null
+package main
+
+import (
+ _ "encoding/json"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "log"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "sync"
+ "time"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+
+ check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&runSuite{})
+
+type reqTracker struct {
+ reqs []http.Request
+ sync.Mutex
+}
+
+func (rt *reqTracker) Count() int {
+ rt.Lock()
+ defer rt.Unlock()
+ return len(rt.reqs)
+}
+
+func (rt *reqTracker) Add(req *http.Request) int {
+ rt.Lock()
+ defer rt.Unlock()
+ rt.reqs = append(rt.reqs, *req)
+ return len(rt.reqs)
+}
+
+// stubServer is an HTTP transport that intercepts and processes all
+// requests using its own handlers.
+type stubServer struct {
+ mux *http.ServeMux
+ srv *httptest.Server
+ mutex sync.Mutex
+ Requests reqTracker
+ logf func(string, ...interface{})
+}
+
+// Start initializes the stub server and returns an *http.Client that
+// uses the stub server to handle all requests.
+//
+// A stubServer that has been started should eventually be shut down
+// with Close().
+func (s *stubServer) Start() *http.Client {
+ // Set up a config.Client that forwards all requests to s.mux
+ // via s.srv. Test cases will attach handlers to s.mux to get
+ // the desired responses.
+ s.mux = http.NewServeMux()
+ s.srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ s.mutex.Lock()
+ s.Requests.Add(r)
+ s.mutex.Unlock()
+ w.Header().Set("Content-Type", "application/json")
+ s.mux.ServeHTTP(w, r)
+ }))
+ return &http.Client{Transport: s}
+}
+
+func (s *stubServer) RoundTrip(req *http.Request) (*http.Response, error) {
+ w := httptest.NewRecorder()
+ s.mux.ServeHTTP(w, req)
+ return &http.Response{
+ StatusCode: w.Code,
+ Status: fmt.Sprintf("%d %s", w.Code, http.StatusText(w.Code)),
+ Header: w.HeaderMap,
+ Body: ioutil.NopCloser(w.Body)}, nil
+}
+
+// Close releases resources used by the server.
+func (s *stubServer) Close() {
+ s.srv.Close()
+}
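+
+// Example use in a test (an illustrative sketch; the path, response
+// body, and host name below are invented for this comment, not
+// fixtures used by the suite):
+//
+//	var stub stubServer
+//	client := stub.Start()
+//	defer stub.Close()
+//	stub.serveStatic("/status.json", `{"ok":true}`)
+//	resp, err := client.Get("http://stub.invalid/status.json")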
+
+func (s *stubServer) serveStatic(path, data string) *reqTracker {
+ rt := &reqTracker{}
+ s.mux.HandleFunc(path, func(w http.ResponseWriter, r *http.Request) {
+ rt.Add(r)
+ if r.Body != nil {
+ ioutil.ReadAll(r.Body)
+ r.Body.Close()
+ }
+ io.WriteString(w, data)
+ })
+ return rt
+}
+
+func (s *stubServer) serveCurrentUserAdmin() *reqTracker {
+ return s.serveStatic("/arvados/v1/users/current",
+ `{"uuid":"zzzzz-tpzed-000000000000000","is_admin":true,"is_active":true}`)
+}
+
+func (s *stubServer) serveCurrentUserNotAdmin() *reqTracker {
+ return s.serveStatic("/arvados/v1/users/current",
+ `{"uuid":"zzzzz-tpzed-000000000000000","is_admin":false,"is_active":true}`)
+}
+
+func (s *stubServer) serveDiscoveryDoc() *reqTracker {
+ return s.serveStatic("/discovery/v1/apis/arvados/v1/rest",
+ `{"defaultCollectionReplication":2}`)
+}
+
+func (s *stubServer) serveZeroCollections() *reqTracker {
+ return s.serveStatic("/arvados/v1/collections",
+ `{"items":[],"items_available":0}`)
+}
+
+func (s *stubServer) serveFooBarFileCollections() *reqTracker {
+ rt := &reqTracker{}
+ s.mux.HandleFunc("/arvados/v1/collections", func(w http.ResponseWriter, r *http.Request) {
+ r.ParseForm()
+ rt.Add(r)
+ if strings.Contains(r.Form.Get("filters"), `modified_at`) {
+ io.WriteString(w, `{"items_available":0,"items":[]}`)
+ } else {
+ io.WriteString(w, `{"items_available":2,"items":[
+ {"uuid":"zzzzz-4zz18-ehbhgtheo8909or","portable_data_hash":"fa7aeb5140e2848d39b416daeef4ffc5+45","manifest_text":". 37b51d194a7513e45b56f6524f2d51f2+3 0:3:bar\n","modified_at":"2014-02-03T17:22:54Z"},
+ {"uuid":"zzzzz-4zz18-znfnqtbbv4spc3w","portable_data_hash":"1f4b0bc7583c2a7f9102c395f4ffc5e3+45","manifest_text":". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n","modified_at":"2014-02-03T17:22:54Z"}]}`)
+ }
+ })
+ return rt
+}
+
+func (s *stubServer) serveCollectionsButSkipOne() *reqTracker {
+ rt := &reqTracker{}
+ s.mux.HandleFunc("/arvados/v1/collections", func(w http.ResponseWriter, r *http.Request) {
+ r.ParseForm()
+ rt.Add(r)
+ if strings.Contains(r.Form.Get("filters"), `"modified_at","\u003c="`) {
+ io.WriteString(w, `{"items_available":3,"items":[]}`)
+ } else if strings.Contains(r.Form.Get("filters"), `"modified_at","\u003e="`) {
+ io.WriteString(w, `{"items_available":0,"items":[]}`)
+ } else {
+ io.WriteString(w, `{"items_available":2,"items":[
+ {"uuid":"zzzzz-4zz18-ehbhgtheo8909or","portable_data_hash":"fa7aeb5140e2848d39b416daeef4ffc5+45","manifest_text":". 37b51d194a7513e45b56f6524f2d51f2+3 0:3:bar\n","modified_at":"2014-02-03T17:22:54Z"},
+ {"uuid":"zzzzz-4zz18-znfnqtbbv4spc3w","portable_data_hash":"1f4b0bc7583c2a7f9102c395f4ffc5e3+45","manifest_text":". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n","modified_at":"2014-02-03T17:22:54Z"}]}`)
+ }
+ })
+ return rt
+}
+
+func (s *stubServer) serveZeroKeepServices() *reqTracker {
+ return s.serveStatic("/arvados/v1/keep_services",
+ `{"items":[],"items_available":0}`)
+}
+
+func (s *stubServer) serveFourDiskKeepServices() *reqTracker {
+ return s.serveStatic("/arvados/v1/keep_services", `{"items_available":5,"items":[
+ {"uuid":"zzzzz-bi6l4-000000000000000","service_host":"keep0.zzzzz.arvadosapi.com","service_port":25107,"service_ssl_flag":false,"service_type":"disk"},
+ {"uuid":"zzzzz-bi6l4-000000000000001","service_host":"keep1.zzzzz.arvadosapi.com","service_port":25107,"service_ssl_flag":false,"service_type":"disk"},
+ {"uuid":"zzzzz-bi6l4-000000000000002","service_host":"keep2.zzzzz.arvadosapi.com","service_port":25107,"service_ssl_flag":false,"service_type":"disk"},
+ {"uuid":"zzzzz-bi6l4-000000000000003","service_host":"keep3.zzzzz.arvadosapi.com","service_port":25107,"service_ssl_flag":false,"service_type":"disk"},
+ {"uuid":"zzzzz-bi6l4-h0a0xwut9qa6g3a","service_host":"keep.zzzzz.arvadosapi.com","service_port":25333,"service_ssl_flag":true,"service_type":"proxy"}]}`)
+}
+
+func (s *stubServer) serveKeepstoreIndexFoo4Bar1() *reqTracker {
+ rt := &reqTracker{}
+ s.mux.HandleFunc("/index/", func(w http.ResponseWriter, r *http.Request) {
+ count := rt.Add(r)
+ if r.Host == "keep0.zzzzz.arvadosapi.com:25107" {
+ io.WriteString(w, "37b51d194a7513e45b56f6524f2d51f2+3 12345678\n")
+ }
+ fmt.Fprintf(w, "acbd18db4cc2f85cedef654fccc4a4d8+3 %d\n\n", 12345678+count)
+ })
+ return rt
+}
+
+func (s *stubServer) serveKeepstoreTrash() *reqTracker {
+ return s.serveStatic("/trash", `{}`)
+}
+
+func (s *stubServer) serveKeepstorePull() *reqTracker {
+ return s.serveStatic("/pull", `{}`)
+}
+
+type runSuite struct {
+ stub stubServer
+ config Config
+}
+
+// make a log.Logger that writes to the current test's c.Log().
+func (s *runSuite) logger(c *check.C) *log.Logger {
+ r, w := io.Pipe()
+ go func() {
+ buf := make([]byte, 10000)
+ for {
+ n, err := r.Read(buf)
+ if n > 0 {
+ if buf[n-1] == '\n' {
+ n--
+ }
+ c.Log(string(buf[:n]))
+ }
+ if err != nil {
+ break
+ }
+ }
+ }()
+ return log.New(w, "", log.LstdFlags)
+}
+
+func (s *runSuite) SetUpTest(c *check.C) {
+ s.config = Config{
+ Client: arvados.Client{
+ AuthToken: "xyzzy",
+ APIHost: "zzzzz.arvadosapi.com",
+ Client: s.stub.Start()},
+ KeepServiceTypes: []string{"disk"}}
+ s.stub.serveDiscoveryDoc()
+ s.stub.logf = c.Logf
+}
+
+func (s *runSuite) TearDownTest(c *check.C) {
+ s.stub.Close()
+}
+
+func (s *runSuite) TestRefuseZeroCollections(c *check.C) {
+ opts := RunOptions{
+ CommitPulls: true,
+ CommitTrash: true,
+ Logger: s.logger(c),
+ }
+ s.stub.serveCurrentUserAdmin()
+ s.stub.serveZeroCollections()
+ s.stub.serveFourDiskKeepServices()
+ s.stub.serveKeepstoreIndexFoo4Bar1()
+ trashReqs := s.stub.serveKeepstoreTrash()
+ pullReqs := s.stub.serveKeepstorePull()
+ err := (&Balancer{}).Run(s.config, opts)
+ c.Check(err, check.ErrorMatches, "received zero collections")
+ c.Check(trashReqs.Count(), check.Equals, 4)
+ c.Check(pullReqs.Count(), check.Equals, 0)
+}
+
+func (s *runSuite) TestServiceTypes(c *check.C) {
+ opts := RunOptions{
+ CommitPulls: true,
+ CommitTrash: true,
+ Logger: s.logger(c),
+ }
+ s.config.KeepServiceTypes = []string{"unlisted-type"}
+ s.stub.serveCurrentUserAdmin()
+ s.stub.serveFooBarFileCollections()
+ s.stub.serveFourDiskKeepServices()
+ indexReqs := s.stub.serveKeepstoreIndexFoo4Bar1()
+ trashReqs := s.stub.serveKeepstoreTrash()
+ err := (&Balancer{}).Run(s.config, opts)
+ c.Check(err, check.IsNil)
+ c.Check(indexReqs.Count(), check.Equals, 0)
+ c.Check(trashReqs.Count(), check.Equals, 0)
+}
+
+func (s *runSuite) TestRefuseNonAdmin(c *check.C) {
+ opts := RunOptions{
+ CommitPulls: true,
+ CommitTrash: true,
+ Logger: s.logger(c),
+ }
+ s.stub.serveCurrentUserNotAdmin()
+ s.stub.serveZeroCollections()
+ s.stub.serveFourDiskKeepServices()
+ trashReqs := s.stub.serveKeepstoreTrash()
+ pullReqs := s.stub.serveKeepstorePull()
+ err := (&Balancer{}).Run(s.config, opts)
+ c.Check(err, check.ErrorMatches, "current user .* is not .* admin user")
+ c.Check(trashReqs.Count(), check.Equals, 0)
+ c.Check(pullReqs.Count(), check.Equals, 0)
+}
+
+func (s *runSuite) TestDetectSkippedCollections(c *check.C) {
+ opts := RunOptions{
+ CommitPulls: true,
+ CommitTrash: true,
+ Logger: s.logger(c),
+ }
+ s.stub.serveCurrentUserAdmin()
+ s.stub.serveCollectionsButSkipOne()
+ s.stub.serveFourDiskKeepServices()
+ s.stub.serveKeepstoreIndexFoo4Bar1()
+ trashReqs := s.stub.serveKeepstoreTrash()
+ pullReqs := s.stub.serveKeepstorePull()
+ err := (&Balancer{}).Run(s.config, opts)
+ c.Check(err, check.ErrorMatches, `Retrieved 2 collections with modtime <= .* but server now reports there are 3 collections.*`)
+ c.Check(trashReqs.Count(), check.Equals, 4)
+ c.Check(pullReqs.Count(), check.Equals, 0)
+}
+
+func (s *runSuite) TestDryRun(c *check.C) {
+ opts := RunOptions{
+ CommitPulls: false,
+ CommitTrash: false,
+ Logger: s.logger(c),
+ }
+ s.stub.serveCurrentUserAdmin()
+ s.stub.serveFooBarFileCollections()
+ s.stub.serveFourDiskKeepServices()
+ s.stub.serveKeepstoreIndexFoo4Bar1()
+ trashReqs := s.stub.serveKeepstoreTrash()
+ pullReqs := s.stub.serveKeepstorePull()
+ var bal Balancer
+ err := bal.Run(s.config, opts)
+ c.Check(err, check.IsNil)
+ c.Check(trashReqs.Count(), check.Equals, 0)
+ c.Check(pullReqs.Count(), check.Equals, 0)
+ stats := bal.getStatistics()
+ c.Check(stats.pulls, check.Not(check.Equals), 0)
+ c.Check(stats.underrep.replicas, check.Not(check.Equals), 0)
+ c.Check(stats.overrep.replicas, check.Not(check.Equals), 0)
+}
+
+func (s *runSuite) TestCommit(c *check.C) {
+ opts := RunOptions{
+ CommitPulls: true,
+ CommitTrash: true,
+ Logger: s.logger(c),
+ Dumper: s.logger(c),
+ }
+ s.stub.serveCurrentUserAdmin()
+ s.stub.serveFooBarFileCollections()
+ s.stub.serveFourDiskKeepServices()
+ s.stub.serveKeepstoreIndexFoo4Bar1()
+ trashReqs := s.stub.serveKeepstoreTrash()
+ pullReqs := s.stub.serveKeepstorePull()
+ var bal Balancer
+ err := bal.Run(s.config, opts)
+ c.Check(err, check.IsNil)
+ c.Check(trashReqs.Count(), check.Equals, 8)
+ c.Check(pullReqs.Count(), check.Equals, 4)
+ stats := bal.getStatistics()
+ // "foo" block is overreplicated by 2
+ c.Check(stats.trashes, check.Equals, 2)
+ // "bar" block is underreplicated by 1, and its only copy is
+ // in a poor rendezvous position
+ c.Check(stats.pulls, check.Equals, 2)
+}
+
+func (s *runSuite) TestRunForever(c *check.C) {
+ opts := RunOptions{
+ CommitPulls: true,
+ CommitTrash: true,
+ Logger: s.logger(c),
+ Dumper: s.logger(c),
+ }
+ s.stub.serveCurrentUserAdmin()
+ s.stub.serveFooBarFileCollections()
+ s.stub.serveFourDiskKeepServices()
+ s.stub.serveKeepstoreIndexFoo4Bar1()
+ trashReqs := s.stub.serveKeepstoreTrash()
+ pullReqs := s.stub.serveKeepstorePull()
+
+ stop := make(chan interface{})
+ s.config.RunPeriod = arvados.Duration(time.Millisecond)
+ go RunForever(s.config, opts, stop)
+
+ // Each run should send 4 clear trash lists + 4 pull lists + 4
+ // trash lists. We should complete four runs in much less than
+ // a second.
+ for t0 := time.Now(); pullReqs.Count() < 16 && time.Since(t0) < 10*time.Second; {
+ time.Sleep(time.Millisecond)
+ }
+ stop <- true
+ c.Check(pullReqs.Count() >= 16, check.Equals, true)
+ c.Check(trashReqs.Count(), check.Equals, 2*pullReqs.Count())
+}
--- /dev/null
+package main
+
+import (
+ "crypto/md5"
+ "fmt"
+ "sort"
+ "strconv"
+ "testing"
+ "time"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+
+ check "gopkg.in/check.v1"
+)
+
+// Test with Gocheck
+func Test(t *testing.T) {
+ check.TestingT(t)
+}
+
+var _ = check.Suite(&balancerSuite{})
+
+type balancerSuite struct {
+ Balancer
+ srvs []*KeepService
+ blks map[string]tester
+ knownRendezvous [][]int
+ signatureTTL int64
+}
+
+const (
+ // index into knownRendezvous
+ known0 = 0
+)
+
+type slots []int
+
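+// tester describes one balanceBlock scenario: the block's desired
+// replication, where replicas currently are, and which servers should
+// receive pull/trash requests. Slot numbers refer to positions in the
+// block's rendezvous probe order (slot 0 = best-placed server).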
+type tester struct {
+ known int
+ desired int
+ current slots
+ timestamps []int64
+ shouldPull slots
+ shouldTrash slots
+}
+
+func (bal *balancerSuite) SetUpSuite(c *check.C) {
+ bal.knownRendezvous = nil
+ for _, str := range []string{
+ "3eab2d5fc9681074",
+ "097dba52e648f1c3",
+ "c5b4e023f8a7d691",
+ "9d81c02e76a3bf54",
+ } {
+ var slots []int
+ for _, c := range []byte(str) {
+ pos, _ := strconv.ParseUint(string(c), 16, 4)
+ slots = append(slots, int(pos))
+ }
+ bal.knownRendezvous = append(bal.knownRendezvous, slots)
+ }
+
+ bal.signatureTTL = 3600
+}
+
+func (bal *balancerSuite) SetUpTest(c *check.C) {
+ bal.srvs = make([]*KeepService, 16)
+ bal.KeepServices = make(map[string]*KeepService)
+ for i := range bal.srvs {
+ srv := &KeepService{
+ KeepService: arvados.KeepService{
+ UUID: fmt.Sprintf("zzzzz-bi6l4-%015x", i),
+ },
+ }
+ bal.srvs[i] = srv
+ bal.KeepServices[srv.UUID] = srv
+ }
+
+ bal.MinMtime = time.Now().Unix() - bal.signatureTTL
+}
+
+func (bal *balancerSuite) TestPerfect(c *check.C) {
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{0, 1},
+ shouldPull: nil,
+ shouldTrash: nil})
+}
+
+func (bal *balancerSuite) TestDecreaseRepl(c *check.C) {
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{0, 2, 1},
+ shouldTrash: slots{2}})
+}
+
+func (bal *balancerSuite) TestDecreaseReplToZero(c *check.C) {
+ bal.try(c, tester{
+ desired: 0,
+ current: slots{0, 1, 3},
+ shouldTrash: slots{0, 1, 3}})
+}
+
+func (bal *balancerSuite) TestIncreaseRepl(c *check.C) {
+ bal.try(c, tester{
+ desired: 4,
+ current: slots{0, 1},
+ shouldPull: slots{2, 3}})
+}
+
+func (bal *balancerSuite) TestSkipReadonly(c *check.C) {
+ bal.srvList(0, slots{3})[0].ReadOnly = true
+ bal.try(c, tester{
+ desired: 4,
+ current: slots{0, 1},
+ shouldPull: slots{2, 4}})
+}
+
+func (bal *balancerSuite) TestFixUnbalanced(c *check.C) {
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{2, 0},
+ shouldPull: slots{1}})
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{2, 7},
+ shouldPull: slots{0, 1}})
+ // if only one of the pulls succeeds, we'll see this next:
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{2, 1, 7},
+ shouldPull: slots{0},
+ shouldTrash: slots{7}})
+ // if both pulls succeed, we'll see this next:
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{2, 0, 1, 7},
+ shouldTrash: slots{2, 7}})
+
+ // unbalanced + excessive replication => pull + trash
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{2, 5, 7},
+ shouldPull: slots{0, 1},
+ shouldTrash: slots{7}})
+}
+
+func (bal *balancerSuite) TestIncreaseReplTimestampCollision(c *check.C) {
+ // For purposes of increasing replication, we assume identical
+ // replicas are distinct.
+ bal.try(c, tester{
+ desired: 4,
+ current: slots{0, 1},
+ timestamps: []int64{12345678, 12345678},
+ shouldPull: slots{2, 3}})
+}
+
+func (bal *balancerSuite) TestDecreaseReplTimestampCollision(c *check.C) {
+ // For purposes of decreasing replication, we assume identical
+ // replicas are NOT distinct.
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{0, 1, 2},
+ timestamps: []int64{12345678, 12345678, 12345678}})
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{0, 1, 2},
+ timestamps: []int64{12345678, 10000000, 10000000}})
+}
+
+func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {
+ oldTime := bal.MinMtime - 3600
+ newTime := bal.MinMtime + 3600
+ // The excess replica is too new to delete.
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{0, 1, 2},
+ timestamps: []int64{oldTime, newTime, newTime + 1}})
+ // The best replicas are too new to delete, but the excess
+ // replica is old enough.
+ bal.try(c, tester{
+ desired: 2,
+ current: slots{0, 1, 2},
+ timestamps: []int64{newTime, newTime + 1, oldTime},
+ shouldTrash: slots{2}})
+}
+
+// Clear all servers' changesets, balance a single block, and verify
+// the appropriate changes for that block have been added to the
+// changesets.
+func (bal *balancerSuite) try(c *check.C, t tester) {
+ bal.setupServiceRoots()
+ blk := &BlockState{
+ Desired: t.desired,
+ Replicas: bal.replList(t.known, t.current)}
+ for i, t := range t.timestamps {
+ blk.Replicas[i].Mtime = t
+ }
+ for _, srv := range bal.srvs {
+ srv.ChangeSet = &ChangeSet{}
+ }
+ bal.balanceBlock(knownBlkid(t.known), blk)
+
+ var didPull, didTrash slots
+ for i, srv := range bal.srvs {
+ var slot int
+ for probeOrder, srvNum := range bal.knownRendezvous[t.known] {
+ if srvNum == i {
+ slot = probeOrder
+ }
+ }
+ for _, pull := range srv.Pulls {
+ didPull = append(didPull, slot)
+ c.Check(pull.SizedDigest, check.Equals, knownBlkid(t.known))
+ }
+ for _, trash := range srv.Trashes {
+ didTrash = append(didTrash, slot)
+ c.Check(trash.SizedDigest, check.Equals, knownBlkid(t.known))
+ }
+ }
+
+ for _, list := range []slots{didPull, didTrash, t.shouldPull, t.shouldTrash} {
+ sort.Sort(sort.IntSlice(list))
+ }
+ c.Check(didPull, check.DeepEquals, t.shouldPull)
+ c.Check(didTrash, check.DeepEquals, t.shouldTrash)
+}
+
+// srvList returns the KeepServices, sorted in rendezvous order and
+// then selected by the given slots. For example, srvList(3, slots{0, 1, 4})
+// returns the first-, second-, and fifth-best servers for storing
+// knownBlkid(3).
+func (bal *balancerSuite) srvList(knownBlockID int, order slots) (srvs []*KeepService) {
+ for _, i := range order {
+ srvs = append(srvs, bal.srvs[bal.knownRendezvous[knownBlockID][i]])
+ }
+ return
+}
+
+// replList is like srvList but returns an "existing replicas" slice,
+// suitable for a BlockState test fixture.
+func (bal *balancerSuite) replList(knownBlockID int, order slots) (repls []Replica) {
+ mtime := time.Now().Unix() - bal.signatureTTL - 86400
+ for _, srv := range bal.srvList(knownBlockID, order) {
+ repls = append(repls, Replica{srv, mtime})
+ mtime++
+ }
+ return
+}
+
+// generate the same data hashes that are tested in
+// sdk/go/keepclient/root_sorter_test.go
+func knownBlkid(i int) arvados.SizedDigest {
+ return arvados.SizedDigest(fmt.Sprintf("%x+64", md5.Sum([]byte(fmt.Sprintf("%064x", i)))))
+}
--- /dev/null
+package main
+
+import (
+ "sync"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// Replica is a file on disk (or object in an S3 bucket, or blob in an
+// Azure storage container, etc.) as reported in a keepstore index
+// response.
+type Replica struct {
+ *KeepService
+ Mtime int64
+}
+
+// BlockState indicates the number of desired replicas (according to
+// the collections we know about) and the replicas actually stored
+// (according to the keepstore indexes we know about).
+type BlockState struct {
+ Replicas []Replica
+ Desired int
+}
+
+func (bs *BlockState) addReplica(r Replica) {
+ bs.Replicas = append(bs.Replicas, r)
+}
+
+func (bs *BlockState) increaseDesired(n int) {
+ if bs.Desired < n {
+ bs.Desired = n
+ }
+}
+
+// BlockStateMap is a goroutine-safe wrapper around a
+// map[arvados.SizedDigest]*BlockState.
+type BlockStateMap struct {
+ entries map[arvados.SizedDigest]*BlockState
+ mutex sync.Mutex
+}
+
+// NewBlockStateMap returns a newly allocated BlockStateMap.
+func NewBlockStateMap() *BlockStateMap {
+ return &BlockStateMap{
+ entries: make(map[arvados.SizedDigest]*BlockState),
+ }
+}
+
+// return a BlockState entry, allocating a new one if needed. (Private
+// method: not goroutine-safe.)
+func (bsm *BlockStateMap) get(blkid arvados.SizedDigest) *BlockState {
+ // TODO? Allocate BlockState structs a slice at a time,
+ // instead of one at a time.
+ blk := bsm.entries[blkid]
+ if blk == nil {
+ blk = &BlockState{}
+ bsm.entries[blkid] = blk
+ }
+ return blk
+}
+
+// Apply runs f on each entry in the map.
+func (bsm *BlockStateMap) Apply(f func(arvados.SizedDigest, *BlockState)) {
+ bsm.mutex.Lock()
+ defer bsm.mutex.Unlock()
+
+ for blkid, blk := range bsm.entries {
+ f(blkid, blk)
+ }
+}
+
+// AddReplicas updates the map to indicate srv has a replica of each
+// block in idx.
+func (bsm *BlockStateMap) AddReplicas(srv *KeepService, idx []arvados.KeepServiceIndexEntry) {
+ bsm.mutex.Lock()
+ defer bsm.mutex.Unlock()
+
+ for _, ent := range idx {
+ bsm.get(ent.SizedDigest).addReplica(Replica{
+ KeepService: srv,
+ Mtime: ent.Mtime,
+ })
+ }
+}
+
+// IncreaseDesired updates the map to indicate the desired replication
+// for the given blocks is at least n.
+func (bsm *BlockStateMap) IncreaseDesired(n int, blocks []arvados.SizedDigest) {
+ bsm.mutex.Lock()
+ defer bsm.mutex.Unlock()
+
+ for _, blkid := range blocks {
+ bsm.get(blkid).increaseDesired(n)
+ }
+}
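+
+// Typical balancer-side use (an illustrative sketch; srv, idx, and
+// blkid come from other parts of the balancer and are assumed here):
+//
+//	bsm := NewBlockStateMap()
+//	bsm.AddReplicas(srv, idx)
+//	bsm.IncreaseDesired(2, []arvados.SizedDigest{blkid})
+//	bsm.Apply(func(id arvados.SizedDigest, blk *BlockState) {
+//		fmt.Printf("%v: have %d, want %d\n", id, len(blk.Replicas), blk.Desired)
+//	})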
--- /dev/null
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "sync"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// Pull is a request to retrieve a block from a remote server, and
+// store it locally.
+type Pull struct {
+ arvados.SizedDigest
+ Source *KeepService
+}
+
+// MarshalJSON formats a pull request the way keepstore wants to see
+// it.
+func (p Pull) MarshalJSON() ([]byte, error) {
+ type KeepstorePullRequest struct {
+ Locator string `json:"locator"`
+ Servers []string `json:"servers"`
+ }
+ return json.Marshal(KeepstorePullRequest{
+ Locator: string(p.SizedDigest[:32]),
+ Servers: []string{p.Source.URLBase()}})
+}
+
+// Trash is a request to delete a block.
+type Trash struct {
+ arvados.SizedDigest
+ Mtime int64
+}
+
+// MarshalJSON formats a trash request the way keepstore wants to see
+// it, i.e., as a bare locator with no +size hint.
+func (t Trash) MarshalJSON() ([]byte, error) {
+ type KeepstoreTrashRequest struct {
+ Locator string `json:"locator"`
+ BlockMtime int64 `json:"block_mtime"`
+ }
+ return json.Marshal(KeepstoreTrashRequest{
+ Locator: string(t.SizedDigest[:32]),
+ BlockMtime: t.Mtime})
+}
+
+// ChangeSet is a set of change requests that will be sent to a
+// keepstore server.
+type ChangeSet struct {
+ Pulls []Pull
+ Trashes []Trash
+ mutex sync.Mutex
+}
+
+// AddPull adds a Pull operation.
+func (cs *ChangeSet) AddPull(p Pull) {
+ cs.mutex.Lock()
+ cs.Pulls = append(cs.Pulls, p)
+ cs.mutex.Unlock()
+}
+
+// AddTrash adds a Trash operation
+func (cs *ChangeSet) AddTrash(t Trash) {
+ cs.mutex.Lock()
+ cs.Trashes = append(cs.Trashes, t)
+ cs.mutex.Unlock()
+}
+
+// String implements fmt.Stringer.
+func (cs *ChangeSet) String() string {
+ cs.mutex.Lock()
+ defer cs.mutex.Unlock()
+ return fmt.Sprintf("ChangeSet{Pulls:%d, Trashes:%d}", len(cs.Pulls), len(cs.Trashes))
+}
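+
+// Typical use while balancing a block (an illustrative sketch; srv,
+// otherSrv, blkid, and mtime are assumed, not defined in this file):
+//
+//	srv.ChangeSet.AddPull(Pull{SizedDigest: blkid, Source: otherSrv})
+//	srv.ChangeSet.AddTrash(Trash{SizedDigest: blkid, Mtime: mtime})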
--- /dev/null
+package main
+
+import (
+ "encoding/json"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+
+ check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&changeSetSuite{})
+
+type changeSetSuite struct{}
+
+func (s *changeSetSuite) TestJSONFormat(c *check.C) {
+ srv := &KeepService{
+ KeepService: arvados.KeepService{
+ UUID: "zzzzz-bi6l4-000000000000001",
+ ServiceType: "disk",
+ ServiceSSLFlag: false,
+ ServiceHost: "keep1.zzzzz.arvadosapi.com",
+ ServicePort: 25107}}
+
+ buf, err := json.Marshal([]Pull{{
+ SizedDigest: arvados.SizedDigest("acbd18db4cc2f85cedef654fccc4a4d8+3"),
+ Source: srv}})
+ c.Check(err, check.IsNil)
+ c.Check(string(buf), check.Equals, `[{"locator":"acbd18db4cc2f85cedef654fccc4a4d8","servers":["http://keep1.zzzzz.arvadosapi.com:25107"]}]`)
+
+ buf, err = json.Marshal([]Trash{{
+ SizedDigest: arvados.SizedDigest("acbd18db4cc2f85cedef654fccc4a4d8+3"),
+ Mtime: 123456789}})
+ c.Check(err, check.IsNil)
+ c.Check(string(buf), check.Equals, `[{"locator":"acbd18db4cc2f85cedef654fccc4a4d8","block_mtime":123456789}]`)
+}
--- /dev/null
+package main
+
+import (
+ "fmt"
+ "time"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
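+// countCollections returns the number of collections matching params
+// without retrieving any collection records: it requests a page with
+// limit 0 and reads items_available from the response.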
+func countCollections(c *arvados.Client, params arvados.ResourceListParams) (int, error) {
+ var page arvados.CollectionList
+ var zero int
+ params.Limit = &zero
+ err := c.RequestAndDecode(&page, "GET", "arvados/v1/collections", nil, params)
+ return page.ItemsAvailable, err
+}
+
+// EachCollection calls f once for every readable
+// collection. EachCollection stops if it encounters an error, such as
+// f returning a non-nil error.
+//
+// The progress function is called periodically with done (number of
+// times f has been called) and total (number of times f is expected
+// to be called).
+func EachCollection(c *arvados.Client, f func(arvados.Collection) error, progress func(done, total int)) error {
+ if progress == nil {
+ progress = func(_, _ int) {}
+ }
+
+ expectCount, err := countCollections(c, arvados.ResourceListParams{})
+ if err != nil {
+ return err
+ }
+
+ limit := 1000
+ params := arvados.ResourceListParams{
+ Limit: &limit,
+ Order: "modified_at, uuid",
+ Select: []string{"uuid", "manifest_text", "modified_at", "portable_data_hash", "replication_desired"},
+ }
+ var last arvados.Collection
+ var filterTime time.Time
+ callCount := 0
+ for {
+ progress(callCount, expectCount)
+ var page arvados.CollectionList
+ err := c.RequestAndDecode(&page, "GET", "arvados/v1/collections", nil, params)
+ if err != nil {
+ return err
+ }
+ for _, coll := range page.Items {
+ if last.ModifiedAt != nil && *last.ModifiedAt == *coll.ModifiedAt && last.UUID >= coll.UUID {
+ continue
+ }
+ callCount++
+ err = f(coll)
+ if err != nil {
+ return err
+ }
+ last = coll
+ }
+ if last.ModifiedAt == nil || *last.ModifiedAt == filterTime {
+ if page.ItemsAvailable > len(page.Items) {
+ // TODO: use "mtime=X && UUID>Y"
+ // filters to get all collections with
+ // this timestamp, then use "mtime>X"
+ // to get the next timestamp.
+ return fmt.Errorf("BUG: Received an entire page with the same modified_at timestamp (%v), cannot make progress", filterTime)
+ }
+ break
+ }
+ filterTime = *last.ModifiedAt
+ params.Filters = []arvados.Filter{{
+ Attr: "modified_at",
+ Operator: ">=",
+ Operand: filterTime,
+ }, {
+ Attr: "uuid",
+ Operator: "!=",
+ Operand: last.UUID,
+ }}
+ }
+ progress(callCount, expectCount)
+
+ if checkCount, err := countCollections(c, arvados.ResourceListParams{Filters: []arvados.Filter{{
+ Attr: "modified_at",
+ Operator: "<=",
+ Operand: filterTime}}}); err != nil {
+ return err
+ } else if callCount < checkCount {
+ return fmt.Errorf("Retrieved %d collections with modtime <= T=%q, but server now reports there are %d collections with modtime <= T", callCount, filterTime, checkCount)
+ }
+
+ return nil
+}
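+
+// Minimal caller sketch (illustrative; the client value and the log
+// output are examples, not part of this package):
+//
+//	err := EachCollection(client, func(coll arvados.Collection) error {
+//		log.Printf("visiting %s", coll.UUID)
+//		return nil
+//	}, func(done, total int) {
+//		log.Printf("progress: %d/%d collections", done, total)
+//	})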
--- /dev/null
+package main
+
+import (
+ "bytes"
+ "log"
+ "net/http"
+ "os"
+ "strings"
+ "testing"
+ "time"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+ "git.curoverse.com/arvados.git/sdk/go/arvadostest"
+ "git.curoverse.com/arvados.git/sdk/go/keepclient"
+
+ check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&integrationSuite{})
+
+type integrationSuite struct {
+ config Config
+ keepClient *keepclient.KeepClient
+}
+
+func (s *integrationSuite) SetUpSuite(c *check.C) {
+ if testing.Short() {
+ c.Skip("-short")
+ }
+ arvadostest.ResetEnv()
+ arvadostest.StartAPI()
+ arvadostest.StartKeep(4, true)
+
+ arv, err := arvadosclient.MakeArvadosClient()
+ c.Assert(err, check.IsNil)
+ arv.ApiToken = arvadostest.DataManagerToken
+ s.keepClient = &keepclient.KeepClient{
+ Arvados: &arv,
+ Client: &http.Client{},
+ }
+ c.Assert(s.keepClient.DiscoverKeepServers(), check.IsNil)
+ s.putReplicas(c, "foo", 4)
+ s.putReplicas(c, "bar", 1)
+}
+
+func (s *integrationSuite) putReplicas(c *check.C, data string, replicas int) {
+ s.keepClient.Want_replicas = replicas
+ _, _, err := s.keepClient.PutB([]byte(data))
+ c.Assert(err, check.IsNil)
+}
+
+func (s *integrationSuite) TearDownSuite(c *check.C) {
+ if testing.Short() {
+ c.Skip("-short")
+ }
+ arvadostest.StopKeep(4)
+ arvadostest.StopAPI()
+}
+
+func (s *integrationSuite) SetUpTest(c *check.C) {
+ s.config = Config{
+ Client: arvados.Client{
+ APIHost: os.Getenv("ARVADOS_API_HOST"),
+ AuthToken: arvadostest.DataManagerToken,
+ Insecure: true,
+ },
+ KeepServiceTypes: []string{"disk"},
+ }
+}
+
+func (s *integrationSuite) TestBalanceAPIFixtures(c *check.C) {
+ var logBuf *bytes.Buffer
+ for iter := 0; iter < 20; iter++ {
+ logBuf = &bytes.Buffer{}
+ opts := RunOptions{
+ CommitPulls: true,
+ CommitTrash: true,
+ Logger: log.New(logBuf, "", log.LstdFlags),
+ }
+ err := (&Balancer{}).Run(s.config, opts)
+ c.Check(err, check.IsNil)
+ if iter == 0 {
+ c.Check(logBuf.String(), check.Matches, `(?ms).*ChangeSet{Pulls:1.*`)
+ c.Check(logBuf.String(), check.Not(check.Matches), `(?ms).*ChangeSet{.*Trashes:[^0]}*`)
+ } else if strings.Contains(logBuf.String(), "ChangeSet{Pulls:0") {
+ break
+ }
+ time.Sleep(200 * time.Millisecond)
+ }
+ c.Check(logBuf.String(), check.Not(check.Matches), `(?ms).*0 replicas \(0 blocks, 0 bytes\) underreplicated.*`)
+}
--- /dev/null
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "net/http"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// KeepService represents a keepstore server that is being rebalanced.
+type KeepService struct {
+ arvados.KeepService
+ *ChangeSet
+}
+
+// String implements fmt.Stringer.
+func (srv *KeepService) String() string {
+ return fmt.Sprintf("%s (%s:%d, %s)", srv.UUID, srv.ServiceHost, srv.ServicePort, srv.ServiceType)
+}
+
+var ksSchemes = map[bool]string{false: "http", true: "https"}
+
+// URLBase returns scheme://host:port for this server.
+func (srv *KeepService) URLBase() string {
+ return fmt.Sprintf("%s://%s:%d", ksSchemes[srv.ServiceSSLFlag], srv.ServiceHost, srv.ServicePort)
+}
+
+// CommitPulls sends the current list of pull requests to the storage
+// server (even if the list is empty).
+func (srv *KeepService) CommitPulls(c *arvados.Client) error {
+ return srv.put(c, "pull", srv.ChangeSet.Pulls)
+}
+
+// CommitTrash sends the current list of trash requests to the storage
+// server (even if the list is empty).
+func (srv *KeepService) CommitTrash(c *arvados.Client) error {
+ return srv.put(c, "trash", srv.ChangeSet.Trashes)
+}
+
+// Perform a PUT request at path, with data (as JSON) in the request
+// body.
+func (srv *KeepService) put(c *arvados.Client, path string, data interface{}) error {
+ // We'll start a goroutine to do the JSON encoding, so we can
+ // stream it to the http client through a Pipe, rather than
+ // keeping the entire encoded version in memory.
+ jsonR, jsonW := io.Pipe()
+
+ // errC communicates any encoding errors back to our main
+ // goroutine.
+ errC := make(chan error, 1)
+
+ go func() {
+ enc := json.NewEncoder(jsonW)
+ errC <- enc.Encode(data)
+ jsonW.Close()
+ }()
+
+ url := srv.URLBase() + "/" + path
+ req, err := http.NewRequest("PUT", url, ioutil.NopCloser(jsonR))
+ if err != nil {
+ return fmt.Errorf("building request for %s: %v", url, err)
+ }
+ err = c.DoAndDecode(nil, req)
+
+ // If there was an error encoding the request body, report
+ // that instead of the response: obviously we won't get a
+ // useful response if our request wasn't properly encoded.
+ if encErr := <-errC; encErr != nil {
+ return fmt.Errorf("encoding data for %s: %v", url, encErr)
+ }
+
+ return err
+}
--- /dev/null
+package main
+
+import (
+ "encoding/json"
+ "flag"
+ "io/ioutil"
+ "log"
+ "os"
+ "os/signal"
+ "syscall"
+ "time"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvados"
+)
+
+// Config specifies site configuration, like API credentials and the
+// choice of which servers are to be balanced.
+//
+// Config is loaded from a JSON config file (see usage()).
+type Config struct {
+ // Arvados API endpoint and credentials.
+ Client arvados.Client
+
+ // List of service types (e.g., "disk") to balance.
+ KeepServiceTypes []string
+
+ KeepServiceList arvados.KeepServiceList
+
+ // How often to run a scan/balance operation
+ RunPeriod arvados.Duration
+}
+
+// RunOptions controls runtime behavior. The flags/options that belong
+// here are the ones that are useful for interactive use. For example,
+// "CommitTrash" is a runtime option rather than a config item because
+// it invokes a troubleshooting feature rather than expressing how
+// balancing is meant to be done at a given site.
+//
+// RunOptions fields are controlled by command line flags.
+type RunOptions struct {
+ Once bool
+ CommitPulls bool
+ CommitTrash bool
+ Logger *log.Logger
+ Dumper *log.Logger
+}
+
+var debugf = func(string, ...interface{}) {}
+
+func main() {
+ var config Config
+ var runOptions RunOptions
+
+ configPath := flag.String("config", "",
+ "`path` of json configuration file")
+ serviceListPath := flag.String("config.KeepServiceList", "",
+ "`path` of json file with list of keep services to balance, as given by \"arv keep_service list\" "+
+ "(default: config[\"KeepServiceList\"], or if none given, get all available services and filter by config[\"KeepServiceTypes\"])")
+ flag.BoolVar(&runOptions.Once, "once", false,
+ "balance once and then exit")
+ flag.BoolVar(&runOptions.CommitPulls, "commit-pulls", false,
+ "send pull requests (make more replicas of blocks that are underreplicated or are not in optimal rendezvous probe order)")
+ flag.BoolVar(&runOptions.CommitTrash, "commit-trash", false,
+ "send trash requests (delete unreferenced old blocks, and excess replicas of overreplicated blocks)")
+ dumpFlag := flag.Bool("dump", false, "dump details for each block to stdout")
+ debugFlag := flag.Bool("debug", false, "enable debug messages")
+ flag.Usage = usage
+ flag.Parse()
+
+ if *configPath == "" {
+ log.Fatal("You must specify a config file (see `keep-balance -help`)")
+ }
+ mustReadJSON(&config, *configPath)
+ if *serviceListPath != "" {
+ mustReadJSON(&config.KeepServiceList, *serviceListPath)
+ }
+
+ if *debugFlag {
+ debugf = log.Printf
+ if j, err := json.Marshal(config); err != nil {
+ log.Fatal(err)
+ } else {
+ log.Printf("config is %s", j)
+ }
+ }
+ if *dumpFlag {
+ runOptions.Dumper = log.New(os.Stdout, "", log.LstdFlags)
+ }
+ err := CheckConfig(config, runOptions)
+ if err != nil {
+ // (don't run)
+ } else if runOptions.Once {
+ err = (&Balancer{}).Run(config, runOptions)
+ } else {
+ err = RunForever(config, runOptions, nil)
+ }
+ if err != nil {
+ log.Fatal(err)
+ }
+}
+
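+// mustReadJSON loads the JSON file at path into dst, and terminates
+// the program with a fatal log message if the file cannot be read or
+// decoded.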
+func mustReadJSON(dst interface{}, path string) {
+ if buf, err := ioutil.ReadFile(path); err != nil {
+ log.Fatalf("Reading %q: %v", path, err)
+ } else if err = json.Unmarshal(buf, dst); err != nil {
+ log.Fatalf("Decoding %q: %v", path, err)
+ }
+}
+
+// RunForever runs forever, or (for testing purposes) until the given
+// stop channel is ready to receive.
+func RunForever(config Config, runOptions RunOptions, stop <-chan interface{}) error {
+ if runOptions.Logger == nil {
+ runOptions.Logger = log.New(os.Stderr, "", log.LstdFlags)
+ }
+ logger := runOptions.Logger
+
+ ticker := time.NewTicker(time.Duration(config.RunPeriod))
+
+ // The unbuffered channel here means we only hear SIGUSR1 if
+ // it arrives while we're waiting in select{}.
+ sigUSR1 := make(chan os.Signal)
+ signal.Notify(sigUSR1, syscall.SIGUSR1)
+
+ logger.Printf("starting up: will scan every %v and on SIGUSR1", config.RunPeriod)
+
+ for {
+ if !runOptions.CommitPulls && !runOptions.CommitTrash {
+ logger.Print("WARNING: Will scan periodically, but no changes will be committed.")
+ logger.Print("======= Consider using -commit-pulls and -commit-trash flags.")
+ }
+
+ err := (&Balancer{}).Run(config, runOptions)
+ if err != nil {
+ logger.Print("run failed: ", err)
+ } else {
+ logger.Print("run succeeded")
+ }
+
+ select {
+ case <-stop:
+ signal.Stop(sigUSR1)
+ return nil
+ case <-ticker.C:
+ logger.Print("timer went off")
+ case <-sigUSR1:
+ logger.Print("received SIGUSR1, resetting timer")
+ // Reset the timer so we don't start the N+1st
+ // run too soon after the Nth run is triggered
+ // by SIGUSR1.
+ ticker.Stop()
+ ticker = time.NewTicker(time.Duration(config.RunPeriod))
+ }
+ logger.Print("starting next run")
+ }
+}
--- /dev/null
+package main
+
+import (
+ "encoding/json"
+ "time"
+
+ check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&mainSuite{})
+
+type mainSuite struct{}
+
+func (s *mainSuite) TestExampleJSON(c *check.C) {
+ var config Config
+ c.Check(json.Unmarshal(exampleConfigFile, &config), check.IsNil)
+ c.Check(config.KeepServiceTypes, check.DeepEquals, []string{"disk"})
+ c.Check(config.Client.AuthToken, check.Equals, "xyzzy")
+ c.Check(time.Duration(config.RunPeriod), check.Equals, 600*time.Second)
+}
+
+func (s *mainSuite) TestConfigJSONWithKeepServiceList(c *check.C) {
+ var config Config
+ c.Check(json.Unmarshal([]byte(`
+ {
+ "Client": {
+ "APIHost": "zzzzz.arvadosapi.com:443",
+ "AuthToken": "xyzzy",
+ "Insecure": false
+ },
+ "KeepServiceList": {
+ "items": [
+ {"uuid":"zzzzz-bi64l-abcdefghijklmno", "service_type":"disk", "service_host":"a.zzzzz.arvadosapi.com", "service_port":12345},
+ {"uuid":"zzzzz-bi64l-bcdefghijklmnop", "service_type":"blob", "service_host":"b.zzzzz.arvadosapi.com", "service_port":12345}
+ ]
+ },
+ "RunPeriod": "600s"
+ }`), &config), check.IsNil)
+ c.Assert(len(config.KeepServiceList.Items), check.Equals, 2)
+ c.Check(config.KeepServiceList.Items[0].UUID, check.Equals, "zzzzz-bi64l-abcdefghijklmno")
+ c.Check(config.KeepServiceList.Items[0].ServicePort, check.Equals, 12345)
+ c.Check(config.Client.AuthToken, check.Equals, "xyzzy")
+}
--- /dev/null
+package main
+
+import (
+ "log"
+ "time"
+)
+
+func timeMe(logger *log.Logger, label string) func() {
+ t0 := time.Now()
+ logger.Printf("%s: start", label)
+ return func() {
+ logger.Printf("%s: took %v", label, time.Since(t0))
+ }
+}
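+
+// Usage sketch (illustrative): time a section of work by deferring
+// the returned func at the top of the section:
+//
+//	defer timeMe(logger, "send pull list")()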
--- /dev/null
+package main
+
+import (
+ "flag"
+ "fmt"
+ "os"
+)
+
+var exampleConfigFile = []byte(`
+ {
+ "Client": {
+ "APIHost": "zzzzz.arvadosapi.com:443",
+ "AuthToken": "xyzzy",
+ "Insecure": false
+ },
+ "KeepServiceTypes": [
+ "disk"
+ ],
+ "RunPeriod": "600s"
+ }`)
+
+func usage() {
+ fmt.Fprintf(os.Stderr, `
+
+keep-balance rebalances a set of keepstore servers. It creates new
+copies of underreplicated blocks, deletes excess copies of
+overreplicated and unreferenced blocks, and moves blocks to better
+positions (according to the rendezvous hash algorithm) so clients find
+them faster.
+
+Usage: keep-balance -config path/to/config.json [options]
+
+Options:
+`)
+ flag.PrintDefaults()
+ fmt.Fprintf(os.Stderr, `
+Example config file:
+%s
+
+ Client.AuthToken must be recognized by Arvados as an admin token,
+ and must be recognized by all Keep services as a "data manager
+ key".
+
+ Client.Insecure should be true if your Arvados API endpoint uses
+ an unverifiable SSL/TLS certificate.
+
+Periodic scanning:
+
+ By default, keep-balance operates periodically, i.e.: do a
+ scan/balance operation, sleep, repeat.
+
+ RunPeriod determines the interval between start times of
+ successive scan/balance operations. If a scan/balance operation
+ takes longer than RunPeriod, the next one will follow it
+ immediately.
+
+ If SIGUSR1 is received during an idle period between operations,
+ the next operation will start immediately.
+
+One-time scanning:
+
+ Use the -once flag to do a single operation and then exit. The
+ exit code will be zero if the operation was successful.
+
+Committing:
+
+ By default, keep-balance computes and reports changes but does not
+ implement them by sending pull and trash lists to the Keep
+ services.
+
+ Use the -commit-pulls and -commit-trash flags to implement the
+ computed changes.
+
+Limitations:
+
+ keep-balance does not attempt to discover whether committed pull
+ and trash requests ever get carried out -- only that they are
+ accepted by the Keep services. If some services are full, new
+ copies of underreplicated blocks might never get made, only
+ repeatedly requested.
+
+`, exampleConfigFile)
+}
statusCode, statusText = http.StatusInternalServerError, err.Error()
return
}
+ if kc.Client != nil && kc.Client.Transport != nil {
+ // Workaround for https://dev.arvados.org/issues/9005
+ if t, ok := kc.Client.Transport.(*http.Transport); ok {
+ defer t.CloseIdleConnections()
+ }
+ }
rdr, err := kc.CollectionFileReader(collection, filename)
if os.IsNotExist(err) {
statusCode = http.StatusNotFound
s.test100BlockFile(c, 10000000)
}
-func (s *IntegrationSuite) Test300MBFile(c *check.C) {
- s.test100BlockFile(c, 3000000)
+func (s *IntegrationSuite) Test100BlockFile(c *check.C) {
+ if testing.Short() {
+ // 3 MB
+ s.test100BlockFile(c, 30000)
+ } else {
+ // 300 MB
+ s.test100BlockFile(c, 3000000)
+ }
}
func (s *IntegrationSuite) test100BlockFile(c *check.C, blocksize int) {
}
if cache.RecallToken(tok) {
- // Valid in the cache, short circut
+ // Valid in the cache, short circuit
return true, tok
}
"log"
"os"
"regexp"
+ "strconv"
"strings"
"sync"
"time"
return nil
}
+// checkTrashed returns true, along with the blob's metadata, if the
+// expires_at metadata attribute is set on the block (i.e., it is trashed).
+func (v *AzureBlobVolume) checkTrashed(loc string) (bool, map[string]string, error) {
+ metadata, err := v.bsClient.GetBlobMetadata(v.containerName, loc)
+ if err != nil {
+ return false, metadata, v.translateError(err)
+ }
+ if metadata["expires_at"] != "" {
+ return true, metadata, nil
+ }
+ return false, metadata, nil
+}
+
// Get reads a Keep block that has been stored as a block blob in the
// container.
//
// If the block is younger than azureWriteRaceInterval and is
// unexpectedly empty, assume a PutBlob operation is in progress, and
// wait for it to finish writing.
-func (v *AzureBlobVolume) Get(loc string) ([]byte, error) {
+func (v *AzureBlobVolume) Get(loc string, buf []byte) (int, error) {
+ trashed, _, err := v.checkTrashed(loc)
+ if err != nil {
+ return 0, err
+ }
+ if trashed {
+ return 0, os.ErrNotExist
+ }
var deadline time.Time
haveDeadline := false
- buf, err := v.get(loc)
- for err == nil && len(buf) == 0 && loc != "d41d8cd98f00b204e9800998ecf8427e" {
+ size, err := v.get(loc, buf)
+ for err == nil && size == 0 && loc != "d41d8cd98f00b204e9800998ecf8427e" {
// Seeing a brand new empty block probably means we're
// in a race with CreateBlob, which under the hood
// (apparently) does "CreateEmpty" and "CommitData"
} else if time.Now().After(deadline) {
break
}
- bufs.Put(buf)
time.Sleep(azureWriteRacePollTime)
- buf, err = v.get(loc)
+ size, err = v.get(loc, buf)
}
if haveDeadline {
- log.Printf("Race ended with len(buf)==%d", len(buf))
+ log.Printf("Race ended with size==%d", size)
}
- return buf, err
+ return size, err
}
-func (v *AzureBlobVolume) get(loc string) ([]byte, error) {
- expectSize := BlockSize
+func (v *AzureBlobVolume) get(loc string, buf []byte) (int, error) {
+ expectSize := len(buf)
if azureMaxGetBytes < BlockSize {
// Unfortunately the handler doesn't tell us how long the blob
// is expected to be, so we have to ask Azure.
props, err := v.bsClient.GetBlobProperties(v.containerName, loc)
if err != nil {
- return nil, v.translateError(err)
+ return 0, v.translateError(err)
}
if props.ContentLength > int64(BlockSize) || props.ContentLength < 0 {
- return nil, fmt.Errorf("block %s invalid size %d (max %d)", loc, props.ContentLength, BlockSize)
+ return 0, fmt.Errorf("block %s invalid size %d (max %d)", loc, props.ContentLength, BlockSize)
}
expectSize = int(props.ContentLength)
}
- buf := bufs.Get(expectSize)
if expectSize == 0 {
- return buf, nil
+ return 0, nil
}
// We'll update this actualSize if/when we get the last piece.
if startPos == 0 && endPos == expectSize {
rdr, err = v.bsClient.GetBlob(v.containerName, loc)
} else {
- rdr, err = v.bsClient.GetBlobRange(v.containerName, loc, fmt.Sprintf("%d-%d", startPos, endPos-1))
+ rdr, err = v.bsClient.GetBlobRange(v.containerName, loc, fmt.Sprintf("%d-%d", startPos, endPos-1), nil)
}
if err != nil {
errors[p] = err
wg.Wait()
for _, err := range errors {
if err != nil {
- bufs.Put(buf)
- return nil, v.translateError(err)
+ return 0, v.translateError(err)
}
}
- return buf[:actualSize], nil
+ return actualSize, nil
}
// Compare the given data with existing stored data.
func (v *AzureBlobVolume) Compare(loc string, expect []byte) error {
+ trashed, _, err := v.checkTrashed(loc)
+ if err != nil {
+ return err
+ }
+ if trashed {
+ return os.ErrNotExist
+ }
rdr, err := v.bsClient.GetBlob(v.containerName, loc)
if err != nil {
return v.translateError(err)
if v.readonly {
return MethodDisabledError
}
- return v.bsClient.CreateBlockBlobFromReader(v.containerName, loc, uint64(len(block)), bytes.NewReader(block))
+ return v.bsClient.CreateBlockBlobFromReader(v.containerName, loc, uint64(len(block)), bytes.NewReader(block), nil)
}
// Touch updates the last-modified property of a block blob.
if v.readonly {
return MethodDisabledError
}
- return v.bsClient.SetBlobMetadata(v.containerName, loc, map[string]string{
- "touch": fmt.Sprintf("%d", time.Now()),
- })
+ trashed, metadata, err := v.checkTrashed(loc)
+ if err != nil {
+ return err
+ }
+ if trashed {
+ return os.ErrNotExist
+ }
+
+ metadata["touch"] = fmt.Sprintf("%d", time.Now())
+ return v.bsClient.SetBlobMetadata(v.containerName, loc, metadata, nil)
}
// Mtime returns the last-modified property of a block blob.
func (v *AzureBlobVolume) Mtime(loc string) (time.Time, error) {
+ trashed, _, err := v.checkTrashed(loc)
+ if err != nil {
+ return time.Time{}, err
+ }
+ if trashed {
+ return time.Time{}, os.ErrNotExist
+ }
+
props, err := v.bsClient.GetBlobProperties(v.containerName, loc)
if err != nil {
return time.Time{}, err
// container.
func (v *AzureBlobVolume) IndexTo(prefix string, writer io.Writer) error {
params := storage.ListBlobsParameters{
- Prefix: prefix,
+ Prefix: prefix,
+ Include: "metadata",
}
for {
resp, err := v.bsClient.ListBlobs(v.containerName, params)
// value.
continue
}
+ if b.Metadata["expires_at"] != "" {
+ // Trashed blob; exclude it from response
+ continue
+ }
fmt.Fprintf(writer, "%s+%d %d\n", b.Name, b.Properties.ContentLength, t.Unix())
}
if resp.NextMarker == "" {
return MethodDisabledError
}
- if trashLifetime != 0 {
- return ErrNotImplemented
- }
-
// Ideally we would use If-Unmodified-Since, but that
// particular condition seems to be ignored by Azure. Instead,
// we get the Etag before checking Mtime, and use If-Match to
} else if time.Since(t) < blobSignatureTTL {
return nil
}
- return v.bsClient.DeleteBlob(v.containerName, loc, map[string]string{
+
+ // If trashLifetime == 0, just delete it
+ if trashLifetime == 0 {
+ return v.bsClient.DeleteBlob(v.containerName, loc, map[string]string{
+ "If-Match": props.Etag,
+ })
+ }
+
+ // Otherwise, mark as trash
+ return v.bsClient.SetBlobMetadata(v.containerName, loc, map[string]string{
+ "expires_at": fmt.Sprintf("%d", time.Now().Add(trashLifetime).Unix()),
+ }, map[string]string{
"If-Match": props.Etag,
})
}
// Untrash a Keep block.
-// TBD
+// Delete the expires_at metadata attribute
func (v *AzureBlobVolume) Untrash(loc string) error {
- return ErrNotImplemented
+ // If expires_at is not set, the block is not in the trash: return os.ErrNotExist
+ metadata, err := v.bsClient.GetBlobMetadata(v.containerName, loc)
+ if err != nil {
+ return v.translateError(err)
+ }
+ if metadata["expires_at"] == "" {
+ return os.ErrNotExist
+ }
+
+ // reset expires_at metadata attribute
+ metadata["expires_at"] = ""
+ err = v.bsClient.SetBlobMetadata(v.containerName, loc, metadata, nil)
+ return v.translateError(err)
}
// Status returns a VolumeStatus struct with placeholder data.
switch {
case err == nil:
return err
- case strings.Contains(err.Error(), "404 Not Found"):
+ case strings.Contains(err.Error(), "Not Found"):
// "storage: service returned without a response body (404 Not Found)"
return os.ErrNotExist
default:
// EmptyTrash looks for trashed blocks that exceeded trashLifetime
// and deletes them from the volume.
-// TBD
func (v *AzureBlobVolume) EmptyTrash() {
+ var bytesDeleted, bytesInTrash int64
+ var blocksDeleted, blocksInTrash int
+ params := storage.ListBlobsParameters{Include: "metadata"}
+
+ for {
+ resp, err := v.bsClient.ListBlobs(v.containerName, params)
+ if err != nil {
+ log.Printf("EmptyTrash: ListBlobs: %v", err)
+ break
+ }
+ for _, b := range resp.Blobs {
+ // Check if the block is expired
+ if b.Metadata["expires_at"] == "" {
+ continue
+ }
+
+ blocksInTrash++
+ bytesInTrash += b.Properties.ContentLength
+
+ expiresAt, err := strconv.ParseInt(b.Metadata["expires_at"], 10, 64)
+ if err != nil {
+ log.Printf("EmptyTrash: ParseInt(%v): %v", b.Metadata["expires_at"], err)
+ continue
+ }
+
+ if expiresAt > time.Now().Unix() {
+ continue
+ }
+
+ err = v.bsClient.DeleteBlob(v.containerName, b.Name, map[string]string{
+ "If-Match": b.Properties.Etag,
+ })
+ if err != nil {
+ log.Printf("EmptyTrash: DeleteBlob(%v): %v", b.Name, err)
+ continue
+ }
+ blocksDeleted++
+ bytesDeleted += b.Properties.ContentLength
+ }
+ if resp.NextMarker == "" {
+ break
+ }
+ params.Marker = resp.NextMarker
+ }
+
+ log.Printf("EmptyTrash stats for %v: Deleted %v bytes in %v blocks. Remaining in trash: %v bytes in %v blocks.", v.String(), bytesDeleted, blocksDeleted, bytesInTrash-bytesDeleted, blocksInTrash-blocksDeleted)
}
h.blobs[container+"|"+hash] = &azBlob{
Data: data,
Mtime: time.Now(),
+ Metadata: make(map[string]string),
Uncommitted: make(map[string][]byte),
}
}
h.blobs[container+"|"+hash] = &azBlob{
Mtime: time.Now(),
Uncommitted: make(map[string][]byte),
+ Metadata: make(map[string]string),
Etag: makeEtag(),
}
h.unlockAndRace()
}
+ metadata := make(map[string]string)
+ for k, v := range r.Header {
+ if strings.HasPrefix(strings.ToLower(k), "x-ms-meta-") {
+ name := k[len("x-ms-meta-"):]
+ metadata[strings.ToLower(name)] = v[0]
+ }
+ }
h.blobs[container+"|"+hash] = &azBlob{
Data: body,
Mtime: time.Now(),
Uncommitted: make(map[string][]byte),
+ Metadata: metadata,
Etag: makeEtag(),
}
rw.WriteHeader(http.StatusCreated)
blob.Metadata = make(map[string]string)
for k, v := range r.Header {
if strings.HasPrefix(strings.ToLower(k), "x-ms-meta-") {
- blob.Metadata[k] = v[0]
+ name := k[len("x-ms-meta-"):]
+ blob.Metadata[strings.ToLower(name)] = v[0]
}
}
blob.Mtime = time.Now()
blob.Etag = makeEtag()
+ case (r.Method == "GET" || r.Method == "HEAD") && r.Form.Get("comp") == "metadata" && hash != "":
+ // "Get Blob Metadata" API
+ if !blobExists {
+ rw.WriteHeader(http.StatusNotFound)
+ return
+ }
+ for k, v := range blob.Metadata {
+ rw.Header().Set(fmt.Sprintf("x-ms-meta-%s", k), v)
+ }
+ return
case (r.Method == "GET" || r.Method == "HEAD") && hash != "":
// "Get Blob" API
if !blobExists {
}
if len(resp.Blobs) > 0 || marker == "" || marker == hash {
blob := h.blobs[container+"|"+hash]
- resp.Blobs = append(resp.Blobs, storage.Blob{
+ bmeta := map[string]string(nil)
+ if r.Form.Get("include") == "metadata" {
+ bmeta = blob.Metadata
+ }
+ b := storage.Blob{
Name: hash,
Properties: storage.BlobProperties{
LastModified: blob.Mtime.Format(time.RFC1123),
ContentLength: int64(len(blob.Data)),
Etag: blob.Etag,
},
- })
+ Metadata: bmeta,
+ }
+ resp.Blobs = append(resp.Blobs, b)
}
}
buf, err := xml.Marshal(resp)
if err != nil {
t.Error(err)
}
- gotData, err := v.Get(hash)
+ gotData := make([]byte, len(data))
+ gotLen, err := v.Get(hash, gotData)
if err != nil {
t.Error(err)
}
gotHash := fmt.Sprintf("%x", md5.Sum(gotData))
- gotLen := len(gotData)
- bufs.Put(gotData)
if gotLen != size {
t.Error("length mismatch: got %d != %d", gotLen, size)
}
// Wait for the stub's Put to create the empty blob
v.azHandler.race <- continuePut
go func() {
- buf, err := v.Get(TestHash)
+ buf := make([]byte, len(TestBlock))
+ _, err := v.Get(TestHash, buf)
if err != nil {
t.Error(err)
- } else {
- bufs.Put(buf)
}
close(allDone)
}()
allDone := make(chan struct{})
go func() {
defer close(allDone)
- buf, err := v.Get(TestHash)
+ buf := make([]byte, BlockSize)
+ n, err := v.Get(TestHash, buf)
if err != nil {
t.Error(err)
return
}
- if len(buf) != 0 {
- t.Errorf("Got %+q, expected empty buf", buf)
+ if n != 0 {
+ t.Errorf("Got %+q, expected empty buf", buf[:n])
}
- bufs.Put(buf)
}()
select {
case <-allDone:
expectedDc, responseDc)
}
// Confirm the block has been deleted
- _, err := vols[0].Get(TestHash)
+ buf := make([]byte, BlockSize)
+ _, err := vols[0].Get(TestHash, buf)
var blockDeleted = os.IsNotExist(err)
if !blockDeleted {
t.Error("superuserExistingBlockReq: block not deleted")
expectedDc, responseDc)
}
// Confirm the block has NOT been deleted.
- _, err = vols[0].Get(TestHash)
+ _, err = vols[0].Get(TestHash, buf)
if err != nil {
t.Errorf("testing delete on new block: %s\n", err)
}
if rt.apiToken != "" {
req.Header.Set("Authorization", "OAuth2 "+rt.apiToken)
}
- loggingRouter := MakeLoggingRESTRouter()
+ loggingRouter := MakeRESTRouter()
loggingRouter.ServeHTTP(response, req)
return response
}
}
}
+type notifyingResponseRecorder struct {
+ *httptest.ResponseRecorder
+ closer chan bool
+}
+
+func (r *notifyingResponseRecorder) CloseNotify() <-chan bool {
+ return r.closer
+}
+
+func TestGetHandlerClientDisconnect(t *testing.T) {
+ defer func(was bool) {
+ enforcePermissions = was
+ }(enforcePermissions)
+ enforcePermissions = false
+
+ defer func(orig *bufferPool) {
+ bufs = orig
+ }(bufs)
+ bufs = newBufferPool(1, BlockSize)
+ defer bufs.Put(bufs.Get(BlockSize))
+
+ KeepVM = MakeTestVolumeManager(2)
+ defer KeepVM.Close()
+
+ if err := KeepVM.AllWritable()[0].Put(TestHash, TestBlock); err != nil {
+ t.Error(err)
+ }
+
+ resp := ¬ifyingResponseRecorder{
+ ResponseRecorder: httptest.NewRecorder(),
+ closer: make(chan bool, 1),
+ }
+ if _, ok := http.ResponseWriter(resp).(http.CloseNotifier); !ok {
+ t.Fatal("notifyingResponseRecorder is broken")
+ }
+ // If anyone asks, the client has disconnected.
+ resp.closer <- true
+
+ ok := make(chan struct{})
+ go func() {
+ req, _ := http.NewRequest("GET", fmt.Sprintf("/%s+%d", TestHash, len(TestBlock)), nil)
+ (&LoggingRESTRouter{MakeRESTRouter()}).ServeHTTP(resp, req)
+ ok <- struct{}{}
+ }()
+
+ select {
+ case <-time.After(20 * time.Second):
+ t.Fatal("request took >20s, close notifier must be broken")
+ case <-ok:
+ }
+
+ ExpectStatusCode(t, "client disconnect", http.StatusServiceUnavailable, resp.ResponseRecorder)
+ for i, v := range KeepVM.AllWritable() {
+ if calls := v.(*MockVolume).called["GET"]; calls != 0 {
+ t.Errorf("volume %d got %d calls, expected 0", i, calls)
+ }
+ }
+}
+
// Invoke the GetBlockHandler a bunch of times to test for bufferpool resource
// leak.
func TestGetHandlerNoBufferleak(t *testing.T) {
}
}
- block, err := GetBlock(mux.Vars(req)["hash"])
+ // TODO: Probe volumes to check whether the block _might_
+ // exist. Some volumes/types could support a quick existence
+ // check without causing other operations to suffer. If all
+ // volumes support that, and assure us the block definitely
+ // isn't here, we can return 404 now instead of waiting for a
+ // buffer.
+
+ buf, err := getBufferForResponseWriter(resp, bufs, BlockSize)
if err != nil {
- // This type assertion is safe because the only errors
- // GetBlock can return are DiskHashError or NotFoundError.
- http.Error(resp, err.Error(), err.(*KeepError).HTTPCode)
+ http.Error(resp, err.Error(), http.StatusServiceUnavailable)
return
}
- defer bufs.Put(block)
+ defer bufs.Put(buf)
- resp.Header().Set("Content-Length", strconv.Itoa(len(block)))
+ size, err := GetBlock(mux.Vars(req)["hash"], buf, resp)
+ if err != nil {
+ code := http.StatusInternalServerError
+ if err, ok := err.(*KeepError); ok {
+ code = err.HTTPCode
+ }
+ http.Error(resp, err.Error(), code)
+ return
+ }
+
+ resp.Header().Set("Content-Length", strconv.Itoa(size))
resp.Header().Set("Content-Type", "application/octet-stream")
- resp.Write(block)
+ resp.Write(buf[:size])
+}
+
+// Get a buffer from the pool -- but give up and return a non-nil
+// error if resp implements http.CloseNotifier and tells us that the
+// client has disconnected before we get a buffer.
+func getBufferForResponseWriter(resp http.ResponseWriter, bufs *bufferPool, bufSize int) ([]byte, error) {
+ var closeNotifier <-chan bool
+ if resp, ok := resp.(http.CloseNotifier); ok {
+ closeNotifier = resp.CloseNotify()
+ }
+ var buf []byte
+ bufReady := make(chan []byte)
+ go func() {
+ bufReady <- bufs.Get(bufSize)
+ close(bufReady)
+ }()
+ select {
+ case buf = <-bufReady:
+ return buf, nil
+ case <-closeNotifier:
+ go func() {
+ // Even if closeNotifier happened first, we
+ // need to keep waiting for our buf so we can
+ // return it to the pool.
+ bufs.Put(<-bufReady)
+ }()
+ return nil, ErrClientDisconnect
+ }
}
// PutBlockHandler is a HandleFunc to address Put block requests.
return
}
- buf := bufs.Get(int(req.ContentLength))
- _, err := io.ReadFull(req.Body, buf)
+ buf, err := getBufferForResponseWriter(resp, bufs, int(req.ContentLength))
+ if err != nil {
+ http.Error(resp, err.Error(), http.StatusServiceUnavailable)
+ return
+ }
+
+ _, err = io.ReadFull(req.Body, buf)
if err != nil {
http.Error(resp, err.Error(), 500)
bufs.Put(buf)
}
}
-// ==============================
// GetBlock and PutBlock implement lower-level code for handling
// blocks by rooting through volumes connected to the local machine.
// Once the handler has determined that system policy permits the
// should be the only part of the code that cares about which volume a
// block is stored on, so it should be responsible for figuring out
// which volume to check for fetching blocks, storing blocks, etc.
-// ==============================
-// GetBlock fetches and returns the block identified by "hash".
-//
-// On success, GetBlock returns a byte slice with the block data, and
-// a nil error.
+// GetBlock fetches the block identified by "hash" into the provided
+// buf, and returns the data size.
//
// If the block cannot be found on any volume, returns NotFoundError.
//
// If the block found does not have the correct MD5 hash, returns
// DiskHashError.
//
-func GetBlock(hash string) ([]byte, error) {
+func GetBlock(hash string, buf []byte, resp http.ResponseWriter) (int, error) {
// Attempt to read the requested hash from a keep volume.
errorToCaller := NotFoundError
for _, vol := range KeepVM.AllReadable() {
- buf, err := vol.Get(hash)
+ size, err := vol.Get(hash, buf)
if err != nil {
// IsNotExist is an expected error and may be
// ignored. All other errors are logged. In
}
// Check the file checksum.
//
- filehash := fmt.Sprintf("%x", md5.Sum(buf))
+ filehash := fmt.Sprintf("%x", md5.Sum(buf[:size]))
if filehash != hash {
// TODO: Try harder to tell a sysadmin about
// this.
log.Printf("%s: checksum mismatch for request %s (actual %s)",
vol, hash, filehash)
errorToCaller = DiskHashError
- bufs.Put(buf)
continue
}
if errorToCaller == DiskHashError {
log.Printf("%s: checksum mismatch for request %s but a good copy was found on another volume and returned",
vol, hash)
}
- return buf, nil
+ return size, nil
}
- return nil, errorToCaller
+ return 0, errorToCaller
}
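// Illustration only (not from this patch): with the new signature the
// caller allocates (or borrows) the buffer and slices it to the
// returned size. (exampleReadBlock is a hypothetical helper.)
func exampleReadBlock(hash string) ([]byte, error) {
	buf := make([]byte, BlockSize)
	size, err := GetBlock(hash, buf, nil)
	if err != nil {
		return nil, err
	}
	return buf[:size], nil // only the first size bytes are block data
}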
// PutBlock Stores the BLOCK (identified by the content id HASH) in Keep.
testableVolumes[1].PutRaw(testHash, testBlock)
// Get should pass
- buf, err := GetBlock(testHash)
+ buf := make([]byte, len(testBlock))
+ n, err := GetBlock(testHash, buf, nil)
if err != nil {
t.Fatalf("Error while getting block %s", err)
}
- if bytes.Compare(buf, testBlock) != 0 {
- t.Errorf("Put succeeded but Get returned %+v, expected %+v", buf, testBlock)
+ if bytes.Compare(buf[:n], testBlock) != 0 {
+ t.Errorf("Put succeeded but Get returned %+v, expected %+v", buf[:n], testBlock)
}
}
testableVolumes[1].PutRaw(testHash, badData)
// Get should fail
- _, err := GetBlock(testHash)
+ buf := make([]byte, BlockSize)
+ size, err := GetBlock(testHash, buf, nil)
if err == nil {
- t.Fatalf("Expected error while getting corrupt block %v", testHash)
+ t.Fatalf("Got %+q, expected error while getting corrupt block %v", buf[:size], testHash)
}
}
}
// Check that PutBlock stored the data as expected
- buf, err := GetBlock(testHash)
+ buf := make([]byte, BlockSize)
+ size, err := GetBlock(testHash, buf, nil)
if err != nil {
t.Fatalf("Error during GetBlock for %q: %s", testHash, err)
- } else if bytes.Compare(buf, testBlock) != 0 {
- t.Errorf("Get response incorrect. Expected %q; found %q", testBlock, buf)
+ } else if bytes.Compare(buf[:size], testBlock) != 0 {
+ t.Errorf("Get response incorrect. Expected %q; found %q", testBlock, buf[:size])
}
}
// Put succeeded and overwrote the badData in one volume,
// and Get should return the testBlock now, ignoring the bad data.
- buf, err := GetBlock(testHash)
+ buf := make([]byte, BlockSize)
+ size, err := GetBlock(testHash, buf, nil)
if err != nil {
t.Fatalf("Error during GetBlock for %q: %s", testHash, err)
- } else if bytes.Compare(buf, testBlock) != 0 {
- t.Errorf("Get response incorrect. Expected %q; found %q", testBlock, buf)
+ } else if bytes.Compare(buf[:size], testBlock) != 0 {
+ t.Errorf("Get response incorrect. Expected %q; found %q", testBlock, buf[:size])
}
}
"bytes"
"flag"
"fmt"
+ "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+ "git.curoverse.com/arvados.git/sdk/go/httpserver"
"git.curoverse.com/arvados.git/sdk/go/keepclient"
"io/ioutil"
"log"
TooLongError = &KeepError{413, "Block is too large"}
MethodDisabledError = &KeepError{405, "Method disabled"}
ErrNotImplemented = &KeepError{500, "Unsupported configuration"}
+ ErrClientDisconnect = &KeepError{503, "Client disconnected"}
)
func (e *KeepError) Error() string {
blobSigningKeyFile string
permissionTTLSec int
pidfile string
+ maxRequests int
)
flag.StringVar(
&dataManagerTokenFile,
"listen",
DefaultAddr,
"Listening address, in the form \"host:port\". e.g., 10.0.1.24:8000. Omit the host part to listen on all interfaces.")
+ flag.IntVar(
+ &maxRequests,
+ "max-requests",
+ 0,
+ "Maximum concurrent requests. When this limit is reached, new requests will receive 503 responses. Note: this limit does not include idle connections from clients using HTTP keepalive, so it does not strictly limit the number of concurrent connections. (default 2 * max-buffers)")
flag.BoolVar(
&neverDelete,
"never-delete",
&permissionTTLSec,
"blob-signature-ttl",
int(time.Duration(2*7*24*time.Hour).Seconds()),
- "Lifetime of blob permission signatures. "+
+ "Lifetime of blob permission signatures. Modifying the ttl will invalidate all existing signatures. "+
"See services/api/config/application.default.yml.")
flag.BoolVar(
&flagSerializeIO,
}
}
+ if maxRequests <= 0 {
+ maxRequests = maxBuffers * 2
+ log.Printf("-max-requests <1 or not specified; defaulting to maxBuffers * 2 == %d", maxRequests)
+ }
+
// Start a round-robin VolumeManager with the volumes we have found.
KeepVM = MakeRRVolumeManager(volumes)
- // Tell the built-in HTTP server to direct all requests to the REST router.
- loggingRouter := MakeLoggingRESTRouter()
- http.HandleFunc("/", func(resp http.ResponseWriter, req *http.Request) {
- loggingRouter.ServeHTTP(resp, req)
+ // Middleware stack: logger, maxRequests limiter, method handlers
+ http.Handle("/", &LoggingRESTRouter{
+ httpserver.NewRequestLimiter(maxRequests,
+ MakeRESTRouter()),
})
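// Illustration only, not the actual httpserver.NewRequestLimiter
// implementation: a limiter of this kind can be built from a buffered
// channel used as a counting semaphore, rejecting requests with 503
// once n requests are already in flight. (limiter and newLimiter are
// hypothetical names used only for this sketch.)
type limiter struct {
	sem     chan struct{}
	handler http.Handler
}

func newLimiter(n int, h http.Handler) http.Handler {
	return &limiter{sem: make(chan struct{}, n), handler: h}
}

func (l *limiter) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	select {
	case l.sem <- struct{}{}: // acquire a slot if one is free
		defer func() { <-l.sem }()
		l.handler.ServeHTTP(w, r)
	default: // all slots busy: fail fast instead of queueing
		http.Error(w, "Server busy", http.StatusServiceUnavailable)
	}
}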
// Set up a TCP listener.
// Initialize Pull queue and worker
keepClient := &keepclient.KeepClient{
- Arvados: nil,
+ Arvados: &arvadosclient.ArvadosClient{},
Want_replicas: 1,
Client: &http.Client{},
}
}
// Check that GetBlock returns success.
- result, err := GetBlock(TestHash)
+ buf := make([]byte, BlockSize)
+ size, err := GetBlock(TestHash, buf, nil)
if err != nil {
t.Errorf("GetBlock error: %s", err)
}
- if fmt.Sprint(result) != fmt.Sprint(TestBlock) {
- t.Errorf("expected %s, got %s", TestBlock, result)
+ if bytes.Compare(buf[:size], TestBlock) != 0 {
+ t.Errorf("got %v, expected %v", buf[:size], TestBlock)
}
}
defer KeepVM.Close()
// Check that GetBlock returns failure.
- result, err := GetBlock(TestHash)
+ buf := make([]byte, BlockSize)
+ size, err := GetBlock(TestHash, buf, nil)
if err != NotFoundError {
- t.Errorf("Expected NotFoundError, got %v", result)
+ t.Errorf("Expected NotFoundError, got %v, err %v", buf[:size], err)
}
}
vols[0].Put(TestHash, BadBlock)
// Check that GetBlock returns failure.
- result, err := GetBlock(TestHash)
+ buf := make([]byte, BlockSize)
+ size, err := GetBlock(TestHash, buf, nil)
if err != DiskHashError {
- t.Errorf("Expected DiskHashError, got %v (buf: %v)", err, result)
+ t.Errorf("Expected DiskHashError, got %v (buf: %v)", err, buf[:size])
}
}
}
vols := KeepVM.AllReadable()
- result, err := vols[1].Get(TestHash)
+ buf := make([]byte, BlockSize)
+ n, err := vols[1].Get(TestHash, buf)
if err != nil {
t.Fatalf("Volume #0 Get returned error: %v", err)
}
- if string(result) != string(TestBlock) {
+ if string(buf[:n]) != string(TestBlock) {
t.Fatalf("PutBlock stored '%s', Get retrieved '%s'",
- string(TestBlock), string(result))
+ string(TestBlock), string(buf[:n]))
}
}
t.Fatalf("PutBlock: n %d err %v", n, err)
}
- result, err := GetBlock(TestHash)
+ buf := make([]byte, BlockSize)
+ size, err := GetBlock(TestHash, buf, nil)
if err != nil {
t.Fatalf("GetBlock: %v", err)
}
- if string(result) != string(TestBlock) {
- t.Error("PutBlock/GetBlock mismatch")
- t.Fatalf("PutBlock stored '%s', GetBlock retrieved '%s'",
- string(TestBlock), string(result))
+ if bytes.Compare(buf[:size], TestBlock) != 0 {
+ t.Fatalf("PutBlock stored %+q, GetBlock retrieved %+q",
+ TestBlock, buf[:size])
}
}
}
// Confirm that GetBlock fails to return anything.
- if result, err := GetBlock(TestHash); err != NotFoundError {
+ if result, err := GetBlock(TestHash, make([]byte, BlockSize), nil); err != NotFoundError {
t.Errorf("GetBlock succeeded after a corrupt block store (result = %s, err = %v)",
string(result), err)
}
}
// The block on disk should now match TestBlock.
- if block, err := GetBlock(TestHash); err != nil {
+ buf := make([]byte, BlockSize)
+ if size, err := GetBlock(TestHash, buf, nil); err != nil {
t.Errorf("GetBlock: %v", err)
- } else if bytes.Compare(block, TestBlock) != 0 {
- t.Errorf("GetBlock returned: '%s'", string(block))
+ } else if bytes.Compare(buf[:size], TestBlock) != 0 {
+ t.Errorf("Got %+q, expected %+q", buf[:size], TestBlock)
}
}
t.Errorf("mtime was changed on vols[0]:\noldMtime = %v\nnewMtime = %v\n",
oldMtime, newMtime)
}
- result, err := vols[1].Get(TestHash)
+ buf := make([]byte, BlockSize)
+ n, err := vols[1].Get(TestHash, buf)
if err != nil {
t.Fatalf("vols[1]: %v", err)
}
- if bytes.Compare(result, TestBlock) != 0 {
- t.Errorf("new block does not match test block\nnew block = %v\n", result)
+ if bytes.Compare(buf[:n], TestBlock) != 0 {
+ t.Errorf("new block does not match test block\nnew block = %v\n", buf[:n])
}
}
// LoggingResponseWriter
import (
- "github.com/gorilla/mux"
"log"
"net/http"
"strings"
sentHdr time.Time
}
+// CloseNotify implements http.CloseNotifier.
+func (resp *LoggingResponseWriter) CloseNotify() <-chan bool {
+ wrapped, ok := resp.ResponseWriter.(http.CloseNotifier)
+ if !ok {
+ // If upstream doesn't implement CloseNotifier, we can
+ // satisfy the interface by returning a channel that
+ // never sends anything (the interface doesn't
+ // guarantee that anything will ever be sent on the
+ // channel even if the client disconnects).
+ return nil
+ }
+ return wrapped.CloseNotify()
+}
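// Illustration only (not from this patch): returning a nil channel
// works because a receive from a nil channel blocks forever, so a
// select that includes it simply never takes that branch.
// (waitOrDisconnect is a hypothetical name used only for this sketch.)
func waitOrDisconnect(bufReady <-chan []byte, closeNotifier <-chan bool) ([]byte, bool) {
	select {
	case buf := <-bufReady:
		return buf, true
	case <-closeNotifier: // never fires when closeNotifier is nil
		return nil, false
	}
}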
+
// WriteHeader writes header to ResponseWriter
-func (loggingWriter *LoggingResponseWriter) WriteHeader(code int) {
- if loggingWriter.sentHdr == zeroTime {
- loggingWriter.sentHdr = time.Now()
+func (resp *LoggingResponseWriter) WriteHeader(code int) {
+ if resp.sentHdr == zeroTime {
+ resp.sentHdr = time.Now()
}
- loggingWriter.Status = code
- loggingWriter.ResponseWriter.WriteHeader(code)
+ resp.Status = code
+ resp.ResponseWriter.WriteHeader(code)
}
var zeroTime time.Time
-func (loggingWriter *LoggingResponseWriter) Write(data []byte) (int, error) {
- if loggingWriter.Length == 0 && len(data) > 0 && loggingWriter.sentHdr == zeroTime {
- loggingWriter.sentHdr = time.Now()
+func (resp *LoggingResponseWriter) Write(data []byte) (int, error) {
+ if resp.Length == 0 && len(data) > 0 && resp.sentHdr == zeroTime {
+ resp.sentHdr = time.Now()
}
- loggingWriter.Length += len(data)
- if loggingWriter.Status >= 400 {
- loggingWriter.ResponseBody += string(data)
+ resp.Length += len(data)
+ if resp.Status >= 400 {
+ resp.ResponseBody += string(data)
}
- return loggingWriter.ResponseWriter.Write(data)
+ return resp.ResponseWriter.Write(data)
}
// LoggingRESTRouter is used to add logging capabilities to mux.Router
type LoggingRESTRouter struct {
- router *mux.Router
-}
-
-// MakeLoggingRESTRouter initializes LoggingRESTRouter
-func MakeLoggingRESTRouter() *LoggingRESTRouter {
- router := MakeRESTRouter()
- return (&LoggingRESTRouter{router})
+ router http.Handler
}
-func (loggingRouter *LoggingRESTRouter) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+func (loggingRouter *LoggingRESTRouter) ServeHTTP(wrappedResp http.ResponseWriter, req *http.Request) {
t0 := time.Now()
- loggingWriter := LoggingResponseWriter{http.StatusOK, 0, resp, "", zeroTime}
- loggingRouter.router.ServeHTTP(&loggingWriter, req)
- statusText := http.StatusText(loggingWriter.Status)
- if loggingWriter.Status >= 400 {
- statusText = strings.Replace(loggingWriter.ResponseBody, "\n", "", -1)
+ resp := LoggingResponseWriter{http.StatusOK, 0, wrappedResp, "", zeroTime}
+ loggingRouter.router.ServeHTTP(&resp, req)
+ statusText := http.StatusText(resp.Status)
+ if resp.Status >= 400 {
+ statusText = strings.Replace(resp.ResponseBody, "\n", "", -1)
}
now := time.Now()
tTotal := now.Sub(t0)
- tLatency := loggingWriter.sentHdr.Sub(t0)
- tResponse := now.Sub(loggingWriter.sentHdr)
- log.Printf("[%s] %s %s %d %.6fs %.6fs %.6fs %d %d \"%s\"", req.RemoteAddr, req.Method, req.URL.Path[1:], req.ContentLength, tTotal.Seconds(), tLatency.Seconds(), tResponse.Seconds(), loggingWriter.Status, loggingWriter.Length, statusText)
+ tLatency := resp.sentHdr.Sub(t0)
+ tResponse := now.Sub(resp.sentHdr)
+ log.Printf("[%s] %s %s %d %.6fs %.6fs %.6fs %d %d \"%s\"", req.RemoteAddr, req.Method, req.URL.Path[1:], req.ContentLength, tTotal.Seconds(), tLatency.Seconds(), tResponse.Seconds(), resp.Status, resp.Length, statusText)
}
--- /dev/null
+package main
+
+import (
+ "net/http"
+ "testing"
+)
+
+func TestLoggingResponseWriterImplementsCloseNotifier(t *testing.T) {
+ http.ResponseWriter(&LoggingResponseWriter{}).(http.CloseNotifier).CloseNotify()
+}
// SignLocator takes a blobLocator, an apiToken and an expiry time, and
// returns a signed locator string.
func SignLocator(blobLocator, apiToken string, expiry time.Time) string {
- return keepclient.SignLocator(blobLocator, apiToken, expiry, PermissionSecret)
+ return keepclient.SignLocator(blobLocator, apiToken, expiry, blobSignatureTTL, PermissionSecret)
}
// VerifySignature returns nil if the signature on the signedLocator
// something the client could have figured out independently) or
// PermissionError.
func VerifySignature(signedLocator, apiToken string) error {
- err := keepclient.VerifySignature(signedLocator, apiToken, PermissionSecret)
+ err := keepclient.VerifySignature(signedLocator, apiToken, blobSignatureTTL, PermissionSecret)
if err == keepclient.ErrSignatureExpired {
return ExpiredError
} else if err != nil {
"gokee3eamvjy8qq1fvy238838enjmy5wzy2md7yvsitp5vztft6j4q866efym7e6" +
"vu5wm9fpnwjyxfldw3vbo01mgjs75rgo7qioh8z8ij7jpyp8508okhgbbex3ceei" +
"786u5rw2a9gx743dj3fgq2irk"
- knownSignature = "257f3f5f5f0a4e4626a18fc74bd42ec34dcb228a"
+ knownSignatureTTL = 1209600 * time.Second
+ knownSignature = "89118b78732c33104a4d6231e8b5a5fa1e4301e3"
knownTimestamp = "7fffffff"
knownSigHint = "+A" + knownSignature + "@" + knownTimestamp
knownSignedLocator = knownLocator + knownSigHint
}
t0 := time.Unix(tsInt, 0)
+ blobSignatureTTL = knownSignatureTTL
+
PermissionSecret = []byte(knownKey)
if x := SignLocator(knownLocator, knownToken, t0); x != knownSignedLocator {
t.Fatalf("Got %+q, expected %+q", x, knownSignedLocator)
PermissionSecret = b
}(PermissionSecret)
+ blobSignatureTTL = knownSignatureTTL
+
PermissionSecret = []byte(knownKey)
if err := VerifySignature(knownSignedLocator, knownToken); err != nil {
t.Fatal(err)
"net/http"
"os"
"regexp"
+ "strings"
"time"
"github.com/AdRoll/goamz/aws"
return nil
}
-func (v *S3Volume) Get(loc string) ([]byte, error) {
+func (v *S3Volume) Get(loc string, buf []byte) (int, error) {
rdr, err := v.Bucket.GetReader(loc)
if err != nil {
- return nil, v.translateError(err)
+ return 0, v.translateError(err)
}
defer rdr.Close()
- buf := bufs.Get(BlockSize)
n, err := io.ReadFull(rdr, buf)
switch err {
case nil, io.EOF, io.ErrUnexpectedEOF:
- return buf[:n], nil
+ return n, nil
default:
- bufs.Put(buf)
- return nil, v.translateError(err)
+ return 0, v.translateError(err)
}
}
func (v *S3Volume) translateError(err error) error {
switch err := err.(type) {
case *s3.Error:
- if err.StatusCode == http.StatusNotFound && err.Code == "NoSuchKey" {
+ if (err.StatusCode == http.StatusNotFound && err.Code == "NoSuchKey") ||
+ strings.Contains(err.Error(), "Not Found") {
return os.ErrNotExist
}
// Other 404 errors like NoSuchVersion and
}
/* Allow default Trash Life time to be used. Thus, the newly created block
- will not be deleted becuase its Mtime is within the trash life time.
+ will not be deleted because its Mtime is within the trash life time.
*/
func TestTrashWorkerIntegration_SameLocatorInTwoVolumesWithDefaultTrashLifeTime(t *testing.T) {
neverDelete = false
expectEqualWithin(t, time.Second, 0, func() interface{} { return trashq.Status().InProgress })
// Verify Locator1 to be un/deleted as expected
- data, _ := GetBlock(testData.Locator1)
+ buf := make([]byte, BlockSize)
+ size, err := GetBlock(testData.Locator1, buf, nil)
if testData.ExpectLocator1 {
- if len(data) == 0 {
+ if size == 0 || err != nil {
t.Errorf("Expected Locator1 to be still present: %s", testData.Locator1)
}
} else {
- if len(data) > 0 {
+ if size > 0 || err == nil {
t.Errorf("Expected Locator1 to be deleted: %s", testData.Locator1)
}
}
// Verify Locator2 to be un/deleted as expected
if testData.Locator1 != testData.Locator2 {
- data, _ = GetBlock(testData.Locator2)
+ size, err = GetBlock(testData.Locator2, buf, nil)
if testData.ExpectLocator2 {
- if len(data) == 0 {
+ if size == 0 || err != nil {
t.Errorf("Expected Locator2 to be still present: %s", testData.Locator2)
}
} else {
- if len(data) > 0 {
+ if size > 0 || err == nil {
t.Errorf("Expected Locator2 to be deleted: %s", testData.Locator2)
}
}
if testData.DifferentMtimes {
locatorFoundIn := 0
for _, volume := range KeepVM.AllReadable() {
- if _, err := volume.Get(testData.Locator1); err == nil {
+ buf := make([]byte, BlockSize)
+ if _, err := volume.Get(testData.Locator1, buf); err == nil {
locatorFoundIn = locatorFoundIn + 1
}
}
// for example, a single mounted disk, a RAID array, an Amazon S3 volume,
// etc.
type Volume interface {
- // Get a block. IFF the returned error is nil, the caller must
- // put the returned slice back into the buffer pool when it's
- // finished with it. (Otherwise, the buffer pool will be
- // depleted and eventually -- when all available buffers are
- // used and not returned -- operations will reach deadlock.)
+ // Get a block: copy the block data into buf, and return the
+ // number of bytes copied.
//
// loc is guaranteed to consist of 32 or more lowercase hex
// digits.
//
- // Get should not verify the integrity of the returned data:
- // it should just return whatever was found in its backing
+ // Get should not verify the integrity of the data: it should
+ // just return whatever was found in its backing
// store. (Integrity checking is the caller's responsibility.)
//
// If an error is encountered that prevents it from
// access log if the block is not found on any other volumes
// either).
//
- // If the data in the backing store is bigger than BlockSize,
- // Get is permitted to return an error without reading any of
- // the data.
- Get(loc string) ([]byte, error)
+ // If the data in the backing store is bigger than len(buf),
+ // then Get is permitted to return an error without reading
+ // any of the data.
+ //
+ // len(buf) will not exceed BlockSize.
+ Get(loc string, buf []byte) (int, error)
// Compare the given data with the stored data (i.e., what Get
// would return). If equal, return nil. If not, return
v.PutRaw(TestHash, TestBlock)
- buf, err := v.Get(TestHash)
+ buf := make([]byte, BlockSize)
+ n, err := v.Get(TestHash, buf)
if err != nil {
t.Fatal(err)
}
- bufs.Put(buf)
-
- if bytes.Compare(buf, TestBlock) != 0 {
+ if bytes.Compare(buf[:n], TestBlock) != 0 {
t.Errorf("expected %s, got %s", string(TestBlock), string(buf))
}
}
v := factory(t)
defer v.Teardown()
- if _, err := v.Get(TestHash2); err == nil {
+ buf := make([]byte, BlockSize)
+ if _, err := v.Get(TestHash2, buf); err == nil {
t.Errorf("Expected error while getting non-existing block %v", TestHash2)
}
}
v.PutRaw(testHash, testDataA)
putErr := v.Put(testHash, testDataB)
- buf, getErr := v.Get(testHash)
+ buf := make([]byte, BlockSize)
+ n, getErr := v.Get(testHash, buf)
if putErr == nil {
// Put must not return a nil error unless it has
// overwritten the existing data.
- if bytes.Compare(buf, testDataB) != 0 {
- t.Errorf("Put succeeded but Get returned %+q, expected %+q", buf, testDataB)
+ if bytes.Compare(buf[:n], testDataB) != 0 {
+ t.Errorf("Put succeeded but Get returned %+q, expected %+q", buf[:n], testDataB)
}
} else {
// It is permissible for Put to fail, but it must
// leave us with either the original data, the new
// data, or nothing at all.
- if getErr == nil && bytes.Compare(buf, testDataA) != 0 && bytes.Compare(buf, testDataB) != 0 {
- t.Errorf("Put failed but Get returned %+q, which is neither %+q nor %+q", buf, testDataA, testDataB)
+ if getErr == nil && bytes.Compare(buf[:n], testDataA) != 0 && bytes.Compare(buf[:n], testDataB) != 0 {
+ t.Errorf("Put failed but Get returned %+q, which is neither %+q nor %+q", buf[:n], testDataA, testDataB)
}
}
- if getErr == nil {
- bufs.Put(buf)
- }
}
// Put and get multiple blocks
t.Errorf("Got err putting block %q: %q, expected nil", TestBlock3, err)
}
- data, err := v.Get(TestHash)
+ data := make([]byte, BlockSize)
+ n, err := v.Get(TestHash, data)
if err != nil {
t.Error(err)
} else {
- if bytes.Compare(data, TestBlock) != 0 {
- t.Errorf("Block present, but got %+q, expected %+q", data, TestBlock)
+ if bytes.Compare(data[:n], TestBlock) != 0 {
+ t.Errorf("Block present, but got %+q, expected %+q", data[:n], TestBlock)
}
- bufs.Put(data)
}
- data, err = v.Get(TestHash2)
+ n, err = v.Get(TestHash2, data)
if err != nil {
t.Error(err)
} else {
- if bytes.Compare(data, TestBlock2) != 0 {
- t.Errorf("Block present, but got %+q, expected %+q", data, TestBlock2)
+ if bytes.Compare(data[:n], TestBlock2) != 0 {
+ t.Errorf("Block present, but got %+q, expected %+q", data[:n], TestBlock2)
}
- bufs.Put(data)
}
- data, err = v.Get(TestHash3)
+ n, err = v.Get(TestHash3, data)
if err != nil {
t.Error(err)
} else {
- if bytes.Compare(data, TestBlock3) != 0 {
- t.Errorf("Block present, but to %+q, expected %+q", data, TestBlock3)
+ if bytes.Compare(data[:n], TestBlock3) != 0 {
+ t.Errorf("Block present, but to %+q, expected %+q", data[:n], TestBlock3)
}
- bufs.Put(data)
}
}
if err := v.Trash(TestHash); err != nil {
t.Error(err)
}
- data, err := v.Get(TestHash)
+ data := make([]byte, BlockSize)
+ n, err := v.Get(TestHash, data)
if err != nil {
t.Error(err)
- } else {
- if bytes.Compare(data, TestBlock) != 0 {
- t.Errorf("Got data %+q, expected %+q", data, TestBlock)
- }
- bufs.Put(data)
+ } else if bytes.Compare(data[:n], TestBlock) != 0 {
+ t.Errorf("Got data %+q, expected %+q", data[:n], TestBlock)
}
}
if err := v.Trash(TestHash); err != nil {
t.Error(err)
}
- if _, err := v.Get(TestHash); err == nil || !os.IsNotExist(err) {
+ data := make([]byte, BlockSize)
+ if _, err := v.Get(TestHash, data); err == nil || !os.IsNotExist(err) {
t.Errorf("os.IsNotExist(%v) should have been true", err)
}
+
+ _, err := v.Mtime(TestHash)
+ if err == nil || !os.IsNotExist(err) {
+ t.Fatalf("os.IsNotExist(%v) should have been true", err)
+ }
+
+ err = v.Compare(TestHash, TestBlock)
+ if err == nil || !os.IsNotExist(err) {
+ t.Fatalf("os.IsNotExist(%v) should have been true", err)
+ }
+
+ indexBuf := new(bytes.Buffer)
+ v.IndexTo("", indexBuf)
+ if strings.Contains(string(indexBuf.Bytes()), TestHash) {
+ t.Fatalf("Found trashed block in IndexTo")
+ }
+
+ err = v.Touch(TestHash)
+ if err == nil || !os.IsNotExist(err) {
+ t.Fatalf("os.IsNotExist(%v) should have been true", err)
+ }
}
// Calling Delete() for a block that does not exist should result in error.
}
v.PutRaw(TestHash, TestBlock)
+ buf := make([]byte, BlockSize)
// Get from read-only volume should succeed
- _, err := v.Get(TestHash)
+ _, err := v.Get(TestHash, buf)
if err != nil {
t.Errorf("got err %v, expected nil", err)
}
if err == nil {
t.Errorf("Expected error when putting block in a read-only volume")
}
- _, err = v.Get(TestHash2)
+ _, err = v.Get(TestHash2, buf)
if err == nil {
t.Errorf("Expected error when getting block whose put in read-only volume failed")
}
v.PutRaw(TestHash3, TestBlock3)
sem := make(chan int)
- go func(sem chan int) {
- buf, err := v.Get(TestHash)
+ go func() {
+ buf := make([]byte, BlockSize)
+ n, err := v.Get(TestHash, buf)
if err != nil {
t.Errorf("err1: %v", err)
}
- bufs.Put(buf)
- if bytes.Compare(buf, TestBlock) != 0 {
- t.Errorf("buf should be %s, is %s", string(TestBlock), string(buf))
+ if bytes.Compare(buf[:n], TestBlock) != 0 {
+ t.Errorf("buf should be %s, is %s", string(TestBlock), string(buf[:n]))
}
sem <- 1
- }(sem)
+ }()
- go func(sem chan int) {
- buf, err := v.Get(TestHash2)
+ go func() {
+ buf := make([]byte, BlockSize)
+ n, err := v.Get(TestHash2, buf)
if err != nil {
t.Errorf("err2: %v", err)
}
- bufs.Put(buf)
- if bytes.Compare(buf, TestBlock2) != 0 {
- t.Errorf("buf should be %s, is %s", string(TestBlock2), string(buf))
+ if bytes.Compare(buf[:n], TestBlock2) != 0 {
+ t.Errorf("buf should be %s, is %s", string(TestBlock2), string(buf[:n]))
}
sem <- 1
- }(sem)
+ }()
- go func(sem chan int) {
- buf, err := v.Get(TestHash3)
+ go func() {
+ buf := make([]byte, BlockSize)
+ n, err := v.Get(TestHash3, buf)
if err != nil {
t.Errorf("err3: %v", err)
}
- bufs.Put(buf)
- if bytes.Compare(buf, TestBlock3) != 0 {
- t.Errorf("buf should be %s, is %s", string(TestBlock3), string(buf))
+ if bytes.Compare(buf[:n], TestBlock3) != 0 {
+ t.Errorf("buf should be %s, is %s", string(TestBlock3), string(buf[:n]))
}
sem <- 1
- }(sem)
+ }()
// Wait for all goroutines to finish
- for done := 0; done < 3; {
- done += <-sem
+ for done := 0; done < 3; done++ {
+ <-sem
}
}
}(sem)
// Wait for all goroutines to finish
- for done := 0; done < 3; {
- done += <-sem
+ for done := 0; done < 3; done++ {
+ <-sem
}
// Double check that we actually wrote the blocks we expected to write.
- buf, err := v.Get(TestHash)
+ buf := make([]byte, BlockSize)
+ n, err := v.Get(TestHash, buf)
if err != nil {
t.Errorf("Get #1: %v", err)
}
- bufs.Put(buf)
- if bytes.Compare(buf, TestBlock) != 0 {
- t.Errorf("Get #1: expected %s, got %s", string(TestBlock), string(buf))
+ if bytes.Compare(buf[:n], TestBlock) != 0 {
+ t.Errorf("Get #1: expected %s, got %s", string(TestBlock), string(buf[:n]))
}
- buf, err = v.Get(TestHash2)
+ n, err = v.Get(TestHash2, buf)
if err != nil {
t.Errorf("Get #2: %v", err)
}
- bufs.Put(buf)
- if bytes.Compare(buf, TestBlock2) != 0 {
- t.Errorf("Get #2: expected %s, got %s", string(TestBlock2), string(buf))
+ if bytes.Compare(buf[:n], TestBlock2) != 0 {
+ t.Errorf("Get #2: expected %s, got %s", string(TestBlock2), string(buf[:n]))
}
- buf, err = v.Get(TestHash3)
+ n, err = v.Get(TestHash3, buf)
if err != nil {
t.Errorf("Get #3: %v", err)
}
- bufs.Put(buf)
- if bytes.Compare(buf, TestBlock3) != 0 {
- t.Errorf("Get #3: expected %s, got %s", string(TestBlock3), string(buf))
+ if bytes.Compare(buf[:n], TestBlock3) != 0 {
+ t.Errorf("Get #3: expected %s, got %s", string(TestBlock3), string(buf[:n]))
}
}
if err != nil {
t.Fatal(err)
}
- rdata, err := v.Get(hash)
+ buf := make([]byte, BlockSize)
+ n, err := v.Get(hash, buf)
if err != nil {
t.Error(err)
- } else {
- defer bufs.Put(rdata)
}
- if bytes.Compare(rdata, wdata) != 0 {
- t.Error("rdata != wdata")
+ if bytes.Compare(buf[:n], wdata) != 0 {
+ t.Error("buf %+q != wdata %+q", buf[:n], wdata)
}
}
v.PutRaw(TestHash, TestBlock)
v.TouchWithDate(TestHash, time.Now().Add(-2*blobSignatureTTL))
- buf, err := v.Get(TestHash)
+ buf := make([]byte, BlockSize)
+ n, err := v.Get(TestHash, buf)
if err != nil {
t.Fatal(err)
}
- if bytes.Compare(buf, TestBlock) != 0 {
- t.Errorf("Got data %+q, expected %+q", buf, TestBlock)
+ if bytes.Compare(buf[:n], TestBlock) != 0 {
+ t.Errorf("Got data %+q, expected %+q", buf[:n], TestBlock)
}
- bufs.Put(buf)
// Trash
err = v.Trash(TestHash)
if v.Writable() == false {
if err != MethodDisabledError {
- t.Error(err)
+ t.Fatal(err)
}
} else if err != nil {
if err != ErrNotImplemented {
- t.Error(err)
+ t.Fatal(err)
}
} else {
- _, err = v.Get(TestHash)
+ _, err = v.Get(TestHash, buf)
if err == nil || !os.IsNotExist(err) {
t.Errorf("os.IsNotExist(%v) should have been true", err)
}
}
// Get the block - after trash and untrash sequence
- buf, err = v.Get(TestHash)
+ n, err = v.Get(TestHash, buf)
if err != nil {
t.Fatal(err)
}
- if bytes.Compare(buf, TestBlock) != 0 {
- t.Errorf("Got data %+q, expected %+q", buf, TestBlock)
+ if bytes.Compare(buf[:n], TestBlock) != 0 {
+ t.Errorf("Got data %+q, expected %+q", buf[:n], TestBlock)
}
- bufs.Put(buf)
}
func testTrashEmptyTrashUntrash(t TB, factory TestableVolumeFactory) {
}(trashLifetime)
checkGet := func() error {
- buf, err := v.Get(TestHash)
+ buf := make([]byte, BlockSize)
+ n, err := v.Get(TestHash, buf)
if err != nil {
return err
}
- if bytes.Compare(buf, TestBlock) != 0 {
- t.Fatalf("Got data %+q, expected %+q", buf, TestBlock)
+ if bytes.Compare(buf[:n], TestBlock) != 0 {
+ t.Fatalf("Got data %+q, expected %+q", buf[:n], TestBlock)
+ }
+
+ _, err = v.Mtime(TestHash)
+ if err != nil {
+ return err
}
- bufs.Put(buf)
+
+ err = v.Compare(TestHash, TestBlock)
+ if err != nil {
+ return err
+ }
+
+ indexBuf := new(bytes.Buffer)
+ v.IndexTo("", indexBuf)
+ if !strings.Contains(string(indexBuf.Bytes()), TestHash) {
+ return os.ErrNotExist
+ }
+
return nil
}
t.Fatal(err)
}
+ // Trash the block
err = v.Trash(TestHash)
if err == MethodDisabledError || err == ErrNotImplemented {
// Skip the trash tests for read-only volumes, and
t.Fatalf("os.IsNotExist(%v) should have been true", err)
}
+ err = v.Touch(TestHash)
+ if err == nil || !os.IsNotExist(err) {
+ t.Fatalf("os.IsNotExist(%v) should have been true", err)
+ }
+
v.EmptyTrash()
// Even after emptying the trash, we can untrash our block
if err != nil {
t.Fatal(err)
}
+
err = checkGet()
if err != nil {
t.Fatal(err)
}
+ err = v.Touch(TestHash)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Because we Touch'ed the block above, backdate it again for the next set of tests
+ v.TouchWithDate(TestHash, time.Now().Add(-2*blobSignatureTTL))
+
// Untrash should fail if the only block in the trash has
// already been untrashed.
err = v.Untrash(TestHash)
// Trash it again, and this time call EmptyTrash so it really
// goes away.
+ // (In Azure volumes, un/trash changes Mtime, so first backdate again)
+ v.TouchWithDate(TestHash, time.Now().Add(-2*blobSignatureTTL))
err = v.Trash(TestHash)
err = checkGet()
if err == nil || !os.IsNotExist(err) {
- t.Errorf("os.IsNotExist(%v) should have been true", err)
+ t.Fatalf("os.IsNotExist(%v) should have been true", err)
}
+ // Empty the trash
v.EmptyTrash()
// Untrash won't find it
}
}
-func (v *MockVolume) Get(loc string) ([]byte, error) {
+func (v *MockVolume) Get(loc string, buf []byte) (int, error) {
v.gotCall("Get")
<-v.Gate
if v.Bad {
- return nil, errors.New("Bad volume")
+ return 0, errors.New("Bad volume")
} else if block, ok := v.Store[loc]; ok {
- buf := bufs.Get(len(block))
- copy(buf, block)
- return buf, nil
+ copy(buf[:len(block)], block)
+ return len(block), nil
}
- return nil, os.ErrNotExist
+ return 0, os.ErrNotExist
}
func (v *MockVolume) Put(loc string, block []byte) error {
return stat, err
}
-// Get retrieves a block identified by the locator string "loc", and
-// returns its contents as a byte slice.
-//
-// Get returns a nil buffer IFF it returns a non-nil error.
-func (v *UnixVolume) Get(loc string) ([]byte, error) {
+// Get retrieves a block, copies it to the given slice, and returns
+// the number of bytes copied.
+func (v *UnixVolume) Get(loc string, buf []byte) (int, error) {
path := v.blockPath(loc)
stat, err := v.stat(path)
if err != nil {
- return nil, v.translateError(err)
+ return 0, v.translateError(err)
+ }
+ if stat.Size() > int64(len(buf)) {
+ return 0, TooLongError
}
- buf := bufs.Get(int(stat.Size()))
+ var read int
+ size := int(stat.Size())
err = v.getFunc(path, func(rdr io.Reader) error {
- _, err = io.ReadFull(rdr, buf)
+ read, err = io.ReadFull(rdr, buf[:size])
return err
})
- if err != nil {
- bufs.Put(buf)
- return nil, err
- }
- return buf, nil
+ return read, err
}
// Compare returns nil if Get(loc) would return the same content as
}
}
-var trashLocRegexp = regexp.MustCompile(`/([0-9a-f]{32})\.trash\.(\d+)$`)
+var unixTrashLocRegexp = regexp.MustCompile(`/([0-9a-f]{32})\.trash\.(\d+)$`)
// EmptyTrash walks hierarchy looking for {hash}.trash.*
// and deletes those with deadline < now.
if info.Mode().IsDir() {
return nil
}
- matches := trashLocRegexp.FindStringSubmatch(path)
+ matches := unixTrashLocRegexp.FindStringSubmatch(path)
if len(matches) != 3 {
return nil
}
defer v.Teardown()
v.Put(TestHash, TestBlock)
- buf, err := v.Get(TestHash2)
+ buf := make([]byte, BlockSize)
+ n, err := v.Get(TestHash2, buf)
switch {
case os.IsNotExist(err):
break
case err == nil:
- t.Errorf("Read should have failed, returned %s", string(buf))
+ t.Errorf("Read should have failed, returned %+q", buf[:n])
default:
t.Errorf("Read expected ErrNotExist, got: %s", err)
}
v.PutRaw(TestHash, TestBlock)
- _, err := v.Get(TestHash)
+ buf := make([]byte, BlockSize)
+ _, err := v.Get(TestHash, buf)
if err != nil {
t.Errorf("got err %v, expected nil", err)
}
import errno
import logging
import os
+import signal
+import time
import threading
import traceback
if (exception_type in (threading.ThreadError, MemoryError) or
exception_type is OSError and exception_value.errno == errno.ENOMEM):
lg.critical("Unhandled exception is a fatal error, killing Node Manager")
- os.killpg(os.getpgid(0), 9)
+ os.kill(os.getpid(), signal.SIGKILL)
+
+ def ping(self):
+ return True
+
+
+class WatchdogActor(pykka.ThreadingActor):
+ def __init__(self, timeout, *args, **kwargs):
+ super(pykka.ThreadingActor, self).__init__(*args, **kwargs)
+ self.timeout = timeout
+ self.actors = [a.proxy() for a in args]
+ self.actor_ref = TellableActorRef(self)
+ self._later = self.actor_ref.tell_proxy()
+
+ def kill_self(self, e, act):
+ lg = getattr(self, "_logger", logging)
+ lg.critical("Watchdog exception", exc_info=e)
+ lg.critical("Actor %s watchdog ping time out, killing Node Manager", act)
+ os.kill(os.getpid(), signal.SIGKILL)
+
+ def on_start(self):
+ self._later.run()
+
+ def run(self):
+ a = None
+ try:
+ for a in self.actors:
+ a.ping().get(self.timeout)
+ time.sleep(20)
+ self._later.run()
+ except Exception as e:
+ self.kill_self(e, a)
arvados_node_missing, RetryMixin
from ...clientactor import _notify_subscribers
from ... import config
+from .transitions import transitions
class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
"""Base class for actors that change a compute node's state.
self._logger.info("Shutdown cancelled: %s.", reason)
self._finished(success_flag=False)
- def _stop_if_window_closed(orig_func):
+ def _cancel_on_exception(orig_func):
@functools.wraps(orig_func)
- def stop_wrapper(self, *args, **kwargs):
- if (self.cancellable and
- (self._monitor.shutdown_eligible().get() is not True)):
- self._later.cancel_shutdown(self.WINDOW_CLOSED)
- return None
- else:
+ def finish_wrapper(self, *args, **kwargs):
+ try:
return orig_func(self, *args, **kwargs)
- return stop_wrapper
+ except Exception as error:
+ self._logger.error("Actor error %s", error)
+ self._later.cancel_shutdown("Unhandled exception %s" % error)
+ return finish_wrapper
- @ComputeNodeStateChangeBase._finish_on_exception
- @_stop_if_window_closed
+ @_cancel_on_exception
@RetryMixin._retry()
def shutdown_node(self):
self._logger.info("Starting shutdown")
+ arv_node = self._arvados_node()
if not self._cloud.destroy_node(self.cloud_node):
if self._cloud.broken(self.cloud_node):
self._later.cancel_shutdown(self.NODE_BROKEN)
# Force a retry.
raise cloud_types.LibcloudError("destroy_node failed")
self._logger.info("Shutdown success")
- arv_node = self._arvados_node()
if arv_node is None:
self._finished(success_flag=True)
else:
self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
self._finished(success_flag=True)
- # Make the decorator available to subclasses.
- _stop_if_window_closed = staticmethod(_stop_if_window_closed)
-
class ComputeNodeUpdateActor(config.actor_class):
"""Actor to dispatch one-off cloud management requests.
this to perform maintenance tasks on themselves. Having a
dedicated actor for this gives us the opportunity to control the
flow of requests; e.g., by backing off when errors occur.
-
- This actor is most like a "traditional" Pykka actor: there's no
- subscribing, but instead methods return real driver results. If
- you're interested in those results, you should get them from the
- Future that the proxy method returns. Be prepared to handle exceptions
- from the cloud driver when you do.
"""
def __init__(self, cloud_factory, max_retry_wait=180):
super(ComputeNodeUpdateActor, self).__init__()
self.error_streak = 0
self.next_request_time = time.time()
+ def _set_logger(self):
+ self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
+
+ def on_start(self):
+ self._set_logger()
+
def _throttle_errors(orig_func):
@functools.wraps(orig_func)
def throttle_wrapper(self, *args, **kwargs):
try:
result = orig_func(self, *args, **kwargs)
except Exception as error:
- self.error_streak += 1
- self.next_request_time += min(2 ** self.error_streak,
- self.max_retry_wait)
- raise
+ if self._cloud.is_cloud_exception(error):
+ self.error_streak += 1
+ self.next_request_time += min(2 ** self.error_streak,
+ self.max_retry_wait)
+ self._logger.warn(
+ "Unhandled exception: %s", error, exc_info=error)
else:
self.error_streak = 0
return result
self._last_log = msg
self._logger.debug(msg, *args)
- def in_state(self, *states):
- # Return a boolean to say whether or not our Arvados node record is in
- # one of the given states. If state information is not
- # available--because this node has no Arvados record, the record is
- # stale, or the record has no state information--return None.
- if (self.arvados_node is None) or not timestamp_fresh(
- arvados_node_mtime(self.arvados_node), self.node_stale_after):
- return None
+ def get_state(self):
+ """Get node state, one of ['unpaired', 'busy', 'idle', 'down']."""
+
+ # If this node is not associated with an Arvados node, return 'unpaired'.
+ if self.arvados_node is None:
+ return 'unpaired'
+
state = self.arvados_node['crunch_worker_state']
- if not state:
- return None
- result = state in states
- if state == 'idle':
- result = result and not self.arvados_node['job_uuid']
- return result
+
+ # If state information is not available because it is missing or the
+ # record is stale, return 'down'.
+ if not state or not timestamp_fresh(arvados_node_mtime(self.arvados_node),
+ self.node_stale_after):
+ state = 'down'
+
+ # There's a window between when a node pings for the first time and
+ # when crunch-dispatch synchronizes the value of 'slurm_state'. In
+ # this window, the node will still report as 'down'. Check that
+ # first_ping_at is truthy and consider the node 'idle' during the
+ # initial boot grace period.
+ if (state == 'down' and
+ self.arvados_node['first_ping_at'] and
+ timestamp_fresh(self.cloud_node_start_time,
+ self.boot_fail_after) and
+ not self._cloud.broken(self.cloud_node)):
+ state = 'idle'
+
+ # "missing" means last_ping_at is stale, this should be
+ # considered "down"
+ if arvados_node_missing(self.arvados_node, self.node_stale_after):
+ state = 'down'
+
+ # Turns out using 'job_uuid' this way is a bad idea. The node record
+ # is assigned the job_uuid before the job is locked (which removes it
+ # from the queue), which means the job is double-counted: it is still
+ # in the wishlist but it is also keeping a node busy. The end result
+ # is excess nodes being booted.
+ #if state == 'idle' and self.arvados_node['job_uuid']:
+ # state = 'busy'
+
+ return state
+
+ def in_state(self, *states):
+ return self.get_state() in states
def shutdown_eligible(self):
- """Return True if eligible for shutdown, or a string explaining why the node
- is not eligible for shutdown."""
+ """Determine if node is candidate for shut down.
- if not self._shutdowns.window_open():
- return "shutdown window is not open."
- if self.arvados_node is None:
- # Node is unpaired.
- # If it hasn't pinged Arvados after boot_fail seconds, shut it down
- if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
- return "node is still booting, will be considered a failed boot at %s" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.cloud_node_start_time + self.boot_fail_after))
- else:
- return True
- missing = arvados_node_missing(self.arvados_node, self.node_stale_after)
- if missing and self._cloud.broken(self.cloud_node):
- # Node is paired, but Arvados says it is missing and the cloud says the node
- # is in an error state, so shut it down.
- return True
- if missing is None and self._cloud.broken(self.cloud_node):
- self._logger.info(
- "Cloud node considered 'broken' but paired node %s last_ping_at is None, " +
- "cannot check node_stale_after (node may be shut down and we just haven't gotten the message yet).",
- self.arvados_node['uuid'])
- if self.in_state('idle'):
- return True
+ Returns a tuple of (boolean, string) where the first value is whether
+ the node is a candidate for shutdown, and the second value is the
+ reason for the decision.
+ """
+
+ # Collect the node's states and consult the state transition table to
+ # decide whether we should shut down. Possible states are:
+ # crunch_worker_state = ['unpaired', 'busy', 'idle', 'down']
+ # window = ["open", "closed"]
+ # boot_grace = ["boot wait", "boot exceeded"]
+ # idle_grace = ["not idle", "idle wait", "idle exceeded"]
+
+ if self.arvados_node and not timestamp_fresh(arvados_node_mtime(self.arvados_node), self.node_stale_after):
+ return (False, "node state is stale")
+
+ crunch_worker_state = self.get_state()
+
+ window = "open" if self._shutdowns.window_open() else "closed"
+
+ if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
+ boot_grace = "boot wait"
else:
- return "node is not idle."
+ boot_grace = "boot exceeded"
+
+ # API server side not implemented yet.
+ idle_grace = 'idle exceeded'
+
+ node_state = (crunch_worker_state, window, boot_grace, idle_grace)
+ t = transitions[node_state]
+ if t is not None:
+ # yes, shutdown eligible
+ return (True, "node state is %s" % (node_state,))
+ else:
+ # no, return a reason
+ return (False, "node state is %s" % (node_state,))
def consider_shutdown(self):
try:
+ eligible, reason = self.shutdown_eligible()
next_opening = self._shutdowns.next_opening()
- eligible = self.shutdown_eligible()
- if eligible is True:
- self._debug("Suggesting shutdown.")
+ if eligible:
+ self._debug("Suggesting shutdown because %s", reason)
_notify_subscribers(self.actor_ref.proxy(), self.subscribers)
- elif self._shutdowns.window_open():
- self._debug("Cannot shut down because %s", eligible)
- elif self.last_shutdown_opening != next_opening:
- self._debug("Shutdown window closed. Next at %s.",
- time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
- self._timer.schedule(next_opening, self._later.consider_shutdown)
- self.last_shutdown_opening = next_opening
+ else:
+ self._debug("Not eligible for shut down because %s", reason)
+
+ if self.last_shutdown_opening != next_opening:
+ self._debug("Shutdown window closed. Next at %s.",
+ time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
+ self._timer.schedule(next_opening, self._later.consider_shutdown)
+ self.last_shutdown_opening = next_opening
except Exception:
self._logger.exception("Unexpected exception")
from . import ComputeNodeShutdownActor as ShutdownActorBase
from .. import RetryMixin
-class ComputeNodeShutdownActor(ShutdownActorBase):
+class SlurmMixin(object):
SLURM_END_STATES = frozenset(['down\n', 'down*\n',
'drain\n', 'drain*\n',
'fail\n', 'fail*\n'])
SLURM_DRAIN_STATES = frozenset(['drain\n', 'drng\n'])
+ def _set_node_state(self, nodename, state, *args):
+ cmd = ['scontrol', 'update', 'NodeName=' + nodename,
+ 'State=' + state]
+ cmd.extend(args)
+ subprocess.check_output(cmd)
+
+ def _get_slurm_state(self, nodename):
+ return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', nodename])
+
+
+class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
def on_start(self):
arv_node = self._arvados_node()
if arv_node is None:
self._logger.info("Draining SLURM node %s", self._nodename)
self._later.issue_slurm_drain()
- def _set_node_state(self, state, *args):
- cmd = ['scontrol', 'update', 'NodeName=' + self._nodename,
- 'State=' + state]
- cmd.extend(args)
- subprocess.check_output(cmd)
-
- def _get_slurm_state(self):
- return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
-
- # The following methods retry on OSError. This is intended to mitigate bug
- # #6321 where fork() of node manager raises "OSError: [Errno 12] Cannot
- # allocate memory" resulting in the untimely death of the shutdown actor
- # and tends to result in node manager getting into a wedged state where it
- # won't allocate new nodes or shut down gracefully. The underlying causes
- # of the excessive memory usage that result in the "Cannot allocate memory"
- # error are still being investigated.
-
- @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+ @RetryMixin._retry((subprocess.CalledProcessError,))
def cancel_shutdown(self, reason):
if self._nodename:
- if self._get_slurm_state() in self.SLURM_DRAIN_STATES:
+ if self._get_slurm_state(self._nodename) in self.SLURM_DRAIN_STATES:
# Resume from "drng" or "drain"
- self._set_node_state('RESUME')
+ self._set_node_state(self._nodename, 'RESUME')
else:
# Node is in a state such as 'idle' or 'alloc' so don't
# try to resume it because that will just raise an error.
pass
return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
- @RetryMixin._retry((subprocess.CalledProcessError, OSError))
- @ShutdownActorBase._stop_if_window_closed
+ @RetryMixin._retry((subprocess.CalledProcessError,))
def issue_slurm_drain(self):
- self._set_node_state('DRAIN', 'Reason=Node Manager shutdown')
- self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
- self._later.await_slurm_drain()
+ if self.cancel_reason is not None:
+ return
+ if self._nodename:
+ self._set_node_state(self._nodename, 'DRAIN', 'Reason=Node Manager shutdown')
+ self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
+ self._later.await_slurm_drain()
+ else:
+ self._later.shutdown_node()
- @RetryMixin._retry((subprocess.CalledProcessError, OSError))
- @ShutdownActorBase._stop_if_window_closed
+ @RetryMixin._retry((subprocess.CalledProcessError,))
def await_slurm_drain(self):
- output = self._get_slurm_state()
- if output in self.SLURM_END_STATES:
- self._later.shutdown_node()
- else:
+ if self.cancel_reason is not None:
+ return
+ output = self._get_slurm_state(self._nodename)
+ if output in ("drng\n", "alloc\n", "drng*\n", "alloc*\n"):
self._timer.schedule(time.time() + 10,
self._later.await_slurm_drain)
+ elif output in ("idle\n"):
+ # Not in "drng" so cancel self.
+ self.cancel_shutdown("slurm state is %s" % output.strip())
+ else:
+ # any other state.
+ self._later.shutdown_node()
--- /dev/null
+transitions = {
+ ('busy', 'closed', 'boot exceeded', 'idle exceeded'): None,
+ ('busy', 'closed', 'boot exceeded', 'idle wait'): None,
+ ('busy', 'closed', 'boot exceeded', 'not idle'): None,
+ ('busy', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('busy', 'closed', 'boot wait', 'idle wait'): None,
+ ('busy', 'closed', 'boot wait', 'not idle'): None,
+ ('busy', 'open', 'boot exceeded', 'idle exceeded'): None,
+ ('busy', 'open', 'boot exceeded', 'idle wait'): None,
+ ('busy', 'open', 'boot exceeded', 'not idle'): None,
+ ('busy', 'open', 'boot wait', 'idle exceeded'): None,
+ ('busy', 'open', 'boot wait', 'idle wait'): None,
+ ('busy', 'open', 'boot wait', 'not idle'): None,
+
+ ('down', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('down', 'closed', 'boot wait', 'idle wait'): None,
+ ('down', 'closed', 'boot wait', 'not idle'): None,
+ ('down', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'not idle'): "START_SHUTDOWN",
+
+ ('idle', 'closed', 'boot exceeded', 'idle exceeded'): None,
+ ('idle', 'closed', 'boot exceeded', 'idle wait'): None,
+ ('idle', 'closed', 'boot exceeded', 'not idle'): None,
+ ('idle', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('idle', 'closed', 'boot wait', 'idle wait'): None,
+ ('idle', 'closed', 'boot wait', 'not idle'): None,
+ ('idle', 'open', 'boot exceeded', 'idle exceeded'): "START_DRAIN",
+ ('idle', 'open', 'boot exceeded', 'idle wait'): None,
+ ('idle', 'open', 'boot exceeded', 'not idle'): None,
+ ('idle', 'open', 'boot wait', 'idle exceeded'): "START_DRAIN",
+ ('idle', 'open', 'boot wait', 'idle wait'): None,
+ ('idle', 'open', 'boot wait', 'not idle'): None,
+
+ ('unpaired', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('unpaired', 'closed', 'boot wait', 'idle wait'): None,
+ ('unpaired', 'closed', 'boot wait', 'not idle'): None,
+ ('unpaired', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot wait', 'idle exceeded'): None,
+ ('unpaired', 'open', 'boot wait', 'idle wait'): None,
+ ('unpaired', 'open', 'boot wait', 'not idle'): None}
key = NodeAuthSSHKey(ssh_file.read())
return 'auth', key
- def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
+ def search_for_now(self, term, list_method, key=attrgetter('id'), **kwargs):
"""Return one matching item from a list of cloud objects.
Raises ValueError if the number of matching objects is not exactly 1.
Arguments:
* term: The value that identifies a matching item.
- * list_method: A string that names the method to call on this
- instance's libcloud driver for a list of objects.
+ * list_method: A string that names the method to call for a
+ list of objects.
* key: A function that accepts a cloud object and returns a
value search for a `term` match on each item. Returns the
object's 'id' attribute by default.
"""
+ try:
+ list_func = getattr(self, list_method)
+ except AttributeError:
+ list_func = getattr(self.real, list_method)
+ items = list_func(**kwargs)
+ results = [item for item in items if key(item) == term]
+ count = len(results)
+ if count != 1:
+ raise ValueError("{} returned {} results for {!r}".format(
+ list_method, count, term))
+ return results[0]
+
+ def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
+ """Return one cached matching item from a list of cloud objects.
+
+ See search_for_now() for details of arguments and exceptions.
+ This method caches results, so it is best suited to static cloud
+ objects like node sizes, regions, etc.
+ """
cache_key = (list_method, term)
if cache_key not in self.SEARCH_CACHE:
- items = getattr(self.real, list_method)(**kwargs)
- results = [item for item in items
- if key(item) == term]
- count = len(results)
- if count != 1:
- raise ValueError("{} returned {} results for '{}'".format(
- list_method, count, term))
- self.SEARCH_CACHE[cache_key] = results[0]
+ self.SEARCH_CACHE[cache_key] = self.search_for_now(
+ term, list_method, key, **kwargs)
return self.SEARCH_CACHE[cache_key]
- def list_nodes(self):
- return self.real.list_nodes(**self.list_kwargs)
+ def list_nodes(self, **kwargs):
+ l = self.list_kwargs.copy()
+ l.update(kwargs)
+ return self.real.list_nodes(**l)
+
+ def create_cloud_name(self, arvados_node):
+ """Return a cloud node name for the given Arvados node record.
+
+ Subclasses must override this method. It should return a string
+ that can be used as the name for a newly-created cloud node,
+ based on identifying information in the Arvados node record.
+
+ Arguments:
+ * arvados_node: The Arvados node record to seed the new cloud node.
+ """
+ raise NotImplementedError("BaseComputeNodeDriver.create_cloud_name")
def arvados_create_kwargs(self, size, arvados_node):
"""Return dynamic keyword arguments for create_node.
kwargs.update(self.arvados_create_kwargs(size, arvados_node))
kwargs['size'] = size
return self.real.create_node(**kwargs)
- except self.CLOUD_ERRORS:
+ except self.CLOUD_ERRORS as create_error:
# Workaround for bug #6702: sometimes the create node request
# succeeds but times out and raises an exception instead of
# returning a result. If this happens, we get stuck in a retry
# loop forever because subsequent create_node attempts will fail
# due to node name collision. So check if the node we intended to
# create shows up in the cloud node list and return it if found.
- node = self.search_for(kwargs['name'], 'list_nodes', self._name_key)
- if node:
- return node
- else:
- # something else went wrong, re-raise the exception
- raise
+ try:
+ return self.search_for_now(kwargs['name'], 'list_nodes', self._name_key)
+ except ValueError:
+ raise create_error
def post_create_node(self, cloud_node):
# ComputeNodeSetupActor calls this method after the cloud node is
auth_kwargs, list_kwargs, create_kwargs,
driver_class)
+ def create_cloud_name(self, arvados_node):
+ uuid_parts = arvados_node['uuid'].split('-', 2)
+ return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
+
def arvados_create_kwargs(self, size, arvados_node):
- cluster_id, _, node_id = arvados_node['uuid'].split('-')
- name = 'compute-{}-{}'.format(node_id, cluster_id)
tags = {
'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
'arv-ping-url': self._make_ping_url(arvados_node)
}
tags.update(self.tags)
+ name = self.create_cloud_name(arvados_node)
customdata = """#!/bin/sh
mkdir -p /var/tmp/arv-node-data/meta-data
echo %s > /var/tmp/arv-node-data/arv-ping-url
def _init_subnet_id(self, subnet_id):
return 'ex_subnet', self.search_for(subnet_id, 'ex_list_subnets')
+ create_cloud_name = staticmethod(arvados_node_fqdn)
+
def arvados_create_kwargs(self, size, arvados_node):
- return {'name': arvados_node_fqdn(arvados_node),
+ return {'name': self.create_cloud_name(arvados_node),
'ex_userdata': self._make_ping_url(arvados_node)}
def post_create_node(self, cloud_node):
self.create_kwargs['ex_metadata']['sshKeys'] = (
'root:' + ssh_file.read().strip())
+ def create_cloud_name(self, arvados_node):
+ uuid_parts = arvados_node['uuid'].split('-', 2)
+ return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
+
def arvados_create_kwargs(self, size, arvados_node):
- cluster_id, _, node_id = arvados_node['uuid'].split('-')
- name = 'compute-{}-{}'.format(node_id, cluster_id)
+ name = self.create_cloud_name(arvados_node)
disks = [
{'autoDelete': True,
'boot': True,
'poll_stale_after': '600',
'max_total_price': '0',
'boot_fail_after': str(sys.maxint),
- 'node_stale_after': str(60 * 60 * 2)},
+ 'node_stale_after': str(60 * 60 * 2),
+ 'watchdog': '600'},
'Logging': {'file': '/dev/stderr',
'level': 'WARNING'},
}.iteritems():
self.cloud_node = cloud_node
self.arvados_node = arvados_node
self.assignment_time = assignment_time
-
+ self.shutdown_actor = None
class _BaseNodeTracker(object):
def __init__(self):
self.cloud_nodes = _CloudNodeTracker()
self.arvados_nodes = _ArvadosNodeTracker()
self.booting = {} # Actor IDs to ComputeNodeSetupActors
- self.booted = {} # Cloud node IDs to _ComputeNodeRecords
- self.shutdowns = {} # Cloud node IDs to ComputeNodeShutdownActors
- self.sizes_booting_shutdown = {} # Actor IDs or Cloud node IDs to node size
+ self.sizes_booting = {} # Actor IDs to node size
def on_start(self):
self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
record = _ComputeNodeRecord(actor.proxy(), cloud_node)
return record
+ def _register_cloud_node(self, node):
+ rec = self.cloud_nodes.get(node.id)
+ if rec is None:
+ self._logger.info("Registering new cloud node %s", node.id)
+ record = self._new_node(node)
+ self.cloud_nodes.add(record)
+ else:
+ rec.cloud_node = node
+
def update_cloud_nodes(self, nodelist):
self._update_poll_time('cloud_nodes')
- for key, node in self.cloud_nodes.update_from(nodelist):
- self._logger.info("Registering new cloud node %s", key)
- if key in self.booted:
- record = self.booted.pop(key)
- else:
- record = self._new_node(node)
- self.cloud_nodes.add(record)
- for arv_rec in self.arvados_nodes.unpaired():
- if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
- self._pair_nodes(record, arv_rec.arvados_node)
- break
- for key, record in self.cloud_nodes.orphans.iteritems():
- if key in self.shutdowns:
+ for _, node in self.cloud_nodes.update_from(nodelist):
+ self._register_cloud_node(node)
+
+ self.try_pairing()
+
+ for record in self.cloud_nodes.orphans.itervalues():
+ if record.shutdown_actor:
try:
- self.shutdowns[key].stop().get()
+ record.shutdown_actor.stop()
except pykka.ActorDeadError:
pass
- del self.shutdowns[key]
- del self.sizes_booting_shutdown[key]
- record.actor.stop()
- record.cloud_node = None
+ record.shutdown_actor = None
+
+ # A recently booted node is a node that successfully completed the
+ # setup actor but has not yet appeared in the cloud node list.
+ # This will have the tag _nodemanager_recently_booted on it, which
+ # means (if we're not shutting it down) we want to put it back into
+ # the cloud node list. Once it really appears in the cloud list,
+ # the object in record.cloud_node will be replaced by a new one
+ # that lacks the "_nodemanager_recently_booted" tag.
+ if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
+ self.cloud_nodes.add(record)
+ else:
+ record.actor.stop()
+ record.cloud_node = None
+
+ def _register_arvados_node(self, key, arv_node):
+ self._logger.info("Registering new Arvados node %s", key)
+ record = _ComputeNodeRecord(arvados_node=arv_node)
+ self.arvados_nodes.add(record)
def update_arvados_nodes(self, nodelist):
self._update_poll_time('arvados_nodes')
for key, node in self.arvados_nodes.update_from(nodelist):
- self._logger.info("Registering new Arvados node %s", key)
- record = _ComputeNodeRecord(arvados_node=node)
- self.arvados_nodes.add(record)
- for arv_rec in self.arvados_nodes.unpaired():
- arv_node = arv_rec.arvados_node
- for cloud_rec in self.cloud_nodes.unpaired():
- if cloud_rec.actor.offer_arvados_pair(arv_node).get():
- self._pair_nodes(cloud_rec, arv_node)
+ self._register_arvados_node(key, node)
+ self.try_pairing()
+
+ def try_pairing(self):
+ for record in self.cloud_nodes.unpaired():
+ for arv_rec in self.arvados_nodes.unpaired():
+ if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
+ self._pair_nodes(record, arv_rec.arvados_node)
break
def _nodes_booting(self, size):
s = sum(1
for c in self.booting.iterkeys()
- if size is None or self.sizes_booting_shutdown[c].id == size.id)
- s += sum(1
- for c in self.booted.itervalues()
- if size is None or c.cloud_node.size.id == size.id)
+ if size is None or self.sizes_booting[c].id == size.id)
return s
- def _nodes_unpaired(self, size):
- return sum(1
- for c in self.cloud_nodes.unpaired()
- if size is None or c.cloud_node.size.id == size.id)
-
- def _nodes_booted(self, size):
- return sum(1
- for c in self.cloud_nodes.nodes.itervalues()
- if size is None or c.cloud_node.size.id == size.id)
-
- def _nodes_up(self, size):
- up = self._nodes_booting(size) + self._nodes_booted(size)
+ def _node_states(self, size):
+ states = pykka.get_all(rec.actor.get_state()
+ for rec in self.cloud_nodes.nodes.itervalues()
+ if ((size is None or rec.cloud_node.size.id == size.id) and
+ rec.shutdown_actor is None))
+ states += ['shutdown' for rec in self.cloud_nodes.nodes.itervalues()
+ if ((size is None or rec.cloud_node.size.id == size.id) and
+ rec.shutdown_actor is not None)]
+ return states
+
+ def _state_counts(self, size):
+ states = self._node_states(size)
+ counts = {
+ "booting": self._nodes_booting(size),
+ "unpaired": 0,
+ "busy": 0,
+ "idle": 0,
+ "down": 0,
+ "shutdown": 0
+ }
+ for s in states:
+ counts[s] = counts[s] + 1
+ return counts
+
+ def _nodes_up(self, counts):
+ up = counts["booting"] + counts["unpaired"] + counts["idle"] + counts["busy"]
return up
def _total_price(self):
cost = 0
- cost += sum(self.server_calculator.find_size(self.sizes_booting_shutdown[c].id).price
+ cost += sum(self.server_calculator.find_size(self.sizes_booting[c].id).price
for c in self.booting.iterkeys())
cost += sum(self.server_calculator.find_size(c.cloud_node.size.id).price
- for i in (self.booted, self.cloud_nodes.nodes)
- for c in i.itervalues())
+ for c in self.cloud_nodes.nodes.itervalues())
return cost
- def _nodes_busy(self, size):
- return sum(1 for busy in
- pykka.get_all(rec.actor.in_state('busy') for rec in
- self.cloud_nodes.nodes.itervalues()
- if rec.cloud_node.size.id == size.id)
- if busy)
-
- def _nodes_missing(self, size):
- return sum(1 for arv_node in
- pykka.get_all(rec.actor.arvados_node for rec in
- self.cloud_nodes.nodes.itervalues()
- if rec.cloud_node.size.id == size.id and rec.actor.cloud_node.get().id not in self.shutdowns)
- if arv_node and cnode.arvados_node_missing(arv_node, self.node_stale_after))
-
def _size_wishlist(self, size):
return sum(1 for c in self.last_wishlist if c.id == size.id)
- def _size_shutdowns(self, size):
- sh = 0
- for c in self.shutdowns.iterkeys():
- try:
- if self.sizes_booting_shutdown[c].id == size.id:
- sh += 1
- except pykka.ActorDeadError:
- pass
- return sh
-
def _nodes_wanted(self, size):
- total_up_count = self._nodes_up(None)
- under_min = self.min_nodes - total_up_count
- over_max = total_up_count - self.max_nodes
+ total_node_count = self._nodes_booting(None) + len(self.cloud_nodes)
+ under_min = self.min_nodes - total_node_count
+ over_max = total_node_count - self.max_nodes
total_price = self._total_price()
- if over_max >= 0:
- return -over_max
- elif under_min > 0 and size.id == self.min_cloud_size.id:
- return under_min
+ counts = self._state_counts(size)
- booting_count = self._nodes_booting(size) + self._nodes_unpaired(size)
- shutdown_count = self._size_shutdowns(size)
- busy_count = self._nodes_busy(size)
- up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size))
+ up_count = self._nodes_up(counts)
+ busy_count = counts["busy"]
- self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name,
+ self._logger.info("%s: wishlist %i, up %i (booting %i, unpaired %i, idle %i, busy %i), down %i, shutdown %i", size.name,
self._size_wishlist(size),
- up_count + busy_count,
- booting_count,
- up_count - booting_count,
+ up_count,
+ counts["booting"],
+ counts["unpaired"],
+ counts["idle"],
busy_count,
- shutdown_count)
+ counts["down"],
+ counts["shutdown"])
+
+ if over_max >= 0:
+ return -over_max
+ elif under_min > 0 and size.id == self.min_cloud_size.id:
+ return under_min
- wanted = self._size_wishlist(size) - up_count
+ wanted = self._size_wishlist(size) - (up_count - busy_count)
if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price):
can_boot = int((self.max_total_price - total_price) / size.price)
if can_boot == 0:
return wanted
def _nodes_excess(self, size):
- up_count = self._nodes_up(size) - self._size_shutdowns(size)
+ counts = self._state_counts(size)
+ up_count = self._nodes_up(counts)
if size.id == self.min_cloud_size.id:
up_count -= self.min_nodes
- return up_count - self._nodes_busy(size) - self._size_wishlist(size)
+ return up_count - (counts["busy"] + self._size_wishlist(size))
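A worked example (the numbers are invented) of how the per-state counts from _state_counts() feed the two calculations above:

    counts = {"booting": 1, "unpaired": 0, "idle": 1, "busy": 2,
              "down": 0, "shutdown": 1}
    up_count = counts["booting"] + counts["unpaired"] + counts["idle"] + counts["busy"]  # 4
    wishlist = 3
    # _nodes_wanted: busy nodes no longer satisfy the wishlist, so
    # wanted = 3 - (4 - 2) = 1, i.e. boot one more node of this size.
    wanted = wishlist - (up_count - counts["busy"])
    # _nodes_excess: anything beyond the busy nodes plus the wishlist, so
    # excess = 4 - (2 + 3) = -1, i.e. nothing is eligible for shutdown.
    excess = up_count - (counts["busy"] + wishlist)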
def update_server_wishlist(self, wishlist):
self._update_poll_time('server_wishlist')
cloud_client=self._new_cloud(),
cloud_size=cloud_size).proxy()
self.booting[new_setup.actor_ref.actor_urn] = new_setup
- self.sizes_booting_shutdown[new_setup.actor_ref.actor_urn] = cloud_size
+ self.sizes_booting[new_setup.actor_ref.actor_urn] = cloud_size
if arvados_node is not None:
self.arvados_nodes[arvados_node['uuid']].assignment_time = (
return pykka.get_all([getattr(actor, name) for name in attr_names])
def node_up(self, setup_proxy):
- cloud_node = setup_proxy.cloud_node.get()
- del self.booting[setup_proxy.actor_ref.actor_urn]
- del self.sizes_booting_shutdown[setup_proxy.actor_ref.actor_urn]
-
+ # Called when a SetupActor has completed.
+ cloud_node, arvados_node = self._get_actor_attrs(
+ setup_proxy, 'cloud_node', 'arvados_node')
setup_proxy.stop()
+
+ # If cloud_node is None then the node create wasn't
+ # successful and so there isn't anything to do.
if cloud_node is not None:
- record = self.cloud_nodes.get(cloud_node.id)
- if record is None:
- record = self._new_node(cloud_node)
- self.booted[cloud_node.id] = record
- self._timer.schedule(time.time() + self.boot_fail_after,
- self._later.shutdown_unpaired_node, cloud_node.id)
+ # Node creation succeeded. Update cloud node list.
+ cloud_node._nodemanager_recently_booted = True
+ self._register_cloud_node(cloud_node)
+ del self.booting[setup_proxy.actor_ref.actor_urn]
+ del self.sizes_booting[setup_proxy.actor_ref.actor_urn]
@_check_poll_freshness
def stop_booting_node(self, size):
if (nodes_excess < 1) or not self.booting:
return None
for key, node in self.booting.iteritems():
- if node.cloud_size.get().id == size.id and node.stop_if_no_cloud_node().get():
+ if node and node.cloud_size.get().id == size.id and node.stop_if_no_cloud_node().get():
del self.booting[key]
- del self.sizes_booting_shutdown[key]
+ del self.sizes_booting[key]
if nodes_excess > 1:
self._later.stop_booting_node(size)
def _begin_node_shutdown(self, node_actor, cancellable):
cloud_node_obj = node_actor.cloud_node.get()
cloud_node_id = cloud_node_obj.id
- if cloud_node_id in self.shutdowns:
+ record = self.cloud_nodes[cloud_node_id]
+ if record.shutdown_actor is not None:
return None
shutdown = self._node_shutdown.start(
timer_actor=self._timer, cloud_client=self._new_cloud(),
arvados_client=self._new_arvados(),
node_monitor=node_actor.actor_ref, cancellable=cancellable)
- self.shutdowns[cloud_node_id] = shutdown.proxy()
- self.sizes_booting_shutdown[cloud_node_id] = cloud_node_obj.size
+ record.shutdown_actor = shutdown.proxy()
shutdown.tell_proxy().subscribe(self._later.node_finished_shutdown)
@_check_poll_freshness
def node_can_shutdown(self, node_actor):
if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
self._begin_node_shutdown(node_actor, cancellable=True)
-
- def shutdown_unpaired_node(self, cloud_node_id):
- for record_dict in [self.cloud_nodes, self.booted]:
- if cloud_node_id in record_dict:
- record = record_dict[cloud_node_id]
- break
- else:
- return None
- if not record.actor.in_state('idle', 'busy').get():
- self._begin_node_shutdown(record.actor, cancellable=False)
+ elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
+ # Node is unpaired, which means it probably exceeded its booting
+ # grace period without a ping, so shut it down so we can boot a new
+ # node in its place.
+ self._begin_node_shutdown(node_actor, cancellable=False)
+ elif node_actor.in_state('down').get():
+ # Node is down and unlikely to come back.
+ self._begin_node_shutdown(node_actor, cancellable=False)
def node_finished_shutdown(self, shutdown_actor):
- cloud_node, success, cancel_reason = self._get_actor_attrs(
- shutdown_actor, 'cloud_node', 'success', 'cancel_reason')
- shutdown_actor.stop()
+ try:
+ cloud_node, success, cancel_reason = self._get_actor_attrs(
+ shutdown_actor, 'cloud_node', 'success', 'cancel_reason')
+ except pykka.ActorDeadError:
+ return
cloud_node_id = cloud_node.id
+ record = self.cloud_nodes[cloud_node_id]
+ shutdown_actor.stop()
if not success:
if cancel_reason == self._node_shutdown.NODE_BROKEN:
self.cloud_nodes.blacklist(cloud_node_id)
- elif cloud_node_id in self.booted:
- self.booted.pop(cloud_node_id).actor.stop()
- del self.shutdowns[cloud_node_id]
- del self.sizes_booting_shutdown[cloud_node_id]
+ record.shutdown_actor = None
+ else:
+ # If the node went from being booted to being shut down without ever
+ # appearing in the cloud node list, it will have the
+ # _nodemanager_recently_booted tag, so get rid of it so that the node
+ # can be forgotten completely.
+ if hasattr(self.cloud_nodes[cloud_node_id].cloud_node, "_nodemanager_recently_booted"):
+ del self.cloud_nodes[cloud_node_id].cloud_node._nodemanager_recently_booted
def shutdown(self):
self._logger.info("Shutting down after signal.")
import pykka
from . import config as nmconfig
+from .baseactor import WatchdogActor
from .daemon import NodeManagerDaemonActor
from .jobqueue import JobQueueMonitorActor, ServerCalculator
from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
node_setup, node_shutdown, node_monitor,
max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
+ WatchdogActor.start(config.getint('Daemon', 'watchdog'),
+ cloud_node_poller.actor_ref,
+ arvados_node_poller.actor_ref,
+ job_queue_poller.actor_ref,
+ node_daemon.actor_ref)
+
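The WatchdogActor implementation itself is not part of this diff; judging from the 'watchdog' config default above and the ping()/os.kill test added later, its behaviour is roughly the sketch below (assumptions: each monitored actor answers ping(), and a missed ping kills the whole process so the service supervisor restarts Node Manager).

    import os
    import signal

    def watchdog_check(timeout, actor_refs):
        for ref in actor_refs:
            try:
                # proxy().ping() returns a pykka future; a wedged actor
                # makes get() time out.
                ref.proxy().ping().get(timeout)
            except Exception:
                os.kill(os.getpid(), signal.SIGKILL)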
signal.pause()
daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
while not daemon_stopped():
cloud_node = testutil.cloud_node_mock(61)
arv_node = testutil.arvados_node_mock(61)
self.make_mocks(cloud_node, arv_node, shutdown_open=False)
+ self.cloud_client.destroy_node.return_value = False
self.make_actor(cancellable=True)
+ self.shutdown_actor.cancel_shutdown("test")
self.check_success_flag(False, 2)
self.assertFalse(self.arvados_client.nodes().update.called)
self.check_success_flag(True)
self.assertTrue(self.cloud_client.destroy_node.called)
- def test_shutdown_cancelled_when_window_closes(self):
- self.make_mocks(shutdown_open=False)
- self.make_actor()
- self.check_success_flag(False, 2)
- self.assertFalse(self.cloud_client.destroy_node.called)
- self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED,
- self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
-
def test_shutdown_retries_when_cloud_fails(self):
self.make_mocks()
self.cloud_client.destroy_node.return_value = False
self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
self.driver().sync_node.assert_called_with(cloud_node, arv_node)
+ @testutil.no_sleep
+ def test_node_sync_error(self):
+ self.make_actor()
+ cloud_node = testutil.cloud_node_mock()
+ arv_node = testutil.arvados_node_mock()
+ self.driver().sync_node.side_effect = (IOError, Exception, True)
+ self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+ self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+ self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+ self.driver().sync_node.assert_called_with(cloud_node, arv_node)
class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
unittest.TestCase):
def test_in_state_when_unpaired(self):
self.make_actor()
- self.assertIsNone(self.node_state('idle', 'busy'))
+ self.assertTrue(self.node_state('unpaired'))
def test_in_state_when_pairing_stale(self):
self.make_actor(arv_node=testutil.arvados_node_mock(
job_uuid=None, age=90000))
- self.assertIsNone(self.node_state('idle', 'busy'))
+ self.assertTrue(self.node_state('down'))
def test_in_state_when_no_state_available(self):
self.make_actor(arv_node=testutil.arvados_node_mock(
crunch_worker_state=None))
- self.assertIsNone(self.node_state('idle', 'busy'))
+ print(self.node_actor.get_state().get())
+ self.assertTrue(self.node_state('idle'))
+
+ def test_in_state_when_no_state_available_old(self):
+ self.make_actor(arv_node=testutil.arvados_node_mock(
+ crunch_worker_state=None, age=90000))
+ print(self.node_actor.get_state().get())
+ self.assertTrue(self.node_state('down'))
def test_in_idle_state(self):
self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None))
def test_no_shutdown_booting(self):
self.make_actor()
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is still booting"))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
def test_shutdown_without_arvados_node(self):
self.make_actor(start_time=0)
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertEquals((True, "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')"),
+ self.node_actor.shutdown_eligible().get(self.TIMEOUT))
- def test_no_shutdown_missing(self):
+ def test_shutdown_missing(self):
arv_node = testutil.arvados_node_mock(10, job_uuid=None,
crunch_worker_state="down",
last_ping_at='1970-01-01T01:02:03.04050607Z')
self.make_actor(10, arv_node)
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+ self.node_actor.shutdown_eligible().get(self.TIMEOUT))
- def test_no_shutdown_running_broken(self):
+ def test_shutdown_running_broken(self):
arv_node = testutil.arvados_node_mock(12, job_uuid=None,
crunch_worker_state="down")
self.make_actor(12, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+ self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing_broken(self):
arv_node = testutil.arvados_node_mock(11, job_uuid=None,
self.make_actor(11, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
def test_no_shutdown_when_window_closed(self):
self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("shutdown window is not open."))
+ self.assertEquals((False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')"),
+ self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_running_job(self):
self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"),
+ self.node_actor.shutdown_eligible().get(self.TIMEOUT))
- def test_no_shutdown_when_node_state_unknown(self):
+ def test_shutdown_when_node_state_unknown(self):
self.make_actor(5, testutil.arvados_node_mock(
5, crunch_worker_state=None))
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals((True, "node state is ('idle', 'open', 'boot wait', 'idle exceeded')"),
+ self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_state_stale(self):
self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals((False, "node state is stale"),
+ self.node_actor.shutdown_eligible().get(self.TIMEOUT))
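Reading the expected strings in the tests above (an inference from the assertions, not from the monitor actor's source): shutdown_eligible() now returns a (bool, reason) pair, and the reason embeds a four-part state tuple along the lines of:

    # (worker_state, shutdown_window, boot_grace, idle_grace)
    state = ('down', 'open', 'boot wait', 'idle exceeded')
    worker_state, shutdown_window, boot_grace, idle_grace = state
    # In these tests a node is eligible when the window is 'open' and it is
    # 'down', 'unpaired' past its boot grace period, or 'idle' past its idle
    # grace period; a 'busy' state or a 'closed' window keeps it running.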
def test_arvados_node_match(self):
self.make_actor(2)
from __future__ import absolute_import, print_function
import subprocess
+import time
import unittest
import mock
self.check_success_after_reset(proc_mock, end_state)
return test
- for wait_state in ['alloc\n', 'drng\n', 'idle*\n']:
+ for wait_state in ['alloc\n', 'drng\n']:
locals()['test_wait_while_' + wait_state.strip()
] = make_wait_state_test(start_state=wait_state)
- for end_state in ['down\n', 'down*\n', 'drain\n', 'fail\n']:
+ for end_state in ['idle*\n', 'down\n', 'down*\n', 'drain\n', 'fail\n']:
locals()['test_wait_until_' + end_state.strip()
] = make_wait_state_test(end_state=end_state)
def test_slurm_bypassed_when_no_arvados_node(self, proc_mock):
# Test we correctly handle a node that failed to bootstrap.
- proc_mock.return_value = 'idle\n'
+ proc_mock.return_value = 'down\n'
self.make_actor(start_time=0)
self.check_success_flag(True)
self.assertFalse(proc_mock.called)
- def test_node_undrained_when_shutdown_window_closes(self, proc_mock):
- proc_mock.side_effect = iter(['drng\n', 'idle\n'])
- self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
- self.make_actor()
- self.check_success_flag(False, 2)
- self.check_slurm_got_args(proc_mock, 'NodeName=compute99', 'State=RESUME')
-
- def test_alloc_node_undrained_when_shutdown_window_closes(self, proc_mock):
- proc_mock.side_effect = iter(['alloc\n'])
- self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
- self.make_actor()
- self.check_success_flag(False, 2)
- self.check_slurm_got_args(proc_mock, 'sinfo', '--noheader', '-o', '%t', '-n', 'compute99')
+ def test_node_undrained_when_shutdown_cancelled(self, proc_mock):
+ try:
+ proc_mock.side_effect = iter(['', 'drng\n', 'drng\n', ''])
+ self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
+ self.timer = testutil.MockTimer(False)
+ self.make_actor()
+ self.busywait(lambda: proc_mock.call_args is not None)
+ self.shutdown_actor.cancel_shutdown("test").get(self.TIMEOUT)
+ self.check_success_flag(False, 2)
+ self.assertEqual(proc_mock.call_args_list,
+ [mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=DRAIN', 'Reason=Node Manager shutdown']),
+ mock.call(['sinfo', '--noheader', '-o', '%t', '-n', 'compute99']),
+ mock.call(['sinfo', '--noheader', '-o', '%t', '-n', 'compute99']),
+ mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=RESUME'])])
+ finally:
+ self.shutdown_actor.actor_ref.stop()
def test_cancel_shutdown_retry(self, proc_mock):
- proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n'])
+ proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n', 'idle\n'])
self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
self.make_actor()
self.check_success_flag(False, 2)
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import unittest
+
+import libcloud.common.types as cloud_types
+import mock
+
+import arvnodeman.computenode.driver as driver_base
+from . import testutil
+
+class ComputeNodeDriverTestCase(unittest.TestCase):
+ def setUp(self):
+ self.driver_mock = mock.MagicMock(name='driver_mock')
+ driver_base.BaseComputeNodeDriver.SEARCH_CACHE = {}
+
+ def test_search_for_now_uses_public_method(self):
+ image = testutil.cloud_object_mock(1)
+ self.driver_mock().list_images.return_value = [image]
+ driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+ self.assertIs(image, driver.search_for_now('id_1', 'list_images'))
+ self.assertEqual(1, self.driver_mock().list_images.call_count)
+
+ def test_search_for_now_uses_private_method(self):
+ net = testutil.cloud_object_mock(1)
+ self.driver_mock().ex_list_networks.return_value = [net]
+ driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+ self.assertIs(net, driver.search_for_now('id_1', 'ex_list_networks'))
+ self.assertEqual(1, self.driver_mock().ex_list_networks.call_count)
+
+ def test_search_for_now_raises_ValueError_on_zero_results(self):
+ self.driver_mock().list_images.return_value = []
+ driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+ with self.assertRaises(ValueError) as test:
+ driver.search_for_now('id_1', 'list_images')
+
+ def test_search_for_now_raises_ValueError_on_extra_results(self):
+ image = testutil.cloud_object_mock(1)
+ self.driver_mock().list_images.return_value = [image, image]
+ driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+ with self.assertRaises(ValueError) as test:
+ driver.search_for_now('id_1', 'list_images')
+
+ def test_search_for_now_does_not_cache_results(self):
+ image1 = testutil.cloud_object_mock(1)
+ image2 = testutil.cloud_object_mock(1)
+ self.driver_mock().list_images.side_effect = [[image1], [image2]]
+ driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+ self.assertIsNot(driver.search_for_now('id_1', 'list_images'),
+ driver.search_for_now('id_1', 'list_images'))
+ self.assertEqual(2, self.driver_mock().list_images.call_count)
+
+ def test_search_for_returns_cached_results(self):
+ image1 = testutil.cloud_object_mock(1)
+ image2 = testutil.cloud_object_mock(1)
+ self.driver_mock().list_images.side_effect = [[image1], [image2]]
+ driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
+ self.assertIs(driver.search_for('id_1', 'list_images'),
+ driver.search_for('id_1', 'list_images'))
+ self.assertEqual(1, self.driver_mock().list_images.call_count)
self.driver_mock().create_node.side_effect = IOError
n = driver.create_node(testutil.MockSize(1), arv_node)
self.assertEqual('compute-000000000000001-zzzzz', n.name)
+
+ def test_ex_fetch_nic_false(self):
+ arv_node = testutil.arvados_node_mock(1, hostname=None)
+ driver = self.new_driver(create_kwargs={"tag_arvados-class": "dynamic-compute"})
+ nodelist = [testutil.cloud_node_mock(1, tags={"arvados-class": "dynamic-compute"})]
+ nodelist[0].name = 'compute-000000000000001-zzzzz'
+ self.driver_mock().list_nodes.return_value = nodelist
+ n = driver.list_nodes()
+ self.assertEqual(nodelist, n)
+ self.driver_mock().list_nodes.assert_called_with(ex_fetch_nic=False, ex_resource_group='TestResourceGroup')
+
+ def test_create_can_find_node_after_timeout(self):
+ super(AzureComputeNodeDriverTestCase,
+ self).test_create_can_find_node_after_timeout(
+ create_kwargs={'tag_arvados-class': 'test'},
+ node_extra={'tags': {'arvados-class': 'test'}})
+
+ def test_node_found_after_timeout_has_fixed_size(self):
+ size = testutil.MockSize(4)
+ node_props = {'hardwareProfile': {'vmSize': size.id}}
+ cloud_node = testutil.cloud_node_mock(
+ size=None, tags={'arvados-class': 'test'}, properties=node_props)
+ self.check_node_found_after_timeout_has_fixed_size(
+ size, cloud_node, {'tag_arvados-class': 'test'})
self.assertIs(node, nodelist[0])
self.assertIs(size, nodelist[0].size)
+ def test_node_found_after_timeout_has_fixed_size(self):
+ size = testutil.MockSize(4)
+ cloud_node = testutil.cloud_node_mock(size=size.id)
+ self.check_node_found_after_timeout_has_fixed_size(size, cloud_node)
+
def test_list_empty_nodes(self):
self.driver_mock().list_nodes.return_value = []
self.assertEqual([], self.new_driver().list_nodes())
for name in ['cloud_nodes', 'arvados_nodes', 'server_wishlist']:
setattr(self, name + '_poller', mock.MagicMock(name=name + '_mock'))
self.arv_factory = mock.MagicMock(name='arvados_mock')
+ api_client = mock.MagicMock(name='api_client')
+ api_client.nodes().create().execute.side_effect = [testutil.arvados_node_mock(1),
+ testutil.arvados_node_mock(2)]
+ self.arv_factory.return_value = api_client
+
self.cloud_factory = mock.MagicMock(name='cloud_mock')
self.cloud_factory().node_start_time.return_value = time.time()
self.cloud_updates = mock.MagicMock(name='updates_mock')
min_nodes, max_nodes, 600, 1800, 3600,
self.node_setup, self.node_shutdown,
max_total_price=max_total_price).proxy()
- if cloud_nodes is not None:
- self.daemon.update_cloud_nodes(cloud_nodes).get(self.TIMEOUT)
if arvados_nodes is not None:
self.daemon.update_arvados_nodes(arvados_nodes).get(self.TIMEOUT)
+ if cloud_nodes is not None:
+ self.daemon.update_cloud_nodes(cloud_nodes).get(self.TIMEOUT)
if want_sizes is not None:
self.daemon.update_server_wishlist(want_sizes).get(self.TIMEOUT)
self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1),
testutil.cloud_node_mock(2)],
arvados_nodes=[testutil.arvados_node_mock(1),
- testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
+ testutil.arvados_node_mock(2,
+ last_ping_at='1970-01-01T01:02:03.04050607Z')],
want_sizes=[size, size])
self.stop_proxy(self.daemon)
self.assertTrue(self.node_setup.start.called)
mock_node_monitor.proxy.return_value = mock.NonCallableMock(cloud_node=get_cloud_node)
mock_shutdown = self.node_shutdown.start(node_monitor=mock_node_monitor)
- self.daemon.shutdowns.get()[cloud_nodes[1].id] = mock_shutdown.proxy()
- self.daemon.sizes_booting_shutdown.get()[cloud_nodes[1].id] = size
+ self.daemon.cloud_nodes.get()[cloud_nodes[1].id].shutdown_actor = mock_shutdown.proxy()
self.assertEqual(2, self.alive_monitor_count())
for mon_ref in self.monitor_list():
arv_node = testutil.arvados_node_mock(2, job_uuid=True)
self.make_daemon([testutil.cloud_node_mock(2, size=size)], [arv_node],
[size], avail_sizes=[(size, {"cores":1})])
+ self.busywait(lambda: self.node_setup.start.called)
self.stop_proxy(self.daemon)
self.assertTrue(self.node_setup.start.called)
self.last_setup.arvados_node.get.return_value = arv_node
return self.last_setup
- def test_no_new_node_when_booted_node_not_usable(self):
+ def test_new_node_when_booted_node_not_usable(self):
cloud_node = testutil.cloud_node_mock(4)
arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
setup = self.start_node_boot(cloud_node, arv_node)
self.daemon.node_up(setup).get(self.TIMEOUT)
self.assertEqual(1, self.alive_monitor_count())
- self.daemon.update_cloud_nodes([cloud_node])
self.daemon.update_arvados_nodes([arv_node])
+ self.daemon.update_cloud_nodes([cloud_node])
+ self.monitor_list()[0].proxy().cloud_node_start_time = time.time()-1801
self.daemon.update_server_wishlist(
[testutil.MockSize(1)]).get(self.TIMEOUT)
self.stop_proxy(self.daemon)
- self.assertEqual(1, self.node_setup.start.call_count)
+ self.assertEqual(2, self.node_setup.start.call_count)
def test_no_duplication_when_booting_node_listed_fast(self):
# Test that we don't start two ComputeNodeMonitorActors when
shutdown = self.node_shutdown.start().proxy()
shutdown.cloud_node.get.return_value = cloud_node
self.daemon.node_finished_shutdown(shutdown).get(self.TIMEOUT)
+ self.daemon.update_cloud_nodes([])
self.assertTrue(shutdown.stop.called,
"shutdown actor not stopped after finishing")
self.assertTrue(monitor.actor_ref.actor_stopped.wait(self.TIMEOUT),
def test_booted_node_shut_down_when_never_listed(self):
setup = self.start_node_boot()
+ self.cloud_factory().node_start_time.return_value = time.time() - 3601
self.daemon.node_up(setup).get(self.TIMEOUT)
self.assertEqual(1, self.alive_monitor_count())
self.assertFalse(self.node_shutdown.start.called)
- self.timer.deliver()
+ now = time.time()
+ self.monitor_list()[0].tell_proxy().consider_shutdown()
+ self.busywait(lambda: self.node_shutdown.start.called)
self.stop_proxy(self.daemon)
self.assertShutdownCancellable(False)
def test_booted_node_shut_down_when_never_paired(self):
cloud_node = testutil.cloud_node_mock(2)
setup = self.start_node_boot(cloud_node)
+ self.cloud_factory().node_start_time.return_value = time.time() - 3601
self.daemon.node_up(setup).get(self.TIMEOUT)
self.assertEqual(1, self.alive_monitor_count())
self.daemon.update_cloud_nodes([cloud_node])
- self.timer.deliver()
+ self.monitor_list()[0].tell_proxy().consider_shutdown()
+ self.busywait(lambda: self.node_shutdown.start.called)
self.stop_proxy(self.daemon)
self.assertShutdownCancellable(False)
cloud_node = testutil.cloud_node_mock(4)
arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
setup = self.start_node_boot(cloud_node, arv_node)
+ self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
self.daemon.node_up(setup).get(self.TIMEOUT)
self.assertEqual(1, self.alive_monitor_count())
+ self.monitor_list()[0].proxy().cloud_node_start_time = time.time()-3601
self.daemon.update_cloud_nodes([cloud_node])
- self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
- self.timer.deliver()
+ self.busywait(lambda: self.node_shutdown.start.called)
self.stop_proxy(self.daemon)
self.assertShutdownCancellable(False)
def test_shutdown_declined_at_wishlist_capacity(self):
cloud_node = testutil.cloud_node_mock(1)
+ arv_node = testutil.arvados_node_mock(1)
size = testutil.MockSize(1)
- self.make_daemon(cloud_nodes=[cloud_node], want_sizes=[size])
+ self.make_daemon(cloud_nodes=[cloud_node], arvados_nodes=[arv_node], want_sizes=[size])
self.assertEqual(1, self.alive_monitor_count())
monitor = self.monitor_list()[0].proxy()
self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
def test_shutdown_declined_below_min_nodes(self):
cloud_node = testutil.cloud_node_mock(1)
- self.make_daemon(cloud_nodes=[cloud_node], min_nodes=1)
+ arv_node = testutil.arvados_node_mock(1)
+ self.make_daemon(cloud_nodes=[cloud_node], arvados_nodes=[arv_node], min_nodes=1)
self.assertEqual(1, self.alive_monitor_count())
monitor = self.monitor_list()[0].proxy()
self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
def test_nodes_shutting_down_replaced_below_max_nodes(self):
size = testutil.MockSize(6)
cloud_node = testutil.cloud_node_mock(6, size=size)
- self.make_daemon([cloud_node], [testutil.arvados_node_mock(6)],
+ self.make_daemon([cloud_node], [testutil.arvados_node_mock(6, crunch_worker_state='down')],
avail_sizes=[(size, {"cores":1})])
self.assertEqual(1, self.alive_monitor_count())
monitor = self.monitor_list()[0].proxy()
self.stop_proxy(self.daemon)
self.assertEqual(1, self.last_shutdown.stop.call_count)
- def busywait(self, f):
- n = 0
- while not f() and n < 10:
- time.sleep(.1)
- n += 1
- self.assertTrue(f())
-
def test_node_create_two_sizes(self):
small = testutil.MockSize(1)
big = testutil.MockSize(2)
self.daemon.node_can_shutdown(c.actor)
booting = self.daemon.booting.get()
- shutdowns = self.daemon.shutdowns.get()
+ cloud_nodes = self.daemon.cloud_nodes.get()
self.stop_proxy(self.daemon)
# shutting down a small node
sizecounts = {a[0].id: 0 for a in avail_sizes}
- for b in shutdowns.itervalues():
- sizecounts[b.cloud_node.get().size.id] += 1
+ for b in cloud_nodes.nodes.itervalues():
+ if b.shutdown_actor is not None:
+ sizecounts[b.cloud_node.size.id] += 1
self.assertEqual(1, sizecounts[small.id])
self.assertEqual(0, sizecounts[big.id])
import errno
import logging
+import time
import threading
import unittest
def doStuff(self):
raise self.exp
+ def ping(self):
+ # Called by WatchdogActorTest, this delay is longer than the test timeout
+ # of 1 second, which should cause the watchdog ping to fail.
+ time.sleep(2)
+ return True
+
class ActorUnhandledExceptionTest(unittest.TestCase):
def test_fatal_error(self):
for e in (MemoryError(), threading.ThreadError(), OSError(errno.ENOMEM, "")):
- with mock.patch('os.killpg') as killpg_mock:
+ with mock.patch('os.kill') as kill_mock:
act = BogusActor.start(e).tell_proxy()
act.doStuff()
act.actor_ref.stop(block=True)
- self.assertTrue(killpg_mock.called)
-
- def test_nonfatal_error(self):
- with mock.patch('os.killpg') as killpg_mock:
- act = BogusActor.start(OSError(errno.ENOENT, "")).tell_proxy()
- act.doStuff()
- act.actor_ref.stop(block=True)
- self.assertFalse(killpg_mock.called)
+ self.assertTrue(kill_mock.called)
+
+ @mock.patch('os.kill')
+ def test_nonfatal_error(self, kill_mock):
+ act = BogusActor.start(OSError(errno.ENOENT, "")).tell_proxy()
+ act.doStuff()
+ act.actor_ref.stop(block=True)
+ self.assertFalse(kill_mock.called)
+
+class WatchdogActorTest(unittest.TestCase):
+ @mock.patch('os.kill')
+ def test_time_timeout(self, kill_mock):
+ act = BogusActor.start(OSError(errno.ENOENT, ""))
+ watch = arvnodeman.baseactor.WatchdogActor.start(1, act)
+ watch.stop(block=True)
+ act.stop(block=True)
+ self.assertTrue(kill_mock.called)
import threading
import time
+import libcloud.common.types as cloud_types
import mock
import pykka
if result is not unassigned:
return result
+ def busywait(self, f):
+ n = 0
+ while not f() and n < 10:
+ time.sleep(.1)
+ n += 1
+ self.assertTrue(f())
+
class DriverTestMixin(object):
def setUp(self):
self.assertTrue(self.driver_mock.called)
self.assertIs(driver.real, driver_mock2)
+ def test_create_can_find_node_after_timeout(self, create_kwargs={}, node_extra={}):
+ driver = self.new_driver(create_kwargs=create_kwargs)
+ arv_node = arvados_node_mock()
+ cloud_node = cloud_node_mock(**node_extra)
+ cloud_node.name = driver.create_cloud_name(arv_node)
+ create_method = self.driver_mock().create_node
+ create_method.side_effect = cloud_types.LibcloudError("fake timeout")
+ list_method = self.driver_mock().list_nodes
+ list_method.return_value = [cloud_node]
+ actual = driver.create_node(MockSize(1), arv_node)
+ self.assertIs(cloud_node, actual)
+
+ def test_create_can_raise_exception_after_timeout(self):
+ driver = self.new_driver()
+ arv_node = arvados_node_mock()
+ create_method = self.driver_mock().create_node
+ create_method.side_effect = cloud_types.LibcloudError("fake timeout")
+ list_method = self.driver_mock().list_nodes
+ list_method.return_value = []
+ with self.assertRaises(cloud_types.LibcloudError) as exc_test:
+ driver.create_node(MockSize(1), arv_node)
+ self.assertIs(create_method.side_effect, exc_test.exception)
+
+ def check_node_found_after_timeout_has_fixed_size(self, size, cloud_node,
+ create_kwargs={}):
+ # This method needs to be called explicitly by driver test suites
+ # that need it.
+ self.driver_mock().list_sizes.return_value = [size]
+ driver = self.new_driver(create_kwargs=create_kwargs)
+ arv_node = arvados_node_mock()
+ cloud_node.name = driver.create_cloud_name(arv_node)
+ create_method = self.driver_mock().create_node
+ create_method.side_effect = cloud_types.LibcloudError("fake timeout")
+ self.driver_mock().list_nodes.return_value = [cloud_node]
+ actual = driver.create_node(size, arv_node)
+ self.assertIs(size, actual.size)
+
+
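A hedged sketch of the recovery path these mixin tests exercise (the real logic lives in the driver classes, which this diff does not show): if the cloud's create call raises a LibcloudError, look for an existing node that already carries the name we would have assigned and return that instead of failing.

    def create_node_with_recovery(driver, size, arv_node):
        # driver.real is the underlying libcloud driver, as used elsewhere in
        # these tests; the create_node kwargs are simplified for illustration.
        name = driver.create_cloud_name(arv_node)
        try:
            return driver.real.create_node(name=name, size=size)
        except cloud_types.LibcloudError:
            for node in driver.real.list_nodes():
                if node.name == name:
                    return node
            raise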
class RemotePollLoopActorTestMixin(ActorTestMixin):
def build_monitor(self, *args, **kwargs):
self.timer = mock.MagicMock(name='timer_mock')
ARVADOS_ROOT="$ARVBOX_DATA/arvados"
fi
-if test -z "$ARVADOS_DEV_ROOT" ; then
- ARVADOS_DEV_ROOT="$ARVBOX_DATA/arvados-dev"
-fi
-
if test -z "$SSO_ROOT" ; then
SSO_ROOT="$ARVBOX_DATA/sso-devise-omniauth-provider"
fi
VAR_DATA="$ARVBOX_DATA/var"
PASSENGER="$ARVBOX_DATA/passenger"
GEMS="$ARVBOX_DATA/gems"
+PIPCACHE="$ARVBOX_DATA/pip"
+GOSTUFF="$ARVBOX_DATA/gopath"
getip() {
docker inspect $ARVBOX_CONTAINER | grep \"IPAddress\" | head -n1 | tr -d ' ":,\n' | cut -c10-
}
run() {
+ if docker ps -a --filter "status=running" | grep -E "$ARVBOX_CONTAINER$" -q ; then
+ echo "Container $ARVBOX_CONTAINER is already running"
+ exit 0
+ fi
+
if docker ps -a | grep -E "$ARVBOX_CONTAINER$" -q ; then
- echo "Container $ARVBOX_CONTAINER is already running, use stop, restart or rebuild"
+ echo "Container $ARVBOX_CONTAINER already exists but is not running; use restart or rebuild"
exit 1
fi
--publish=25100:25100
--publish=25107:25107
--publish=25108:25108
- --publish=8001:8001"
+ --publish=8001:8001
+ --publish=8002:8002"
else
PUBLIC=""
fi
updateconf
wait_for_arvbox
else
- mkdir -p "$PG_DATA" "$VAR_DATA" "$PASSENGER" "$GEMS"
+ mkdir -p "$PG_DATA" "$VAR_DATA" "$PASSENGER" "$GEMS" "$PIPCACHE" "$GOSTUFF"
+
if ! test -d "$ARVADOS_ROOT" ; then
git clone https://github.com/curoverse/arvados.git "$ARVADOS_ROOT"
if test "$1" = test ; then
shift
- if ! test -d "$ARVADOS_DEV_ROOT" ; then
- git clone https://github.com/curoverse/arvados-dev.git "$ARVADOS_DEV_ROOT"
- fi
-
mkdir -p $VAR_DATA/test
docker run \
--name=$ARVBOX_CONTAINER \
--privileged \
"--volume=$ARVADOS_ROOT:/usr/src/arvados:rw" \
- "--volume=$ARVADOS_DEV_ROOT:/usr/src/arvados-dev:rw" \
"--volume=$SSO_ROOT:/usr/src/sso:rw" \
"--volume=$PG_DATA:/var/lib/postgresql:rw" \
"--volume=$VAR_DATA:/var/lib/arvados:rw" \
"--volume=$PASSENGER:/var/lib/passenger:rw" \
"--volume=$GEMS:/var/lib/gems:rw" \
+ "--volume=$PIPCACHE:/var/lib/pip:rw" \
+ "--volume=$GOSTUFF:/var/lib/gopath:rw" \
arvados/arvbox-dev \
/usr/local/bin/runsvinit -svdir=/etc/test-service
docker exec -ti \
$ARVBOX_CONTAINER \
/usr/local/lib/arvbox/runsu.sh \
- /usr/src/arvados-dev/jenkins/run-tests.sh \
+ /usr/src/arvados/build/run-tests.sh \
--temp /var/lib/arvados/test \
WORKSPACE=/usr/src/arvados \
GEM_HOME=/var/lib/gems \
"--volume=$VAR_DATA:/var/lib/arvados:rw" \
"--volume=$PASSENGER:/var/lib/passenger:rw" \
"--volume=$GEMS:/var/lib/gems:rw" \
+ "--volume=$PIPCACHE:/var/lib/pip:rw" \
+ "--volume=$GOSTUFF:/var/lib/gopath:rw" \
$PUBLIC \
arvados/arvbox-dev
updateconf
echo "Could not find Dockerfile (expected it at $ARVBOX_DOCKER/Dockerfile.base)"
exit 1
fi
- docker build -t arvados/arvbox-base -f "$ARVBOX_DOCKER/Dockerfile.base" "$ARVBOX_DOCKER"
+ docker build $NO_CACHE -t arvados/arvbox-base -f "$ARVBOX_DOCKER/Dockerfile.base" "$ARVBOX_DOCKER"
if test "$1" = localdemo -o "$1" = publicdemo ; then
- docker build -t arvados/arvbox-demo -f "$ARVBOX_DOCKER/Dockerfile.demo" "$ARVBOX_DOCKER"
+ docker build $NO_CACHE -t arvados/arvbox-demo -f "$ARVBOX_DOCKER/Dockerfile.demo" "$ARVBOX_DOCKER"
else
- docker build -t arvados/arvbox-dev -f "$ARVBOX_DOCKER/Dockerfile.dev" "$ARVBOX_DOCKER"
+ docker build $NO_CACHE -t arvados/arvbox-dev -f "$ARVBOX_DOCKER/Dockerfile.dev" "$ARVBOX_DOCKER"
fi
}
build $@
;;
+ rebuild)
+ check $@
+ NO_CACHE=--no-cache build $@
+ ;;
+
start|run)
check $@
run $@
run $@
;;
- rebuild)
+ reboot)
check $@
stop
build $@
*)
echo "Arvados-in-a-box http://arvados.org"
echo
- echo "$(basename $0) (build|start|run|open|shell|ip|stop|rebuild|reset|destroy|log|svrestart)"
- echo
- echo "build <config> build arvbox Docker image"
+ echo "build <config> build arvbox Docker image"
+ echo "rebuild <config> build arvbox Docker image, no layer cache"
echo "start|run <config> start $ARVBOX_CONTAINER container"
echo "open open arvbox workbench in a web browser"
echo "shell enter arvbox shell"
echo "status print some information about current arvbox"
echo "stop stop arvbox container"
echo "restart <config> stop, then run again"
- echo "rebuild <config> stop, build arvbox Docker image, run"
+ echo "reboot <config> stop, build arvbox Docker image, run"
echo "reset delete arvbox arvados data (be careful!)"
echo "destroy delete all arvbox code and data (be careful!)"
echo "log <service> tail log of specified service"
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get -yq install \
- postgresql-9.4 git gcc golang-go runit \
+ postgresql-9.4 git gcc runit \
ruby rake bundler curl libpq-dev \
libcurl4-openssl-dev libssl-dev zlib1g-dev libpcre3-dev \
openssh-server python-setuptools netcat-traditional \
libjson-perl nginx gitolite3 lsof python-epydoc graphviz \
apt-transport-https ca-certificates slurm-wlm
+RUN cd /usr/local && \
+ curl -O http://storage.googleapis.com/golang/go1.6.2.linux-amd64.tar.gz && \
+ tar -xzf go1.6.2.linux-amd64.tar.gz && \
+ rm go1.6.2.linux-amd64.tar.gz && \
+ cd bin && \
+ ln -s /usr/local/go/bin/* .
+
VOLUME /var/lib/docker
VOLUME /var/log/nginx
VOLUME /etc/ssl/private
ADD crunch-setup.sh gitolite.rc \
keep-setup.sh common.sh createusers.sh \
logger runsu.sh waitforpostgres.sh \
- application_yml_override.py \
+ application_yml_override.py api-setup.sh \
/usr/local/lib/arvbox/
# Start the supervisor.
RUN sudo -u arvbox /var/lib/arvbox/service/workbench/run-service --only-deps
RUN sudo -u arvbox /var/lib/arvbox/service/doc/run-service --only-deps
RUN sudo -u arvbox /var/lib/arvbox/service/vm/run-service --only-deps
+RUN sudo -u arvbox /var/lib/arvbox/service/keep-web/run-service --only-deps
+RUN sudo -u arvbox /var/lib/arvbox/service/keepproxy/run-service --only-deps
+RUN sudo -u arvbox /var/lib/arvbox/service/arv-git-httpd/run-service --only-deps
+RUN sudo -u arvbox /usr/local/lib/arvbox/keep-setup.sh --only-deps
RUN sudo -u arvbox /var/lib/arvbox/service/sdk/run-service
--- /dev/null
+#!/bin/bash
+
+exec 2>&1
+set -ex -o pipefail
+
+. /usr/local/lib/arvbox/common.sh
+
+cd /usr/src/arvados/services/api
+export RAILS_ENV=development
+
+set -u
+
+if ! test -s /var/lib/arvados/api_uuid_prefix ; then
+ ruby -e 'puts "#{rand(2**64).to_s(36)[0,5]}"' > /var/lib/arvados/api_uuid_prefix
+fi
+uuid_prefix=$(cat /var/lib/arvados/api_uuid_prefix)
+
+if ! test -s /var/lib/arvados/api_secret_token ; then
+ ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/api_secret_token
+fi
+secret_token=$(cat /var/lib/arvados/api_secret_token)
+
+if ! test -s /var/lib/arvados/blob_signing_key ; then
+ ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/blob_signing_key
+fi
+blob_signing_key=$(cat /var/lib/arvados/blob_signing_key)
+
+# self signed key will be created by SSO server script.
+test -s /var/lib/arvados/self-signed.key
+
+sso_app_secret=$(cat /var/lib/arvados/sso_app_secret)
+
+if test -s /var/lib/arvados/vm-uuid ; then
+ vm_uuid=$(cat /var/lib/arvados/vm-uuid)
+else
+ vm_uuid=$uuid_prefix-2x53u-$(ruby -e 'puts rand(2**400).to_s(36)[0,15]')
+ echo $vm_uuid > /var/lib/arvados/vm-uuid
+fi
+
+cat >config/application.yml <<EOF
+development:
+ uuid_prefix: $uuid_prefix
+ secret_token: $secret_token
+ blob_signing_key: $blob_signing_key
+ sso_app_secret: $sso_app_secret
+ sso_app_id: arvados-server
+ sso_provider_url: "https://$localip:${services[sso]}"
+ sso_insecure: true
+ workbench_address: "http://$localip/"
+ websocket_address: "ws://$localip:${services[websockets]}/websocket"
+ git_repo_ssh_base: "git@$localip:"
+ git_repo_https_base: "http://$localip:${services[arv-git-httpd]}/"
+ new_users_are_active: true
+ auto_admin_first_user: true
+ auto_setup_new_users: true
+ auto_setup_new_users_with_vm_uuid: $vm_uuid
+ auto_setup_new_users_with_repository: true
+ default_collection_replication: 1
+EOF
+
+(cd config && /usr/local/lib/arvbox/application_yml_override.py)
+
+if ! test -f /var/lib/arvados/api_database_pw ; then
+ ruby -e 'puts rand(2**128).to_s(36)' > /var/lib/arvados/api_database_pw
+fi
+database_pw=$(cat /var/lib/arvados/api_database_pw)
+
+if ! (psql postgres -c "\du" | grep "^ arvados ") >/dev/null ; then
+ psql postgres -c "create user arvados with password '$database_pw'"
+ psql postgres -c "ALTER USER arvados CREATEDB;"
+fi
+
+sed "s/password:.*/password: $database_pw/" <config/database.yml.example >config/database.yml
+
+if ! test -f /var/lib/arvados/api_database_setup ; then
+ bundle exec rake db:setup
+ touch /var/lib/arvados/api_database_setup
+fi
+
+if ! test -s /var/lib/arvados/superuser_token ; then
+ bundle exec ./script/create_superuser_token.rb > /var/lib/arvados/superuser_token
+fi
+
+rm -rf tmp
+
+bundle exec rake db:migrate
[keepstore1]=25108
[ssh]=22
[doc]=8001
+ [websockets]=8002
)
if test "$(id arvbox -u 2>/dev/null)" = 0 ; then
else
frozen=""
fi
- if ! flock /var/lib/arvados/gems.lock bundle install --path $GEM_HOME --local --no-deployment $frozen "$@" ; then
- flock /var/lib/arvados/gems.lock bundle install --path $GEM_HOME --no-deployment $frozen "$@"
+ if ! flock /var/lib/gems/gems.lock bundle install --path $GEM_HOME --local --no-deployment $frozen "$@" ; then
+ flock /var/lib/gems/gems.lock bundle install --path $GEM_HOME --no-deployment $frozen "$@"
fi
}
pip_install() {
- pushd /var/lib/arvados/pip
+ pushd /var/lib/pip
for p in $(ls http*.tar.gz) ; do
if test -f $p ; then
ln -sf $p $(echo $p | sed 's/.*%2F\(.*\)/\1/')
done
popd
- if ! pip install --no-index --find-links /var/lib/arvados/pip $1 ; then
+ if ! pip install --no-index --find-links /var/lib/pip $1 ; then
pip install $1
fi
}
HOSTGID=$(ls -nd /usr/src/arvados | sed 's/ */ /' | cut -d' ' -f5)
FUSEGID=$(ls -nd /dev/fuse | sed 's/ */ /' | cut -d' ' -f5)
- mkdir -p /var/lib/arvados/git /var/lib/gems /var/lib/passenger
+ mkdir -p /var/lib/arvados/git /var/lib/gems \
+ /var/lib/passenger /var/lib/gopath /var/lib/pip
groupadd --gid $HOSTGID --non-unique arvbox
groupadd --gid $FUSEGID --non-unique fuse
chown arvbox:arvbox -R /usr/local /var/lib/arvados /var/lib/gems \
/var/lib/passenger /var/lib/postgresql \
- /var/lib/nginx /var/log/nginx /etc/ssl/private
+ /var/lib/nginx /var/log/nginx /etc/ssl/private \
+ /var/lib/gopath /var/lib/pip
mkdir -p /var/lib/gems/ruby/2.1.0
chown arvbox:arvbox -R /var/lib/gems/ruby/2.1.0
. /usr/local/lib/arvbox/common.sh
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
export GOPATH=$PWD
mkdir -p "$GOPATH/src/git.curoverse.com"
ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/crunchstat"
-install bin/crunchstat /usr/local/bin
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/crunchstat"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/sdk/go/crunchrunner"
+install bin/crunchstat bin/crunchrunner /usr/local/bin
export ARVADOS_API_HOST=$localip:${services[api]}
export ARVADOS_API_HOST_INSECURE=1
export HOME=/tmp/$1
cd /usr/src/arvados/services/api
-exec bundle exec ./script/crunch-dispatch.rb development
+if test "$1" = "crunch0" ; then
+ exec bundle exec ./script/crunch-dispatch.rb development --jobs --pipelines
+else
+ exec bundle exec ./script/crunch-dispatch.rb development --jobs
+fi
. /usr/local/lib/arvbox/common.sh
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
export GOPATH=$PWD
mkdir -p "$GOPATH/src/git.curoverse.com"
ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/keepstore"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/keepstore"
install bin/keepstore /usr/local/bin
+if test "$1" = "--only-deps" ; then
+ exit
+fi
+
mkdir -p /var/lib/arvados/$1
export ARVADOS_API_HOST=$localip:${services[api]}
-listen=:$2 \
-enforce-permissions=true \
-blob-signing-key-file=/var/lib/arvados/blob_signing_key \
+ -data-manager-token-file=/var/lib/arvados/superuser_token \
-max-buffers=20 \
-volume=/var/lib/arvados/$1
exit
fi
-set -u
-
-if ! test -s /var/lib/arvados/api_uuid_prefix ; then
- ruby -e 'puts "#{rand(2**64).to_s(36)[0,5]}"' > /var/lib/arvados/api_uuid_prefix
-fi
-uuid_prefix=$(cat /var/lib/arvados/api_uuid_prefix)
-
-if ! test -s /var/lib/arvados/api_secret_token ; then
- ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/api_secret_token
-fi
-secret_token=$(cat /var/lib/arvados/api_secret_token)
-
-if ! test -s /var/lib/arvados/blob_signing_key ; then
- ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/blob_signing_key
-fi
-blob_signing_key=$(cat /var/lib/arvados/blob_signing_key)
-
-# self signed key will be created by SSO server script.
-test -s /var/lib/arvados/self-signed.key
-
-sso_app_secret=$(cat /var/lib/arvados/sso_app_secret)
-
-if test -s /var/lib/arvados/vm-uuid ; then
- vm_uuid=$(cat /var/lib/arvados/vm-uuid)
-else
- vm_uuid=$uuid_prefix-2x53u-$(ruby -e 'puts rand(2**400).to_s(36)[0,15]')
- echo $vm_uuid > /var/lib/arvados/vm-uuid
-fi
-
-cat >config/application.yml <<EOF
-development:
- uuid_prefix: $uuid_prefix
- secret_token: $secret_token
- blob_signing_key: $blob_signing_key
- sso_app_secret: $sso_app_secret
- sso_app_id: arvados-server
- sso_provider_url: "https://$localip:${services[sso]}"
- sso_insecure: true
- workbench_address: "http://$localip/"
- git_repo_ssh_base: "git@$localip:"
- git_repo_https_base: "http://$localip:${services[arv-git-httpd]}/"
- new_users_are_active: true
- auto_admin_first_user: true
- auto_setup_new_users: true
- auto_setup_new_users_with_vm_uuid: $vm_uuid
- auto_setup_new_users_with_repository: true
- default_collection_replication: 1
-EOF
-
-(cd config && /usr/local/lib/arvbox/application_yml_override.py)
-
-if ! test -f /var/lib/arvados/api_database_pw ; then
- ruby -e 'puts rand(2**128).to_s(36)' > /var/lib/arvados/api_database_pw
-fi
-database_pw=$(cat /var/lib/arvados/api_database_pw)
-
-if ! (psql postgres -c "\du" | grep "^ arvados ") >/dev/null ; then
- psql postgres -c "create user arvados with password '$database_pw'"
- psql postgres -c "ALTER USER arvados CREATEDB;"
-fi
-
-sed "s/password:.*/password: $database_pw/" <config/database.yml.example >config/database.yml
-
-if ! test -f /var/lib/arvados/api_database_setup ; then
- bundle exec rake db:setup
- touch /var/lib/arvados/api_database_setup
-fi
-
-if ! test -s /var/lib/arvados/superuser_token ; then
- bundle exec ./script/create_superuser_token.rb > /var/lib/arvados/superuser_token
-fi
-
-rm -rf tmp
-
-bundle exec rake db:migrate
+flock /var/lib/arvados/api.lock /usr/local/lib/arvbox/api-setup.sh
set +u
if test "$1" = "--only-setup" ; then
exit
fi
-ARVADOS_WEBSOCKETS=1 exec bundle exec passenger start --port=${services[api]} \
+exec bundle exec passenger start --port=${services[api]} \
--runtime-dir=/var/lib/passenger \
--ssl --ssl-certificate=/var/lib/arvados/self-signed.pem \
--ssl-certificate-key=/var/lib/arvados/self-signed.key
#!/bin/bash
exec 2>&1
-set -eux -o pipefail
+set -ex -o pipefail
. /usr/local/lib/arvbox/common.sh
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
export GOPATH=$PWD
mkdir -p "$GOPATH/src/git.curoverse.com"
ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/arv-git-httpd"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/arv-git-httpd"
install bin/arv-git-httpd /usr/local/bin
+if test "$1" = "--only-deps" ; then
+ exit
+fi
+
export ARVADOS_API_HOST=$localip:${services[api]}
export ARVADOS_API_HOST_INSECURE=1
export GITOLITE_HTTP_HOME=/var/lib/arvados/git
. /usr/local/lib/arvbox/common.sh
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
export GOPATH=$PWD
mkdir -p "$GOPATH/src/git.curoverse.com"
ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/crunch-run"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/crunch-dispatch-local"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/crunch-run"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/crunch-dispatch-local"
install bin/crunch-run bin/crunch-dispatch-local /usr/local/bin
export ARVADOS_API_HOST=$localip:${services[api]}
#!/bin/bash
exec 2>&1
-set -eux -o pipefail
+set -ex -o pipefail
. /usr/local/lib/arvbox/common.sh
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
export GOPATH=$PWD
mkdir -p "$GOPATH/src/git.curoverse.com"
ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/keep-web"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/keep-web"
install bin/keep-web /usr/local/bin
+if test "$1" = "--only-deps" ; then
+ exit
+fi
+
export ARVADOS_API_HOST=$localip:${services[api]}
export ARVADOS_API_HOST_INSECURE=1
export ARVADOS_API_TOKEN=$(cat /var/lib/arvados/superuser_token)
exec 2>&1
sleep 2
-set -eux -o pipefail
+set -ex -o pipefail
. /usr/local/lib/arvbox/common.sh
-mkdir -p /var/lib/arvados/gostuff
-cd /var/lib/arvados/gostuff
+mkdir -p /var/lib/gopath
+cd /var/lib/gopath
export GOPATH=$PWD
mkdir -p "$GOPATH/src/git.curoverse.com"
ln -sfn "/usr/src/arvados" "$GOPATH/src/git.curoverse.com/arvados.git"
-flock /var/lib/arvados/gostuff.lock go get -t "git.curoverse.com/arvados.git/services/keepproxy"
+flock /var/lib/gopath/gopath.lock go get -t "git.curoverse.com/arvados.git/services/keepproxy"
install bin/keepproxy /usr/local/bin
+if test "$1" = "--only-deps" ; then
+ exit
+fi
+
export ARVADOS_API_HOST=$localip:${services[api]}
export ARVADOS_API_HOST_INSECURE=1
export ARVADOS_API_TOKEN=$(cat /var/lib/arvados/superuser_token)
. /usr/local/lib/arvbox/common.sh
-mkdir -p ~/.pip /var/lib/arvados/pip
+mkdir -p ~/.pip /var/lib/pip
cat > ~/.pip/pip.conf <<EOF
[global]
-download_cache = /var/lib/arvados/pip
+download_cache = /var/lib/pip
EOF
cd /usr/src/arvados/sdk/cli
--- /dev/null
+/usr/local/lib/arvbox/logger
\ No newline at end of file
--- /dev/null
+/usr/local/lib/arvbox/runsu.sh
\ No newline at end of file
--- /dev/null
+#!/bin/bash
+
+exec 2>&1
+set -ex -o pipefail
+
+. /usr/local/lib/arvbox/common.sh
+
+cd /usr/src/arvados/services/api
+export RAILS_ENV=development
+
+run_bundler --without=development
+
+if test "$1" = "--only-deps" ; then
+ exit
+fi
+
+flock /var/lib/arvados/api.lock /usr/local/lib/arvbox/api-setup.sh
+
+set +u
+if test "$1" = "--only-setup" ; then
+ exit
+fi
+
+export ARVADOS_WEBSOCKETS=ws-only
+
+# serving ssl directly doesn't work, gets
+# Rack app error: #<TypeError: no implicit conversion of Puma::MiniSSL::Socket into Integer>
+#exec bundle exec puma -b "ssl://0.0.0.0:${services[websockets]}?cert=/var/lib/arvados/self-signed.pem&key=/var/lib/arvados/self-signed.key"
+
+exec bundle exec puma -p${services[websockets]}
import arvados
import Queue
import threading
+import _strptime
from crunchstat_summary import logger
import re
import sys
import threading
+import _strptime
from arvados.api import OrderedJsonModel
from crunchstat_summary import logger
--- /dev/null
+keep-block-check
--- /dev/null
+package main
+
+import (
+ "crypto/tls"
+ "errors"
+ "flag"
+ "fmt"
+ "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+ "git.curoverse.com/arvados.git/sdk/go/keepclient"
+ "io/ioutil"
+ "log"
+ "net/http"
+ "os"
+ "regexp"
+ "strings"
+ "time"
+)
+
+func main() {
+ err := doMain(os.Args[1:])
+ if err != nil {
+ log.Fatalf("%v", err)
+ }
+}
+
+func doMain(args []string) error {
+ flags := flag.NewFlagSet("keep-block-check", flag.ExitOnError)
+
+ configFile := flags.String(
+ "config",
+ "",
+ "Configuration filename. May be either a pathname to a config file, or (for example) 'foo' as shorthand for $HOME/.config/arvados/foo.conf file. This file is expected to specify the values for ARVADOS_API_TOKEN, ARVADOS_API_HOST, ARVADOS_API_HOST_INSECURE, and ARVADOS_BLOB_SIGNING_KEY for the source.")
+
+ keepServicesJSON := flags.String(
+ "keep-services-json",
+ "",
+ "An optional list of available keepservices. "+
+ "If not provided, this list is obtained from api server configured in config-file.")
+
+ locatorFile := flags.String(
+ "block-hash-file",
+ "",
+ "Filename containing the block hashes to be checked. This is required. "+
+ "This file contains the block hashes one per line.")
+
+ prefix := flags.String(
+ "prefix",
+ "",
+ "Block hash prefix. When a prefix is specified, only hashes listed in the file with this prefix will be checked.")
+
+ blobSignatureTTLFlag := flags.Duration(
+ "blob-signature-ttl",
+ 0,
+ "Lifetime of blob permission signatures on the keepservers. If not provided, this will be retrieved from the API server's discovery document.")
+
+ verbose := flags.Bool(
+ "v",
+ false,
+ "Log progress of each block verification")
+
+ // Parse args; omit the first arg which is the command name
+ flags.Parse(args)
+
+ config, blobSigningKey, err := loadConfig(*configFile)
+ if err != nil {
+ return fmt.Errorf("Error loading configuration from file: %s", err.Error())
+ }
+
+ // get list of block locators to be checked
+ blockLocators, err := getBlockLocators(*locatorFile, *prefix)
+ if err != nil {
+ return fmt.Errorf("Error reading block hashes to be checked from file: %s", err.Error())
+ }
+
+ // setup keepclient
+ kc, blobSignatureTTL, err := setupKeepClient(config, *keepServicesJSON, *blobSignatureTTLFlag)
+ if err != nil {
+ return fmt.Errorf("Error configuring keepclient: %s", err.Error())
+ }
+
+ return performKeepBlockCheck(kc, blobSignatureTTL, blobSigningKey, blockLocators, *verbose)
+}
+
+type apiConfig struct {
+ APIToken string
+ APIHost string
+ APIHostInsecure bool
+ ExternalClient bool
+}
+
+// Load config from given file
+func loadConfig(configFile string) (config apiConfig, blobSigningKey string, err error) {
+ if configFile == "" {
+ err = errors.New("Client config file not specified")
+ return
+ }
+
+ config, blobSigningKey, err = readConfigFromFile(configFile)
+ return
+}
+
+var matchTrue = regexp.MustCompile("^(?i:1|yes|true)$")
+
+// Read config from file
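+// The file is expected to contain newline-separated NAME=VALUE pairs; for
+// example (illustrative values only):
+//
+//   ARVADOS_API_HOST=zzzzz.arvadosapi.com
+//   ARVADOS_API_TOKEN=xyzzy
+//   ARVADOS_API_HOST_INSECURE=false
+//   ARVADOS_BLOB_SIGNING_KEY=abcdefg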
+func readConfigFromFile(filename string) (config apiConfig, blobSigningKey string, err error) {
+ if !strings.Contains(filename, "/") {
+ filename = os.Getenv("HOME") + "/.config/arvados/" + filename + ".conf"
+ }
+
+ content, err := ioutil.ReadFile(filename)
+
+ if err != nil {
+ return
+ }
+
+ lines := strings.Split(string(content), "\n")
+ for _, line := range lines {
+ if line == "" {
+ continue
+ }
+
+ kv := strings.SplitN(line, "=", 2)
+ if len(kv) == 2 {
+ key := strings.TrimSpace(kv[0])
+ value := strings.TrimSpace(kv[1])
+
+ switch key {
+ case "ARVADOS_API_TOKEN":
+ config.APIToken = value
+ case "ARVADOS_API_HOST":
+ config.APIHost = value
+ case "ARVADOS_API_HOST_INSECURE":
+ config.APIHostInsecure = matchTrue.MatchString(value)
+ case "ARVADOS_EXTERNAL_CLIENT":
+ config.ExternalClient = matchTrue.MatchString(value)
+ case "ARVADOS_BLOB_SIGNING_KEY":
+ blobSigningKey = value
+ }
+ }
+ }
+
+ return
+}
+
+// setup keepclient using the config provided
+func setupKeepClient(config apiConfig, keepServicesJSON string, blobSignatureTTL time.Duration) (kc *keepclient.KeepClient, ttl time.Duration, err error) {
+ arv := arvadosclient.ArvadosClient{
+ ApiToken: config.APIToken,
+ ApiServer: config.APIHost,
+ ApiInsecure: config.APIHostInsecure,
+ Client: &http.Client{Transport: &http.Transport{
+ TLSClientConfig: &tls.Config{InsecureSkipVerify: config.APIHostInsecure}}},
+ External: config.ExternalClient,
+ }
+
+ // if keepServicesJSON is provided, use it to load services; else, use DiscoverKeepServers
+ if keepServicesJSON == "" {
+ kc, err = keepclient.MakeKeepClient(&arv)
+ if err != nil {
+ return
+ }
+ } else {
+ kc = keepclient.New(&arv)
+ err = kc.LoadKeepServicesFromJSON(keepServicesJSON)
+ if err != nil {
+ return
+ }
+ }
+
+ // If blobSignatureTTL is not provided, get it from the API server's discovery document
+ ttl = blobSignatureTTL
+ if blobSignatureTTL == 0 {
+ value, err := arv.Discovery("blobSignatureTtl")
+ if err == nil {
+ ttl = time.Duration(int(value.(float64))) * time.Second
+ } else {
+ return nil, 0, err
+ }
+ }
+
+ return
+}
+
+// Get list of unique block locators from the given file
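+// The file is expected to contain one block hash per line; surrounding white
+// space and blank lines are ignored, and duplicate hashes are returned only
+// once. Example contents (illustrative hashes):
+//
+//   aaaa09c290d0fb1ca068ffaddf22cbd0
+//   aaaac516f788aec4f30932ffb6395c39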
+func getBlockLocators(locatorFile, prefix string) (locators []string, err error) {
+ if locatorFile == "" {
+ err = errors.New("block-hash-file not specified")
+ return
+ }
+
+ content, err := ioutil.ReadFile(locatorFile)
+ if err != nil {
+ return
+ }
+
+ locatorMap := make(map[string]bool)
+ for _, line := range strings.Split(string(content), "\n") {
+ line = strings.TrimSpace(line)
+ if line == "" || !strings.HasPrefix(line, prefix) || locatorMap[line] {
+ continue
+ }
+ locators = append(locators, line)
+ locatorMap[line] = true
+ }
+
+ return
+}
+
+// Check each block by requesting its headers from Keep. Log any errors.
+func performKeepBlockCheck(kc *keepclient.KeepClient, blobSignatureTTL time.Duration, blobSigningKey string, blockLocators []string, verbose bool) error {
+ totalBlocks := len(blockLocators)
+ notFoundBlocks := 0
+ current := 0
+ for _, locator := range blockLocators {
+ current++
+ if verbose {
+ log.Printf("Verifying block %d of %d: %v", current, totalBlocks, locator)
+ }
+ getLocator := locator
+ if blobSigningKey != "" {
+ expiresAt := time.Now().AddDate(0, 0, 1)
+ getLocator = keepclient.SignLocator(locator, kc.Arvados.ApiToken, expiresAt, blobSignatureTTL, []byte(blobSigningKey))
+ }
+
+ _, _, err := kc.Ask(getLocator)
+ if err != nil {
+ notFoundBlocks++
+ log.Printf("Error verifying block %v: %v", locator, err)
+ }
+ }
+
+ log.Printf("Verify block totals: %d attempts, %d successes, %d errors", totalBlocks, totalBlocks-notFoundBlocks, notFoundBlocks)
+
+ if notFoundBlocks > 0 {
+ return fmt.Errorf("Block verification failed for %d out of %d blocks with matching prefix.", notFoundBlocks, totalBlocks)
+ }
+
+ return nil
+}
--- /dev/null
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "log"
+ "os"
+ "regexp"
+ "strings"
+ "testing"
+ "time"
+
+ "git.curoverse.com/arvados.git/sdk/go/arvadostest"
+ "git.curoverse.com/arvados.git/sdk/go/keepclient"
+
+ . "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate
+func Test(t *testing.T) {
+ TestingT(t)
+}
+
+// Gocheck boilerplate
+var _ = Suite(&ServerRequiredSuite{})
+var _ = Suite(&DoMainTestSuite{})
+
+type ServerRequiredSuite struct{}
+type DoMainTestSuite struct{}
+
+var kc *keepclient.KeepClient
+var logBuffer bytes.Buffer
+
+var TestHash = "aaaa09c290d0fb1ca068ffaddf22cbd0"
+var TestHash2 = "aaaac516f788aec4f30932ffb6395c39"
+
+var blobSignatureTTL = time.Duration(2*7*24) * time.Hour
+
+func (s *ServerRequiredSuite) SetUpSuite(c *C) {
+ arvadostest.StartAPI()
+}
+
+func (s *ServerRequiredSuite) TearDownSuite(c *C) {
+ arvadostest.StopAPI()
+ arvadostest.ResetEnv()
+}
+
+func (s *ServerRequiredSuite) SetUpTest(c *C) {
+ logOutput := io.MultiWriter(&logBuffer)
+ log.SetOutput(logOutput)
+}
+
+func (s *ServerRequiredSuite) TearDownTest(c *C) {
+ arvadostest.StopKeep(2)
+ log.SetOutput(os.Stdout)
+ log.Printf("%v", logBuffer.String())
+}
+
+func (s *DoMainTestSuite) SetUpSuite(c *C) {
+}
+
+func (s *DoMainTestSuite) SetUpTest(c *C) {
+ logOutput := io.MultiWriter(&logBuffer)
+ log.SetOutput(logOutput)
+}
+
+func (s *DoMainTestSuite) TearDownTest(c *C) {
+ log.SetOutput(os.Stdout)
+ log.Printf("%v", logBuffer.String())
+}
+
+func setupKeepBlockCheck(c *C, enforcePermissions bool, keepServicesJSON string) {
+ setupKeepBlockCheckWithTTL(c, enforcePermissions, keepServicesJSON, blobSignatureTTL)
+}
+
+func setupKeepBlockCheckWithTTL(c *C, enforcePermissions bool, keepServicesJSON string, ttl time.Duration) {
+ var config apiConfig
+ config.APIHost = os.Getenv("ARVADOS_API_HOST")
+ config.APIToken = arvadostest.DataManagerToken
+ config.APIHostInsecure = matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE"))
+
+ // Start Keep servers
+ arvadostest.StartKeep(2, enforcePermissions)
+
+ // setup keepclients
+ var err error
+ kc, ttl, err = setupKeepClient(config, keepServicesJSON, ttl)
+ c.Assert(ttl, Equals, blobSignatureTTL)
+ c.Check(err, IsNil)
+}
+
+// Setup test data
+func setupTestData(c *C) []string {
+ allLocators := []string{}
+
+ // Put a few blocks
+ for i := 0; i < 5; i++ {
+ hash, _, err := kc.PutB([]byte(fmt.Sprintf("keep-block-check-test-data-%d", i)))
+ c.Check(err, IsNil)
+ allLocators = append(allLocators, strings.Split(hash, "+A")[0])
+ }
+
+ return allLocators
+}
+
+func setupConfigFile(c *C, fileName string) string {
+ // Setup a config file
+ file, err := ioutil.TempFile(os.TempDir(), fileName)
+ c.Check(err, IsNil)
+
+ // Add config to the file. While at it, throw in some extra white space
+ fileContent := "ARVADOS_API_HOST=" + os.Getenv("ARVADOS_API_HOST") + "\n"
+ fileContent += "ARVADOS_API_TOKEN=" + arvadostest.DataManagerToken + "\n"
+ fileContent += "\n"
+ fileContent += "ARVADOS_API_HOST_INSECURE=" + os.Getenv("ARVADOS_API_HOST_INSECURE") + "\n"
+ fileContent += " ARVADOS_EXTERNAL_CLIENT = false \n"
+ fileContent += " NotANameValuePairAndShouldGetIgnored \n"
+ fileContent += "ARVADOS_BLOB_SIGNING_KEY=abcdefg\n"
+
+ _, err = file.Write([]byte(fileContent))
+ c.Check(err, IsNil)
+
+ return file.Name()
+}
+
+func setupBlockHashFile(c *C, name string, blocks []string) string {
+ // Setup a block hash file
+ file, err := ioutil.TempFile(os.TempDir(), name)
+ c.Check(err, IsNil)
+
+ // Add the hashes to the file. While at it, throw in some extra white space
+ fileContent := ""
+ for _, hash := range blocks {
+ fileContent += fmt.Sprintf(" %s \n", hash)
+ }
+ fileContent += "\n"
+ _, err = file.Write([]byte(fileContent))
+ c.Check(err, IsNil)
+
+ return file.Name()
+}
+
+func checkErrorLog(c *C, blocks []string, prefix, suffix string) {
+ for _, hash := range blocks {
+ expected := prefix + `.*` + hash + `.*` + suffix
+ match, _ := regexp.MatchString(expected, logBuffer.String())
+ c.Assert(match, Equals, true)
+ }
+}
+
+func checkNoErrorsLogged(c *C, prefix, suffix string) {
+ expected := prefix + `.*` + suffix
+ match, _ := regexp.MatchString(expected, logBuffer.String())
+ c.Assert(match, Equals, false)
+}
+
+func (s *ServerRequiredSuite) TestBlockCheck(c *C) {
+ setupKeepBlockCheck(c, false, "")
+ allLocators := setupTestData(c)
+ err := performKeepBlockCheck(kc, blobSignatureTTL, "", allLocators, true)
+ c.Check(err, IsNil)
+ checkNoErrorsLogged(c, "Error verifying block", "Block not found")
+}
+
+func (s *ServerRequiredSuite) TestBlockCheckWithBlobSigning(c *C) {
+ setupKeepBlockCheck(c, true, "")
+ allLocators := setupTestData(c)
+ err := performKeepBlockCheck(kc, blobSignatureTTL, arvadostest.BlobSigningKey, allLocators, true)
+ c.Check(err, IsNil)
+ checkNoErrorsLogged(c, "Error verifying block", "Block not found")
+}
+
+func (s *ServerRequiredSuite) TestBlockCheckWithBlobSigningAndTTLFromDiscovery(c *C) {
+ setupKeepBlockCheckWithTTL(c, true, "", 0)
+ allLocators := setupTestData(c)
+ err := performKeepBlockCheck(kc, blobSignatureTTL, arvadostest.BlobSigningKey, allLocators, true)
+ c.Check(err, IsNil)
+ checkNoErrorsLogged(c, "Error verifying block", "Block not found")
+}
+
+func (s *ServerRequiredSuite) TestBlockCheck_NoSuchBlock(c *C) {
+ setupKeepBlockCheck(c, false, "")
+ allLocators := setupTestData(c)
+ allLocators = append(allLocators, TestHash)
+ allLocators = append(allLocators, TestHash2)
+ err := performKeepBlockCheck(kc, blobSignatureTTL, "", allLocators, true)
+ c.Check(err, NotNil)
+ c.Assert(err.Error(), Equals, "Block verification failed for 2 out of 7 blocks with matching prefix.")
+ checkErrorLog(c, []string{TestHash, TestHash2}, "Error verifying block", "Block not found")
+}
+
+func (s *ServerRequiredSuite) TestBlockCheck_NoSuchBlock_WithMatchingPrefix(c *C) {
+ setupKeepBlockCheck(c, false, "")
+ allLocators := setupTestData(c)
+ allLocators = append(allLocators, TestHash)
+ allLocators = append(allLocators, TestHash2)
+ locatorFile := setupBlockHashFile(c, "block-hash", allLocators)
+ defer os.Remove(locatorFile)
+ locators, err := getBlockLocators(locatorFile, "aaa")
+ c.Check(err, IsNil)
+ err = performKeepBlockCheck(kc, blobSignatureTTL, "", locators, true)
+ c.Check(err, NotNil)
+ // Of the 7 blocks in allLocators, only two match the prefix and hence only those are checked
+ c.Assert(err.Error(), Equals, "Block verification failed for 2 out of 2 blocks with matching prefix.")
+ checkErrorLog(c, []string{TestHash, TestHash2}, "Error verifying block", "Block not found")
+}
+
+func (s *ServerRequiredSuite) TestBlockCheck_NoSuchBlock_WithPrefixMismatch(c *C) {
+ setupKeepBlockCheck(c, false, "")
+ allLocators := setupTestData(c)
+ allLocators = append(allLocators, TestHash)
+ allLocators = append(allLocators, TestHash2)
+ locatorFile := setupBlockHashFile(c, "block-hash", allLocators)
+ defer os.Remove(locatorFile)
+ locators, err := getBlockLocators(locatorFile, "999")
+ c.Check(err, IsNil)
+ err = performKeepBlockCheck(kc, blobSignatureTTL, "", locators, true)
+ c.Check(err, IsNil) // there were no matching locators in file and hence nothing was checked
+}
+
+func (s *ServerRequiredSuite) TestBlockCheck_BadSignature(c *C) {
+ setupKeepBlockCheck(c, true, "")
+ setupTestData(c)
+ err := performKeepBlockCheck(kc, blobSignatureTTL, "badblobsigningkey", []string{TestHash, TestHash2}, false)
+ c.Assert(err.Error(), Equals, "Block verification failed for 2 out of 2 blocks with matching prefix.")
+ checkErrorLog(c, []string{TestHash, TestHash2}, "Error verifying block", "HTTP 403")
+ // verbose logging not requested
+ c.Assert(strings.Contains(logBuffer.String(), "Verifying block 1 of 2"), Equals, false)
+}
+
+var testKeepServicesJSON = `{
+ "kind":"arvados#keepServiceList",
+ "etag":"",
+ "self_link":"",
+ "offset":null, "limit":null,
+ "items":[
+ {"href":"/keep_services/zzzzz-bi6l4-123456789012340",
+ "kind":"arvados#keepService",
+ "uuid":"zzzzz-bi6l4-123456789012340",
+ "service_host":"keep0.zzzzz.arvadosapi.com",
+ "service_port":25107,
+ "service_ssl_flag":false,
+ "service_type":"disk",
+ "read_only":false },
+ {"href":"/keep_services/zzzzz-bi6l4-123456789012341",
+ "kind":"arvados#keepService",
+ "uuid":"zzzzz-bi6l4-123456789012341",
+ "service_host":"keep0.zzzzz.arvadosapi.com",
+ "service_port":25108,
+ "service_ssl_flag":false,
+ "service_type":"disk",
+ "read_only":false }
+ ],
+ "items_available":2 }`
+
+// Setup block-check using keepServicesJSON with fake keepservers.
+// Expect error during performKeepBlockCheck due to unreachable keepservers.
+func (s *ServerRequiredSuite) TestErrorDuringKeepBlockCheck_FakeKeepservers(c *C) {
+ setupKeepBlockCheck(c, false, testKeepServicesJSON)
+ err := performKeepBlockCheck(kc, blobSignatureTTL, "", []string{TestHash, TestHash2}, true)
+ c.Assert(err.Error(), Equals, "Block verification failed for 2 out of 2 blocks with matching prefix.")
+ checkErrorLog(c, []string{TestHash, TestHash2}, "Error verifying block", "")
+}
+
+// Test keep-block-check initialization with keepServicesJSON
+func (s *ServerRequiredSuite) TestKeepBlockCheck_InitializeWithKeepServicesJSON(c *C) {
+ setupKeepBlockCheck(c, false, testKeepServicesJSON)
+ found := 0
+ for k := range kc.LocalRoots() {
+ if k == "zzzzz-bi6l4-123456789012340" || k == "zzzzz-bi6l4-123456789012341" {
+ found++
+ }
+ }
+ c.Check(found, Equals, 2)
+}
+
+// Test loadConfig func
+func (s *ServerRequiredSuite) TestLoadConfig(c *C) {
+ // Setup config file
+ configFile := setupConfigFile(c, "config")
+ defer os.Remove(configFile)
+
+ // load configuration from the file
+ config, blobSigningKey, err := loadConfig(configFile)
+ c.Check(err, IsNil)
+
+ c.Assert(config.APIHost, Equals, os.Getenv("ARVADOS_API_HOST"))
+ c.Assert(config.APIToken, Equals, arvadostest.DataManagerToken)
+ c.Assert(config.APIHostInsecure, Equals, matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE")))
+ c.Assert(config.ExternalClient, Equals, false)
+ c.Assert(blobSigningKey, Equals, "abcdefg")
+}
+
+func (s *DoMainTestSuite) Test_doMain_WithNoConfig(c *C) {
+ args := []string{"-prefix", "a"}
+ err := doMain(args)
+ c.Check(err, NotNil)
+ c.Assert(strings.Contains(err.Error(), "config file not specified"), Equals, true)
+}
+
+func (s *DoMainTestSuite) Test_doMain_WithNoSuchConfigFile(c *C) {
+ args := []string{"-config", "no-such-file"}
+ err := doMain(args)
+ c.Check(err, NotNil)
+ c.Assert(strings.Contains(err.Error(), "no such file or directory"), Equals, true)
+}
+
+func (s *DoMainTestSuite) Test_doMain_WithNoBlockHashFile(c *C) {
+ config := setupConfigFile(c, "config")
+ defer os.Remove(config)
+
+ // Start keepservers.
+ arvadostest.StartKeep(2, false)
+ defer arvadostest.StopKeep(2)
+
+ args := []string{"-config", config}
+ err := doMain(args)
+ c.Assert(strings.Contains(err.Error(), "block-hash-file not specified"), Equals, true)
+}
+
+func (s *DoMainTestSuite) Test_doMain_WithNoSuchBlockHashFile(c *C) {
+ config := setupConfigFile(c, "config")
+ defer os.Remove(config)
+
+ arvadostest.StartKeep(2, false)
+ defer arvadostest.StopKeep(2)
+
+ args := []string{"-config", config, "-block-hash-file", "no-such-file"}
+ err := doMain(args)
+ c.Assert(strings.Contains(err.Error(), "no such file or directory"), Equals, true)
+}
+
+func (s *DoMainTestSuite) Test_doMain(c *C) {
+ // Start keepservers.
+ arvadostest.StartKeep(2, false)
+ defer arvadostest.StopKeep(2)
+
+ config := setupConfigFile(c, "config")
+ defer os.Remove(config)
+
+ locatorFile := setupBlockHashFile(c, "block-hash", []string{TestHash, TestHash2})
+ defer os.Remove(locatorFile)
+
+ args := []string{"-config", config, "-block-hash-file", locatorFile, "-v"}
+ err := doMain(args)
+ c.Check(err, NotNil)
+ c.Assert(err.Error(), Equals, "Block verification failed for 2 out of 2 blocks with matching prefix.")
+ checkErrorLog(c, []string{TestHash, TestHash2}, "Error verifying block", "Block not found")
+ c.Assert(strings.Contains(logBuffer.String(), "Verifying block 1 of 2"), Equals, true)
+}
"",
"Index prefix")
+ srcBlobSignatureTTLFlag := flags.Duration(
+ "src-blob-signature-ttl",
+ 0,
+ "Lifetime of blob permission signatures on source keepservers. If not provided, this will be retrieved from the API server's discovery document.")
+
// Parse args; omit the first arg which is the command name
flags.Parse(os.Args[1:])
}
// setup src and dst keepclients
- kcSrc, err := setupKeepClient(srcConfig, *srcKeepServicesJSON, false, 0)
+ kcSrc, srcBlobSignatureTTL, err := setupKeepClient(srcConfig, *srcKeepServicesJSON, false, 0, *srcBlobSignatureTTLFlag)
if err != nil {
return fmt.Errorf("Error configuring src keepclient: %s", err.Error())
}
- kcDst, err := setupKeepClient(dstConfig, *dstKeepServicesJSON, true, *replications)
+ kcDst, _, err := setupKeepClient(dstConfig, *dstKeepServicesJSON, true, *replications, 0)
if err != nil {
return fmt.Errorf("Error configuring dst keepclient: %s", err.Error())
}
// Copy blocks not found in dst from src
- err = performKeepRsync(kcSrc, kcDst, srcBlobSigningKey, *prefix)
+ err = performKeepRsync(kcSrc, kcDst, srcBlobSignatureTTL, srcBlobSigningKey, *prefix)
if err != nil {
return fmt.Errorf("Error while syncing data: %s", err.Error())
}
}
// setup keepclient using the config provided
-func setupKeepClient(config apiConfig, keepServicesJSON string, isDst bool, replications int) (kc *keepclient.KeepClient, err error) {
+func setupKeepClient(config apiConfig, keepServicesJSON string, isDst bool, replications int, srcBlobSignatureTTL time.Duration) (kc *keepclient.KeepClient, blobSignatureTTL time.Duration, err error) {
arv := arvadosclient.ArvadosClient{
ApiToken: config.APIToken,
ApiServer: config.APIHost,
if keepServicesJSON == "" {
kc, err = keepclient.MakeKeepClient(&arv)
if err != nil {
- return nil, err
+ return nil, 0, err
}
} else {
kc = keepclient.New(&arv)
err = kc.LoadKeepServicesFromJSON(keepServicesJSON)
if err != nil {
- return kc, err
+ return kc, 0, err
}
}
if err == nil {
replications = int(value.(float64))
} else {
- return nil, err
+ return nil, 0, err
}
}
kc.Want_replicas = replications
}
- return kc, nil
+ // If srcBlobSignatureTTL is not provided, get it from the API server's discovery document
+ blobSignatureTTL = srcBlobSignatureTTL
+ if !isDst && srcBlobSignatureTTL == 0 {
+ value, err := arv.Discovery("blobSignatureTtl")
+ if err == nil {
+ blobSignatureTTL = time.Duration(int(value.(float64))) * time.Second
+ } else {
+ return nil, 0, err
+ }
+ }
+
+ return kc, blobSignatureTTL, nil
}
// Get unique block locators from src and dst
// Copy any blocks missing in dst
-func performKeepRsync(kcSrc, kcDst *keepclient.KeepClient, blobSigningKey, prefix string) error {
+func performKeepRsync(kcSrc, kcDst *keepclient.KeepClient, srcBlobSignatureTTL time.Duration, blobSigningKey, prefix string) error {
// Get unique locators from src
srcIndex, err := getUniqueLocators(kcSrc, prefix)
if err != nil {
log.Printf("Before keep-rsync, there are %d blocks in src and %d blocks in dst. Start copying %d blocks from src not found in dst.",
len(srcIndex), len(dstIndex), len(toBeCopied))
- err = copyBlocksToDst(toBeCopied, kcSrc, kcDst, blobSigningKey)
+ err = copyBlocksToDst(toBeCopied, kcSrc, kcDst, srcBlobSignatureTTL, blobSigningKey)
return err
}
}
// Copy blocks from src to dst; only those that are missing in dst are copied
-func copyBlocksToDst(toBeCopied []string, kcSrc, kcDst *keepclient.KeepClient, blobSigningKey string) error {
+func copyBlocksToDst(toBeCopied []string, kcSrc, kcDst *keepclient.KeepClient, srcBlobSignatureTTL time.Duration, blobSigningKey string) error {
total := len(toBeCopied)
startedAt := time.Now()
getLocator := locator
expiresAt := time.Now().AddDate(0, 0, 1)
if blobSigningKey != "" {
- getLocator = keepclient.SignLocator(getLocator, kcSrc.Arvados.ApiToken, expiresAt, []byte(blobSigningKey))
+ getLocator = keepclient.SignLocator(getLocator, kcSrc.Arvados.ApiToken, expiresAt, srcBlobSignatureTTL, []byte(blobSigningKey))
}
reader, len, _, err := kcSrc.Get(getLocator)
var kcSrc, kcDst *keepclient.KeepClient
var srcKeepServicesJSON, dstKeepServicesJSON, blobSigningKey string
+var blobSignatureTTL = time.Duration(2*7*24) * time.Hour
func (s *ServerRequiredSuite) SetUpTest(c *C) {
// reset all variables between tests
dstConfig.APIHostInsecure = matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE"))
if enforcePermissions {
- blobSigningKey = "zfhgfenhffzltr9dixws36j1yhksjoll2grmku38mi7yxd66h5j4q9w4jzanezacp8s6q0ro3hxakfye02152hncy6zml2ed0uc"
+ blobSigningKey = arvadostest.BlobSigningKey
}
// Start Keep servers
// setup keepclients
var err error
- kcSrc, err = setupKeepClient(srcConfig, srcKeepServicesJSON, false, 0)
+ kcSrc, _, err = setupKeepClient(srcConfig, srcKeepServicesJSON, false, 0, blobSignatureTTL)
c.Check(err, IsNil)
- kcDst, err = setupKeepClient(dstConfig, dstKeepServicesJSON, true, replications)
+ kcDst, _, err = setupKeepClient(dstConfig, dstKeepServicesJSON, true, replications, 0)
c.Check(err, IsNil)
for uuid := range kcSrc.LocalRoots() {
c.Assert(err, Equals, nil)
locator = strings.Split(locator, "+")[0]
- _, _, _, err = kc2.Get(keepclient.SignLocator(locator, kc2.Arvados.ApiToken, time.Now().AddDate(0, 0, 1), []byte(blobSigningKey)))
+ _, _, _, err = kc2.Get(keepclient.SignLocator(locator, kc2.Arvados.ApiToken, time.Now().AddDate(0, 0, 1), blobSignatureTTL, []byte(blobSigningKey)))
c.Assert(err, NotNil)
c.Check(err.Error(), Equals, "Block not found")
}
// setupTestData
setupTestData(c, prefix)
- err := performKeepRsync(kcSrc, kcDst, blobSigningKey, prefix)
+ err := performKeepRsync(kcSrc, kcDst, blobSignatureTTL, blobSigningKey, prefix)
c.Check(err, IsNil)
// Now GetIndex from dst and verify that all 5 from src and the 2 extra blocks are found
setupRsync(c, false, 1)
- err := performKeepRsync(kcSrc, kcDst, "", "")
+ err := performKeepRsync(kcSrc, kcDst, blobSignatureTTL, "", "")
log.Printf("Err = %v", err)
c.Check(strings.Contains(err.Error(), "no such host"), Equals, true)
}
setupRsync(c, false, 1)
- err := performKeepRsync(kcSrc, kcDst, "", "")
+ err := performKeepRsync(kcSrc, kcDst, blobSignatureTTL, "", "")
log.Printf("Err = %v", err)
c.Check(strings.Contains(err.Error(), "no such host"), Equals, true)
}
// Change blob signing key to a fake key, so that Get from src fails
blobSigningKey = "thisisfakeblobsigningkey"
- err := performKeepRsync(kcSrc, kcDst, blobSigningKey, "")
+ err := performKeepRsync(kcSrc, kcDst, blobSignatureTTL, blobSigningKey, "")
c.Check(strings.Contains(err.Error(), "HTTP 403 \"Forbidden\""), Equals, true)
}
// Increase Want_replicas on dst to result in insufficient replicas error during Put
kcDst.Want_replicas = 2
- err := performKeepRsync(kcSrc, kcDst, blobSigningKey, "")
+ err := performKeepRsync(kcSrc, kcDst, blobSignatureTTL, blobSigningKey, "")
c.Check(strings.Contains(err.Error(), "Could not write sufficient replicas"), Equals, true)
}
c.Assert(strings.Contains(err.Error(), "no such file or directory"), Equals, true)
}
+func (s *ServerNotRequiredSuite) TestSetupKeepClient_NoBlobSignatureTTL(c *C) {
+ var srcConfig apiConfig
+ srcConfig.APIHost = os.Getenv("ARVADOS_API_HOST")
+ srcConfig.APIToken = arvadostest.DataManagerToken
+ srcConfig.APIHostInsecure = matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE"))
+ arvadostest.StartKeep(2, false)
+
+ _, ttl, err := setupKeepClient(srcConfig, srcKeepServicesJSON, false, 0, 0)
+ c.Check(err, IsNil)
+ c.Assert(ttl, Equals, blobSignatureTTL)
+}
+
func setupConfigFile(c *C, name string) *os.File {
// Setup a config file
file, err := ioutil.TempFile(os.TempDir(), name)