gem 'RedCloth'
gem 'piwik_analytics'
-gem 'httpclient'
+gem 'httpclient', '~> 2.5.0'
# This fork has Rails 4 compatible routes
gem 'themes_for_rails', git: 'https://github.com/holtkampw/themes_for_rails', ref: '1fd2d7897d75ae0d6375f4c390df87b8e91ad417'
headless (1.0.1)
highline (1.6.20)
hike (1.2.3)
- httpclient (2.3.4.1)
+ httpclient (2.5.0)
i18n (0.6.9)
jquery-rails (3.0.4)
railties (>= 3.0, < 5.0)
coffee-rails
deep_merge
headless
- httpclient
+ httpclient (~> 2.5.0)
jquery-rails
less
less-rails
if ($(elm).attr('data-utc-date-opts') && $(elm).attr('data-utc-date-opts').match(/noseconds/)) {
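+ // Short 12-hour format, e.g. "2:20 PM 10/20/2014" (example value).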
$(elm).text((ts.getHours() > 12 ? (ts.getHours()-12) : ts.getHours())
+ ":" + (ts.getMinutes() < 10 ? '0' : '') + ts.getMinutes()
- + (ts.getHours() > 12 ? " PM " : " AM ")
+ + (ts.getHours() >= 12 ? " PM " : " AM ")
+ ts.toLocaleDateString());
} else {
$(elm).text(ts.toLocaleTimeString() + " " + ts.toLocaleDateString());
$container.append(spinner);
$container.attr('data-infinite-serial', serial);
- // Combine infiniteContentParams from multiple sources. This
- // mechanism allows each of several components to set and
- // update its own set of filters, without having to worry
- // about stomping on some other component's filters.
- //
- // For example, filterable.js writes filters in
- // infiniteContentParamsFilterable ("search for text foo")
- // without worrying about clobbering the filters set up by the
- // tab pane ("only show jobs and pipelines in this tab").
- params = {};
- $.each($container.data(), function(datakey, datavalue) {
- // Note: We attach these data to DOM elements using
- // <element data-foo-bar="baz">. We store/retrieve them
- // using $('element').data('foo-bar'), although
- // .data('fooBar') would also work. The "all data" hash
- // returned by $('element').data(), however, always has
- // keys like 'fooBar'. In other words, where we have a
- // choice, we stick with the 'foo-bar' style to be
- // consistent with HTML. Here, our only option is
- // 'fooBar'.
- if (/^infiniteContentParams/.exec(datakey)) {
- if (datavalue instanceof Object) {
- $.each(datavalue, function(hkey, hvalue) {
- if (hvalue instanceof Array) {
- params[hkey] = (params[hkey] || []).concat(hvalue);
- } else if (hvalue instanceof Object) {
- $.extend(params[hkey], hvalue);
- } else {
- params[hkey] = hvalue;
- }
- });
+ if (src == $container.attr('data-infinite-content-href0')) {
+ // If we're loading the first page, collect filters from
+ // various sources.
+ params = mergeInfiniteContentParams($container);
+ $.each(params, function(k,v) {
+ if (v instanceof Object) {
+ params[k] = JSON.stringify(v);
}
- }
- });
- $.each(params, function(k,v) {
- if (v instanceof Object) {
- params[k] = JSON.stringify(v);
- }
- });
+ });
+ } else {
+ // If we're loading page >1, ignore other filtering
+ // mechanisms and just use the "next page" URI from the
+ // previous page's response. Aside from avoiding race
+ // conditions (where page 2 could have different filters
+ // than page 1), this allows the server to use filters in
+ // the "next page" URI to achieve paging. (To apply any
+ // new filters effectively, we need to load page 1 again
+ // anyway.)
+ params = {};
+ }
$.ajax(src,
{dataType: 'json',
$('.infinite-scroller').add(window).trigger('scroll');
}
+function mergeInfiniteContentParams($container) {
+ var params = {};
+ // Combine infiniteContentParams from multiple sources. This
+ // mechanism allows each of several components to set and
+ // update its own set of filters, without having to worry
+ // about stomping on some other component's filters.
+ //
+ // For example, filterable.js writes filters in
+ // infiniteContentParamsFilterable ("search for text foo")
+ // without worrying about clobbering the filters set up by the
+ // tab pane ("only show jobs and pipelines in this tab").
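+ //
+ // For example (hypothetical attribute values), given
+ //   data-infinite-content-params-filterable='{"filters":[["any","ilike","%foo%"]]}'
+ //   data-infinite-content-params-tab-pane='{"filters":[["uuid","is_a","arvados#job"]]}'
+ // the merged params concatenate the two filter arrays:
+ //   {filters: [["any","ilike","%foo%"], ["uuid","is_a","arvados#job"]]}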
+ $.each($container.data(), function(datakey, datavalue) {
+ // Note: We attach these data to DOM elements using
+ // <element data-foo-bar="baz">. We store/retrieve them
+ // using $('element').data('foo-bar'), although
+ // .data('fooBar') would also work. The "all data" hash
+ // returned by $('element').data(), however, always has
+ // keys like 'fooBar'. In other words, where we have a
+ // choice, we stick with the 'foo-bar' style to be
+ // consistent with HTML. Here, our only option is
+ // 'fooBar'.
+ if (/^infiniteContentParams/.exec(datakey)) {
+ if (datavalue instanceof Object) {
+ $.each(datavalue, function(hkey, hvalue) {
+ if (hvalue instanceof Array) {
+ params[hkey] = (params[hkey] || []).
+ concat(hvalue);
+ } else if (hvalue instanceof Object) {
+ $.extend(params[hkey], hvalue);
+ } else {
+ params[hkey] = hvalue;
+ }
+ });
+ }
+ }
+ });
+ return params;
+}
+
$(document).
on('click', 'div.infinite-retry button', function() {
var $retry_div = $(this).closest('.infinite-retry');
min-width: 1em;
padding: 0px 2px 0px 0px;
}
-
+.task-summary-status {
+ font-size: 80%;
+}
#page-wrapper > div > h2 {
margin-top: 0px;
}
files.each do |m|
mt = chash[m[1]+m[2]].andand.manifest_text
if not m[4].nil? and m[4].size > 1
- combined += arv_normalize mt, '--extract', m[4][1..-1]
+ combined += arv_normalize mt, '--extract', ".#{m[4]}"
else
combined += mt
end
end
end
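+ # If the requested id is a portable data hash rather than a uuid,
+ # skip the default lookup; show() handles hash matches itself.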
+ def find_object_by_uuid
+ if not Keep::Locator.parse params[:id]
+ super
+ end
+ end
+
def show
return super if !@object
if current_user
- jobs_with = lambda do |conds|
- Job.limit(RELATION_LIMIT).where(conds)
- .results.sort_by { |j| j.finished_at || j.created_at }
- end
- @output_of = jobs_with.call(output: @object.portable_data_hash)
- @log_of = jobs_with.call(log: @object.portable_data_hash)
- @project_links = Link.limit(RELATION_LIMIT).order("modified_at DESC")
- .where(head_uuid: @object.uuid, link_class: 'name').results
- project_hash = Group.where(uuid: @project_links.map(&:tail_uuid)).to_hash
- @projects = project_hash.values
-
- if @object.uuid.match /[0-9a-f]{32}/
- @same_pdh = Collection.filter([["portable_data_hash", "=", @object.portable_data_hash]])
- owners = @same_pdh.map {|s| s.owner_uuid}.to_a
+ if Keep::Locator.parse params["uuid"]
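+ # params["uuid"] is really a portable data hash. If exactly one
+ # collection has this hash, redirect to it; otherwise render the
+ # hash_matches view listing all matching collections.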
+ @same_pdh = Collection.filter([["portable_data_hash", "=", @object.portable_data_hash]]).limit(1000)
+ if @same_pdh.results.size == 1
+ redirect_to collection_path(@same_pdh[0]["uuid"])
+ return
+ end
+ owners = @same_pdh.map(&:owner_uuid).to_a.uniq
preload_objects_for_dataclass Group, owners
preload_objects_for_dataclass User, owners
+ render 'hash_matches'
+ return
+ else
+ jobs_with = lambda do |conds|
+ Job.limit(RELATION_LIMIT).where(conds)
+ .results.sort_by { |j| j.finished_at || j.created_at }
+ end
+ @output_of = jobs_with.call(output: @object.portable_data_hash)
+ @log_of = jobs_with.call(log: @object.portable_data_hash)
+ @project_links = Link.limit(RELATION_LIMIT).order("modified_at DESC")
+ .where(head_uuid: @object.uuid, link_class: 'name').results
+ project_hash = Group.where(uuid: @project_links.map(&:tail_uuid)).to_hash
+ @projects = project_hash.values
+
+ @permissions = Link.limit(RELATION_LIMIT).order("modified_at DESC")
+ .where(head_uuid: @object.uuid, link_class: 'permission',
+ name: 'can_read').results
+ @logs = Log.limit(RELATION_LIMIT).order("created_at DESC")
+ .where(object_uuid: @object.uuid).results
+ @is_persistent = Link.limit(1)
+ .where(head_uuid: @object.uuid, tail_uuid: current_user.uuid,
+ link_class: 'resources', name: 'wants')
+ .results.any?
+ @search_sharing = search_scopes
+
+ if params["tab_pane"] == "Provenance_graph"
+ @prov_svg = ProvenanceHelper::create_provenance_graph(@object.provenance, "provenance_svg",
+ {:request => request,
+ :direction => :bottom_up,
+ :combine_jobs => :script_only}) rescue nil
+ end
+ if params["tab_pane"] == "Used_by"
+ @used_by_svg = ProvenanceHelper::create_provenance_graph(@object.used_by, "used_by_svg",
+ {:request => request,
+ :direction => :top_down,
+ :combine_jobs => :script_only,
+ :pdata_only => true}) rescue nil
+ end
end
-
- @permissions = Link.limit(RELATION_LIMIT).order("modified_at DESC")
- .where(head_uuid: @object.uuid, link_class: 'permission',
- name: 'can_read').results
- @logs = Log.limit(RELATION_LIMIT).order("created_at DESC")
- .where(object_uuid: @object.uuid).results
- @is_persistent = Link.limit(1)
- .where(head_uuid: @object.uuid, tail_uuid: current_user.uuid,
- link_class: 'resources', name: 'wants')
- .results.any?
- @search_sharing = search_scopes
- end
-
- if params["tab_pane"] == "Provenance_graph"
- @prov_svg = ProvenanceHelper::create_provenance_graph(@object.provenance, "provenance_svg",
- {:request => request,
- :direction => :bottom_up,
- :combine_jobs => :script_only}) rescue nil
- end
- if params["tab_pane"] == "Used_by"
- @used_by_svg = ProvenanceHelper::create_provenance_graph(@object.used_by, "used_by_svg",
- {:request => request,
- :direction => :top_down,
- :combine_jobs => :script_only,
- :pdata_only => true}) rescue nil
end
super
end
component.delete :job
end
@object.state = 'New'
+
+ # set owner_uuid to that of source, provided it is a project writable by the current user
+ current_project = Group.find(source.owner_uuid) rescue nil
+ if (current_project && current_project.writable_by.andand.include?(current_user.uuid))
+ @object.owner_uuid = source.owner_uuid
+ end
+
super
end
limit: @limit,
include_linked: true,
filters: (@filters - kind_filters + [['uuid', 'is_a', type]]),
- offset: @offset)
+ )
objects.each do |object|
@name_link_for[object.andand.uuid] = objects.links_for(object, 'name').first
end
def render_editable_attribute(object, attr, attrvalue=nil, htmloptions={})
attrvalue = object.send(attr) if attrvalue.nil?
- if !object.attribute_editable?(attr, :ever) or
- (!object.editable? and
- !object.owner_uuid.in?(my_projects.collect(&:uuid)))
+ if not object.attribute_editable?(attr)
if attrvalue && attrvalue.length > 0
return render_attribute_as_textile( object, attr, attrvalue, false )
else
preconfigured_search_str = value_info[:search_for]
end
- if !object or
- !object.attribute_editable?(attr, :ever) or
- (!object.editable? and
- !object.owner_uuid.in?(my_projects.collect(&:uuid)))
+ if not object.andand.attribute_editable?(attr)
return link_to_if_arvados_object attrvalue
end
def render_pipeline_job pj
pj[:progress_bar] = render partial: 'job_progress', locals: {:j => pj[:job]}
pj[:output_link] = link_to_if_arvados_object pj[:output]
- pj[:job_link] = link_to_if_arvados_object pj[:job][:uuid]
+ pj[:job_link] = link_to_if_arvados_object pj[:job][:uuid] if pj[:job]
pj
end
# Merge (started_at, finished_at) time range into the list of time ranges in
- # timestamps (timestamps must be sorted and non-overlapping).
+ # timestamps (timestamps must be sorted and non-overlapping).
# return the updated timestamps list.
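+ # Example (hypothetical values):
+ #   merge_range([[1, 5], [10, 15]], 4, 12)  #=> [[1, 15]]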
def merge_range timestamps, started_at, finished_at
# in the comments below, 'i' is the entry in the timestamps array and 'j'
timestamps << [started_at, finished_at]
end
-
+
# Accept a list of objects with [:started_at] and [:finshed_at] keys and
# merge overlapping ranges to compute the time spent running after periods of
# overlapping execution are factored out.
if round_to_min and seconds >= 30
minutes += 1
- end
+ end
if use_words
s = []
class ApiClientAuthorization < ArvadosBase
- def attribute_editable? attr, *args
- ['expires_at', 'default_owner_uuid'].index attr
+ def editable_attributes
+ %w(expires_at default_owner_uuid)
end
def self.creatable?
false
(current_user.is_admin or
current_user.uuid == self.owner_uuid or
new_record? or
- (writable_by.include? current_user.uuid rescue false))) or false
+ (respond_to?(:writable_by) ?
+ writable_by.include?(current_user.uuid) :
+ (ArvadosBase.find(owner_uuid).writable_by.include? current_user.uuid rescue false)))) or false
+ end
+
+ # Array of strings that are the names of attributes that can be edited
+ # with X-Editable.
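+ # Subclasses override this to restrict the set; Collection, for
+ # example, allows only name, description, and manifest_text.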
+ def editable_attributes
+ self.class.columns.map(&:name) -
+ %w(created_at modified_at modified_by_user_uuid modified_by_client_uuid updated_at)
end
def attribute_editable?(attr, ever=nil)
- if %w(created_at modified_at modified_by_user_uuid modified_by_client_uuid updated_at).include? attr.to_s
+ if not editable_attributes.include?(attr.to_s)
false
elsif not (current_user.andand.is_active)
false
class AuthorizedKey < ArvadosBase
- def attribute_editable? attr, *args
- if attr.to_s == 'authorized_user_uuid'
- current_user and current_user.is_admin
+ def attribute_editable?(attr, ever=nil)
+ if (attr.to_s == 'authorized_user_uuid') and (not ever)
+ current_user.andand.is_admin
else
super
end
dir_to_tree.call('.')
end
- def attribute_editable? attr, *args
- if %w(name description manifest_text).include? attr.to_s
- true
- else
- super
- end
+ def editable_attributes
+ %w(name description manifest_text)
end
def self.creatable?
"#{script} job"
end
- def attribute_editable? attr, *args
- if attr.to_sym == :description
- super && attr.to_sym == :description
- else
- false
- end
+ def editable_attributes
+ %w(description)
end
def self.creatable?
arvados_api_client.api("jobs/", "queue_size", {"_method"=> "GET"})[:queue_size] rescue 0
end
- def self.queue
+ def self.queue
arvados_api_client.unpack_api_response arvados_api_client.api("jobs/", "queue", {"_method"=> "GET"})
end
end
end
- def attribute_editable? attr, *args
- super && (attr.to_sym == :name || attr.to_sym == :description ||
- (attr.to_sym == :components and
- (self.state == 'New' || self.state == 'Ready')))
+ def editable_attributes
+ %w(name description components)
+ end
+
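+ # Attributes are editable only while the pipeline is New or Ready
+ # (or when asked about "ever" editability).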
+ def attribute_editable?(name, ever=nil)
+ (ever or %w(New Ready).include?(state)) and super
end
def attributes_for_display
super.reject { |k,v| %w(owner_uuid default_owner_uuid identity_url prefs).index k }
end
- def attribute_editable? attr, *args
- (not (self.uuid.andand.match(/000000000000000$/) and self.is_admin)) and super
+ def attribute_editable?(attr, ever=nil)
+ (ever or not (self.uuid.andand.match(/000000000000000$/) and
+ self.is_admin)) and super
end
def friendly_link_name lookup=nil
def attributes_for_display
super.append ['current_user_logins', @current_user_logins]
end
- def attribute_editable? attr, *args
- attr != 'current_user_logins' and super
+ def editable_attributes
+ super - %w(current_user_logins)
end
def self.attribute_info
merger = ->(k,a,b) { a.merge(b, &merger) }
-<%
- failed = j[:tasks_summary][:failed] || 0 rescue 0
- done = j[:tasks_summary][:done] || 0 rescue 0
- running = j[:tasks_summary][:running] || 0 rescue 0
- todo = j[:tasks_summary][:todo] || 0 rescue 0
-
- if j[:success] == false and done + running + failed == 0
- # The job failed but no tasks were ever started (i.e. crunch-dispatch
- # was unable to start the job). Display a full 100% failed progress bar.
- failed_percent = 100
- success_percent = 0
- running_percent = 0
- elsif done + running + failed + todo == 0
- # No tasks were ever created for this job;
- # render an empty progress bar.
- failed_percent = 0
- success_percent = 0
- running_percent = 0
- else
- percent_total_tasks = 100.0 / (done + running + failed + todo)
- if defined? scaleby
- percent_total_tasks *= scaleby
- end
- failed_percent = (failed * percent_total_tasks).ceil
- success_percent = (done * percent_total_tasks).ceil
- running_percent = (running * percent_total_tasks).ceil
- end
-%>
-
-<% if not defined? scaleby %>
- <div class="progress">
-<% end %>
+<% if (j.andand[:state] == "Running" or defined? scaleby) and (not defined? show_progress_bar or show_progress_bar) %>
+ <%
+ failed = j[:tasks_summary][:failed] || 0 rescue 0
+ done = j[:tasks_summary][:done] || 0 rescue 0
+ running = j[:tasks_summary][:running] || 0 rescue 0
+ todo = j[:tasks_summary][:todo] || 0 rescue 0
+
+ if done + running + failed + todo == 0
+ # No tasks were ever created for this job;
+ # render an empty progress bar.
+ done_percent = 0
+ else
+ percent_total_tasks = 100.0 / (done + running + failed + todo)
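+ # When this bar is drawn as one segment of a combined pipeline bar,
+ # scaleby shrinks it to its share (e.g. 1.0/p.components.size).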
+ if defined? scaleby
+ percent_total_tasks *= scaleby
+ end
+ done_percent = (done+failed) * percent_total_tasks
+ end
+ %>
+
+ <% if not defined? scaleby %>
+ <div class="progress" style="margin-bottom: 0px">
+ <% end %>
+
+ <span class="progress-bar <%= if failed == 0 then 'progress-bar-success' else 'progress-bar-warning' end %>" style="width: <%= done_percent %>%;">
+ </span>
+
+ <% if not defined? scaleby %>
+ </div>
+ <% end %>
+
+<% else %>
+
+<% to_label = {
+ "Cancelled" => "danger",
+ "Complete" => "success",
+ "Running" => "info",
+ "Failed" => "danger",
+ "Queued" => "default",
+ nil => "default"
+ } %>
-<span class="progress-bar progress-bar-success" style="width: <%= success_percent %>%;">
-</span>
-<span class="progress-bar progress-bar-danger" style="width: <%= failed_percent %>%;">
-</span>
-<span class="progress-bar" style="width: <%= running_percent %>%;">
-</span>
+ <span class="label label-<%= to_label[j.andand[:state]] %>">
+ <%= if defined? title
+ title
+ else
+ if j.andand[:state] then j[:state].downcase else "Not ready" end
+ end
+ %></span>
-<% if not defined? scaleby %>
-</div>
<% end %>
+++ /dev/null
-<% to_label = {
- "Cancelled" => "danger",
- "Complete" => "success",
- "Running" => "info",
- "Failed" => "danger",
- "Queued" => "default",
- nil => "default"
- } %>
-
- <span class="label label-<%= to_label[j[:state]] %>"><%= if defined? title then title else j[:state].downcase end %></span>
<% if p.state == 'Complete' %>
- <span class="label label-success">finished</span>
+ <span class="label label-success">complete</span>
<% elsif p.state == 'Failed' %>
<span class="label label-danger">failed</span>
<% elsif p.state == 'RunningOnServer' || p.state == 'RunningOnClient' %>
<i class="fa fa-fw fa-copy"></i> Copy to project...
<% end %>
<% end %>
- <% if @object.owner_uuid == current_user.uuid or (Group.find(@object.owner_uuid).writable_by.include?(current_user.uuid) rescue nil) %>
+ <% if (ArvadosBase.find(@object.owner_uuid).writable_by.include?(current_user.uuid) rescue nil) %>
<%= link_to(
choose_projects_path(
title: "Move this #{object_class} to:",
--- /dev/null
+<div class="row">
+ <div class="col-md-10 col-md-offset-1">
+ <div class="panel panel-info">
+ <div class="panel-heading">
+ <h3 class="panel-title"><%= params["uuid"] %></h3>
+ </div>
+ <div class="panel-body">
+ <p><i>The following collections have this content:</i></p>
+ <% @same_pdh.sort { |a,b| b.created_at <=> a.created_at }.each do |c| %>
+ <div class="row">
+ <div class="col-md-8">
+ <% owner = object_for_dataclass(Group, c.owner_uuid) || object_for_dataclass(User, c.owner_uuid) %>
+ <%= link_to_if_arvados_object owner, {:friendly_name => true} %> / <%= link_to_if_arvados_object c, {:friendly_name => true} %><br>
+ </div>
+ <div class="col-md-4">
+ <%= render_localized_date c.created_at %>
+ </div>
+ </div>
+ <% end %>
+ </div>
+ </div>
+ </div>
+</div>
<%= render_editable_attribute @object, 'description', nil, { 'data-emptytext' => "(No description provided)", 'data-toggle' => 'manual' } %>
</div>
<img src="/favicon.ico" class="pull-right" alt="" style="opacity: 0.3"/>
- <% if defined? @same_pdh %>
- <p>Found in collections:<p>
- <p>
- <% @same_pdh.each do |c| %>
- <%= link_to_if_arvados_object get_object(c.owner_uuid), {:friendly_name => true} %> / <%= link_to_if_arvados_object c, {:friendly_name => true} %><br>
- <% end %>
- </p>
- <% else %>
- <p><i>Content hash:</i><br />
- <span class="arvados-uuid"><%= link_to @object.portable_data_hash, collection_path(@object.portable_data_hash) %></span></p>
- <% end %>
+ <p><i>Content hash:</i><br />
+ <span class="arvados-uuid"><%= link_to @object.portable_data_hash, collection_path(@object.portable_data_hash) %></span>
+ </p>
<%= render partial: "show_source_summary" %>
</div>
</div>
<th>
</th><th>
status
- </th><th>
- progress
</th><th>
uuid
</th><th>
<td>
<i class="icon-plus-sign expand-collapse-row" data-id="<%= j.uuid %>" style="cursor: pointer"></i>
</td>
- <td>
- <%= render partial: 'job_status_label', locals: {:j => j} %>
- </td>
<td>
<div class="inline-progress-container">
<%= render partial: 'job_progress', locals: {:j => j} %>
</tbody>
</table>
-
- <% current_job = pj[:job] if pj[:job] != {} and pj[:job][:uuid] %>
- <div class="panel panel-default">
- <div class="panel-heading">
- <div class="container-fluid">
- <div class="row">
+<% current_job = pj[:job] if pj[:job] != {} and pj[:job][:uuid] %>
+<div class="panel panel-default">
+ <div class="panel-heading">
+ <div class="container-fluid">
+ <div class="row-fluid">
+ <%# column offset 0 %>
+ <div class="col-md-3">
+ <h4 class="panel-title">
+ <a data-toggle="collapse" href="#collapse<%= i %>" style="white-space: nowrap;">
+ <%= pj[:name] %> <span class="caret"></span>
+ </a>
+ </h4>
+ </div>
+
+ <%# column offset 3 %>
+ <div class="col-md-2 pipeline-instance-spacing">
+ <%= pj[:progress_bar] %>
+ </div>
+
+ <% if current_job %>
+ <%# column offset 5 %>
+ <% if current_job[:state] != "Queued" %>
<div class="col-md-3">
- <h4 class="panel-title">
- <a data-toggle="collapse" href="#collapse<%= i %>" style="white-space: nowrap;">
- <%= pj[:name] %> <span class="caret"></span>
- </a>
- </h4>
+ <% if current_job[:started_at] %>
+ <% walltime = ((if current_job[:finished_at] then current_job[:finished_at] else Time.now() end) - current_job[:started_at]) %>
+ <% cputime = tasks.map { |task|
+ if task.started_at and task.job_uuid == current_job[:uuid]
+ (if task.finished_at then task.finished_at else Time.now() end) - task.started_at
+ else
+ 0
+ end
+ }.reduce(:+) || 0 %>
+ <%= render_runtime(walltime, false, false) %>
+ <% if cputime > 0 %> / <%= render_runtime(cputime, false, false) %> (<%= (cputime/walltime).round(1) %>⨯)<% end %>
+ <% end %>
</div>
+ <% end %>
- <% if current_job %>
- <div class="col-md-1">
- <%= render(partial: 'job_status_label', locals: { j: current_job }) %>
+ <% if current_job[:state] == "Queued" %>
+ <%# column offset 5 %>
+ <div class="col-md-6">
+ <% queuetime = Time.now - current_job[:created_at] %>
+ Queued for <%= render_runtime(queuetime, true) %>.
+ <% begin %>
+ <% if current_job[:queue_position] == 0 %>
+ This job is next in the queue to run.
+ <% elsif current_job[:queue_position] == 1 %>
+ There is 1 job in the queue ahead of this one.
+ <% else %>
+ There are <%= current_job[:queue_position] %> jobs in the queue ahead of this one.
+ <% end %>
+ <% rescue %>
+ <% end %>
</div>
-
+ <% elsif current_job[:state] == "Running" %>
+ <%# column offset 8 %>
<div class="col-md-3">
- <% if current_job[:started_at] %>
- <% walltime = ((if current_job[:finished_at] then current_job[:finished_at] else Time.now() end) - current_job[:started_at]) %>
- <% cputime = tasks.map { |task|
- if task.started_at and task.job_uuid == current_job[:uuid]
- (if task.finished_at then task.finished_at else Time.now() end) - task.started_at
- else
- 0
- end
- }.reduce(:+) || 0 %>
- <%= render_runtime(walltime, false, false) %>
- <% if cputime > 0 %> / <%= render_runtime(cputime, false, false) %> (<%= (cputime/walltime).round(1) %>⨯)<% end %>
+ <span class="task-summary-status">
+ <%= current_job[:tasks_summary][:done] %> <%= "task".pluralize(current_job[:tasks_summary][:done]) %> done,
+ <%= current_job[:tasks_summary][:failed] %> failed,
+ <%= current_job[:tasks_summary][:running] %> running,
+ <%= current_job[:tasks_summary][:todo] %> pending
+ </span>
+ </div>
+ <% elsif current_job[:state].in? ["Complete", "Failed", "Cancelled"] %>
+ <%# column offset 8 %>
+ <div class="col-md-4 text-overflow-ellipsis">
+ <% if pj[:output_uuid] %>
+ <%= link_to_if_arvados_object pj[:output_uuid], friendly_name: true %>
+ <% elsif current_job[:output] %>
+ <%= link_to_if_arvados_object current_job[:output], link_text: "Output of #{pj[:name]}" %>
+ <% else %>
+ No output.
<% end %>
</div>
+ <% end %>
- <% if current_job[:state].in? ["Complete", "Failed", "Cancelled"] %>
- <div class="col-md-5 text-overflow-ellipsis">
- <% if pj[:output_uuid] %>
- <%= link_to_if_arvados_object pj[:output_uuid], friendly_name: true %>
- <% elsif current_job[:output] %>
- <%= link_to_if_arvados_object current_job[:output], link_text: "Output of #{pj[:name]}" %>
- <% else %>
- No output.
- <% end %>
- </div>
- <% elsif current_job[:state] == "Running" %>
- <div class="col-md-3 pipeline-instance-spacing">
- <%= pj[:progress_bar] %>
- </div>
- <div class="col-md-1 pipeline-instance-spacing">
- <%= form_tag "/jobs/#{current_job[:uuid]}/cancel", style: "display:inline; padding-left: 1em" do |f| %>
+ <% if current_job[:state].in? ["Queued", "Running"] %>
+ <%# column offset 11 %>
+ <div class="col-md-1 pipeline-instance-spacing">
+ <%= form_tag "/jobs/#{current_job[:uuid]}/cancel", style: "display:inline; padding-left: 1em" do |f| %>
<%= hidden_field_tag :return_to, url_for(@object) %>
<%= button_tag "Cancel", {class: 'btn btn-xs btn-danger', id: "cancel-job-button"} %>
- </div>
- <% end %>
- <% elsif current_job[:state] == "Queued" %>
- <div class="col-md-5">
- <% queuetime = Time.now - current_job[:created_at] %>
- Queued for <%= render_runtime(queuetime, true) %>.
- <% begin %>
- <% if current_job[:queue_position] == 0 %>
- This job is next in the queue to run.
- <% elsif current_job[:queue_position] == 1 %>
- There is 1 job in the queue ahead of this one.
- <% else %>
- There are <%= current_job[:queue_position] %> jobs in the queue ahead of this one.
<% end %>
- <% rescue %>
- <% end %>
</div>
<% end %>
- <% else %>
- <div class="col-md-3 col-md-offset-3">
- <span class="label label-default">Not ready</span>
- </div>
-<% end %>
-</div>
-</div>
-</div>
+ <% end %>
+ </div>
+ </div>
+ </div>
-<div id="collapse<%= i %>" class="panel-collapse collapse <%= if expanded then 'in' end %>">
- <div class="panel-body">
- <div class="container">
+ <div id="collapse<%= i %>" class="panel-collapse collapse <%= if expanded then 'in' end %>">
+ <div class="panel-body">
+ <div class="container">
<% current_component = (if current_job then current_job else pj end) %>
<div class="row">
<div class="col-md-6">
<p>script_parameters:</p>
<pre><%= JSON.pretty_generate(current_component[:script_parameters]) rescue nil %></pre>
</div>
- <% if current_component[:tasks_summary] %>
- <div class="col-md-3">
- <table>
- <% [:done, :running, :failed, :todo].each do |d| %>
- <tr>
- <td style="padding-right: 1em"><%= 'tasks:' if d == :done %></td>
- <td style="padding-right: 1em"><%= d.to_s %></td>
- <td><%= current_component[:tasks_summary][d] %></td>
- </tr>
- <% end %>
- </table>
- </div>
- <% end %>
</div>
+ </div>
</div>
</div>
</div>
-</div>
</p>
<% end %>
-<% tasks = JobTask.filter([['job_uuid', 'in', render_pipeline_jobs.map { |j| j[:job].andand[:uuid] }]]).results %>
-<% runningtime = determine_wallclock_runtime(render_pipeline_jobs.map {|j| j[:job]}) %>
+<% tasks = JobTask.filter([['job_uuid', 'in', render_pipeline_jobs.map { |j| j[:job].andand[:uuid] }.compact]]).results %>
+<% runningtime = determine_wallclock_runtime(render_pipeline_jobs.map {|j| j[:job]}.compact) %>
<p>
<% if @object.started_at %>
This pipeline started at <%= render_localized_date(@object.started_at) %>.
- It
+ It
<% if @object.state == 'Complete' %>
completed in
<% elsif @object.state == 'Failed' %>
else
Time.now - @object.started_at
end %>
-
- <%= if walltime > runningtime
- render_runtime(walltime, true, false)
- else
- render_runtime(runningtime, true, false)
+
+ <%= if walltime > runningtime
+ render_runtime(walltime, true, false)
+ else
+ render_runtime(runningtime, true, false)
end %><% if @object.finished_at %> at <%= render_localized_date(@object.finished_at) %><% end %>.
<% else %>
This pipeline is <%= if @object.state.start_with? 'Running' then 'active' else @object.state.downcase end %>.
<td style="border-top: 0; opacity: 0.5;" colspan="6">
<% ob.components.each do |cname, c| %>
<% if c.is_a?(Hash) and c[:job] %>
- <%= render partial: "job_status_label", locals: {:j => c[:job], :title => cname.to_s } %>
+ <%= render partial: "job_progress", locals: {:j => c[:job], :title => cname.to_s, :show_progress_bar => false } %>
<% else %>
<span class="label label-default"><%= cname.to_s %></span>
<% end %>
<div class="pull-right" style="width: 40%">
<div class="progress" style="margin-bottom: 0px">
- <% running = [] %>
- <% failed = [] %>
- <% completed = [] %>
- <% queued = [] %>
<% p.components.each do |k, v| %>
<% if v.is_a? Hash and v[:job] %>
- <% if v[:job][:state] == "Running" %>
- <% running << k %>
- <% elsif v[:job][:state] == "Failed" or v[:job][:state] == "Cancelled" %>
- <% failed << k %>
- <% elsif v[:job][:state] == "Complete" %>
- <% completed << k %>
- <% elsif v[:job][:state] == "Queued" %>
- <% queued << k %>
- <% end %>
+ <%= render partial: 'job_progress', locals: {:j => v[:job], :scaleby => (1.0/p.components.size)} %>
<% end %>
<% end %>
- <% completed_pct = (completed.size * 100) / p.components.size %>
- <% failed_pct = (failed.size * 100) / p.components.size %>
- <% running_pct = (running.size * 100) / p.components.size %>
- <% queued_pct = (queued.size * 100) / p.components.size %>
-
- <div class="progress-bar progress-bar-success" style="width: <%= completed_pct %>%">
- <span class="sr-only"></span>
- </div>
- <div class="progress-bar progress-bar-danger" style="width: <%= failed_pct %>%">
- <span class="sr-only"></span>
- </div>
- <div class="progress-bar progress-bar-primary" style="width: <%= running_pct %>%">
- <span class="sr-only"></span>
- </div>
- <div class="progress-bar progress-bar-default" style="width: <%= queued_pct %>%">
- <span class="sr-only"></span>
- </div>
</div>
</div>
</div>
+ <%
+ running = p.components.select { |k, c| c.is_a? Hash and c[:job].andand[:state] == "Running" }
+ queued = p.components.select { |k, c| c.is_a? Hash and c[:job].andand[:state] == "Queued" }
+ %>
+
<div class="clearfix">
Started at <%= render_localized_date(p[:started_at] || p[:created_at], "noseconds") %>.
<% pipeline_time = Time.now - (p[:started_at] || p[:created_at]) %>
Active for <%= render_runtime(pipeline_time, false) %>.
<div class="pull-right">
- <% running.each do |k| %>
- <span class="label label-primary"><%= k %></span>
+ <% running.each do |k,v| %>
+ <%= render partial: 'job_progress', locals: {:j => v[:job], :show_progress_bar => false, :title => k} %>
<% end %>
- <% queued.each do |k| %>
- <span class="label label-default"><%= k %></span>
+ <% queued.each do |k,v| %>
+ <%= render partial: 'job_progress', locals: {:j => v[:job], :show_progress_bar => false, :title => k} %>
<% end %>
</div>
</div>
<table id="project_sharing" class="topalign table" style="clear: both; margin-top: 1em;">
<tr>
<th>User/Group Name</th>
+ <th>Email Address</th>
<th colspan="2">Project Access</th>
</tr>
link_name = shared_with.full_name
else
link_name = shared_with.name
- end %>
+ end
+ if shared_with && shared_with.respond_to?(:email)
+ email = shared_with.email
+ end
+ %>
<tr data-object-uuid="<%= link.uuid %>">
<td>
<i class="fa fa-fw <%= fa_icon_class_for_uuid(link.tail_uuid) %>"></i>
<%= link_to_if_arvados_object(link.tail_uuid, link_text: link_name) %>
</td>
+ <td>
+ <%= email %>
+ </td>
<td><%= link_to perm_name_desc_map[link.name], '#', {
"data-emptytext" => "Read",
"data-placement" => "bottom",
<th>Log</th>
<th>Created at</th>
<th>Status</th>
- <th>Progress</th>
</tr>
<%# Preload collections, logs, and pipeline instance objects %>
</small>
</td>
-<td>
- <%= render partial: 'job_status_label', locals: {:j => j} %>
-</td>
<td>
<div class="inline-progress-container">
<%= render partial: 'job_progress', locals: {:j => j} %>
fakefiledata.expects(:read).twice.with() do |length|
# Fail the test if read() is called with length>1MiB:
length < 2**20
+ ## Force the ActionController::Live thread to lose the race to
+ ## verify that @response.body.length actually waits for the
+ ## response (see below):
+ # sleep 3
end.returns("foo\n", nil)
fakefiledata.expects(:close)
foo_file = api_fixture('collections')['foo_file']
uuid: foo_file['uuid'],
file: foo_file['manifest_text'].match(/ \d+:\d+:(\S+)/)[1]
}, session_for(:active)
+ # Wait for the whole response to arrive before deciding whether
+ # mocks' expectations were met. Otherwise, Mocha will fail the
+ # test depending on how slowly the ActionController::Live thread
+ # runs.
+ @response.body.length
end
end
refute user_can_manage(:project_viewer, "asubproject")
end
+ test "subproject_admin can_manage asubproject" do
+ assert user_can_manage(:subproject_admin, "asubproject")
+ end
+
+ test "project admin can remove items from the project" do
+ coll_key = "collection_to_remove_from_subproject"
+ coll_uuid = api_fixture("collections")[coll_key]["uuid"]
+ delete(:remove_item,
+ { id: api_fixture("groups")["asubproject"]["uuid"],
+ item_uuid: coll_uuid,
+ format: "js" },
+ session_for(:subproject_admin))
+ assert_response :success
+ assert_match(/\b#{coll_uuid}\b/, @response.body,
+ "removed object not named in response")
+ end
+
test 'projects#show tab infinite scroll partial obeys limit' do
get_contents_rows(limit: 1, filters: [['uuid','is_a',['arvados#job']]])
assert_response :success
headless.stop
end
end
+
+ test "combine selected collection files from collection subdirectory" do
+ headless = Headless.new
+ headless.start
+ Capybara.current_driver = :selenium
+
+ visit page_with_token('user1_with_load', "/collections/zzzzz-4zz18-filesinsubdir00")
+
+ # now in collection page
+ input_files = page.all('input[type=checkbox]')
+ (0..input_files.count-1).each do |i|
+ input_files[i].click
+ end
+
+ click_button 'Selection...'
+ within('.selection-action-container') do
+ click_link 'Create new collection with selected files'
+ end
+
+ # now in the newly created collection page
+ assert(page.has_text?('file_in_subdir1'), 'file not found - file_in_subdir1')
+ assert(page.has_text?('file1_in_subdir3.txt'), 'file not found - file1_in_subdir3.txt')
+ assert(page.has_text?('file2_in_subdir3.txt'), 'file not found - file2_in_subdir3.txt')
+ assert(page.has_text?('file1_in_subdir4.txt'), 'file not found - file1_in_subdir4.txt')
+ assert(page.has_text?('file2_in_subdir4.txt'), 'file not found - file2_in_subdir4.txt')
+
+ headless.stop
+ end
+
+ test "Collection portable data hash redirect" do
+ di = api_fixture('collections')['docker_image']
+ visit page_with_token('active', "/collections/#{di['portable_data_hash']}")
+
+ # check redirection
+ assert current_path.end_with?("/collections/#{di['uuid']}")
+ assert page.has_text?("docker_image")
+ assert page.has_text?("Activity")
+ assert page.has_text?("Sharing and permissions")
+ end
+
+ test "Collection portable data hash with multiple matches" do
+ pdh = api_fixture('collections')['baz_file']['portable_data_hash']
+ visit page_with_token('admin', "/collections/#{pdh}")
+
+ matches = api_fixture('collections').select {|k,v| v["portable_data_hash"] == pdh}
+ assert matches.size > 1
+
+ matches.each do |k,v|
+ assert page.has_link?(v["name"]), "Page /collections/#{pdh} should contain link '#{v['name']}'"
+ end
+ assert page.has_no_text?("Activity")
+ assert page.has_no_text?("Sharing and permissions")
+ end
end
assert page.has_text? 'Paused'
page.assert_no_selector 'a.disabled,button.disabled', text: 'Resume'
page.assert_selector 'a,button', text: 'Re-run with latest'
+ page.assert_selector 'a,button', text: 'Re-run options'
# Since it is test env, no jobs are created to run. So, graph not visible
assert_not page.has_text? 'Graph'
wait_for_ajax
end
- # create a pipeline instance
- find('.btn', text: 'Run a pipeline').click
- within('.modal-dialog') do
- find('.selectable', text: 'Two Part Pipeline Template').click
- find('.btn', text: 'Next: choose inputs').click
- end
-
- assert find('p', text: 'Provide a value')
-
- find('div.form-group', text: 'Foo/bar pair').
- find('.btn', text: 'Choose').
- click
-
- within('.modal-dialog') do
- assert_selector 'button.dropdown-toggle', text: 'A Project'
- wait_for_ajax
- first('span', text: 'foo_tag').click
- find('button', text: 'OK').click
- end
- wait_for_ajax
-
- # "Run" button present and enabled
- page.assert_no_selector 'a.disabled,button.disabled', text: 'Run'
- first('a,button', text: 'Run').click
-
- # Pipeline is running. We have a "Pause" button instead now.
- page.assert_no_selector 'a,button', text: 'Run'
- page.assert_selector 'a,button', text: 'Pause'
-
- # Since it is test env, no jobs are created to run. So, graph not visible
- assert_not page.has_text? 'Graph'
+ create_and_run_pipeline_in_aproject true
end
# Create a pipeline instance from within a project and run
test 'Run a pipeline from dashboard' do
visit page_with_token('active_trustedclient')
-
- # create a pipeline instance
- find('.btn', text: 'Run a pipeline').click
- within('.modal-dialog') do
- find('.selectable', text: 'Two Part Pipeline Template').click
- find('.btn', text: 'Next: choose inputs').click
- end
-
- assert find('p', text: 'Provide a value')
-
- find('div.form-group', text: 'Foo/bar pair').
- find('.btn', text: 'Choose').
- click
-
- within('.modal-dialog') do
- assert_selector 'button.dropdown-toggle', text: 'Home'
- wait_for_ajax
- click_button "Home"
- click_link "A Project"
- wait_for_ajax
- first('span', text: 'foo_tag').click
- find('button', text: 'OK').click
- end
- wait_for_ajax
-
- # "Run" button present and enabled
- page.assert_no_selector 'a.disabled,button.disabled', text: 'Run'
- first('a,button', text: 'Run').click
-
- # Pipeline is running. We have a "Pause" button instead now.
- page.assert_no_selector 'a,button', text: 'Run'
- page.assert_selector 'a,button', text: 'Pause'
-
- # Since it is test env, no jobs are created to run. So, graph not visible
- assert_not page.has_text? 'Graph'
+ create_and_run_pipeline_in_aproject false
end
-
test 'view pipeline with job and see graph' do
visit page_with_token('active_trustedclient')
test "Workbench preserves search_for parameter after project switch" do
check_parameter_search("A Project")
end
+
+ [
+ ['active', false, false, false],
+ ['active', false, false, true],
+ ['active', true, false, false],
+ ['active', true, true, false],
+ ['active', true, false, true],
+ ['active', true, true, true],
+ ['project_viewer', false, false, true],
+ ['project_viewer', true, false, true],
+ ['project_viewer', true, true, true],
+ ].each do |user, with_options, choose_options, in_aproject|
+ test "Rerun pipeline instance as #{user} using options #{with_options} #{choose_options} in #{in_aproject}" do
+ visit page_with_token('active')
+
+ if in_aproject
+ find("#projects-menu").click
+ find('.dropdown-menu a,button', text: 'A Project').click
+ end
+
+ create_and_run_pipeline_in_aproject in_aproject
+ instance_path = current_path
+
+ # Pause the pipeline
+ find('a,button', text: 'Pause').click
+ assert page.has_text? 'Paused'
+ page.assert_no_selector 'a.disabled,button.disabled', text: 'Resume'
+ page.assert_selector 'a,button', text: 'Re-run with latest'
+ page.assert_selector 'a,button', text: 'Re-run options'
+
+ # Pipeline can be re-run now. Access it as the specified user, and re-run
+ if user == 'project_viewer'
+ visit page_with_token(user, instance_path)
+ assert page.has_text? 'A Project'
+ page.assert_no_selector 'a.disabled,button.disabled', text: 'Resume'
+ page.assert_selector 'a,button', text: 'Re-run with latest'
+ page.assert_selector 'a,button', text: 'Re-run options'
+ end
+
+ # Now re-run the pipeline
+ if with_options
+ find('a,button', text: 'Re-run options').click
+ within('.modal-dialog') do
+ page.assert_selector 'a,button', text: 'Copy and edit inputs'
+ page.assert_selector 'a,button', text: 'Run now'
+ if choose_options
+ find('button', text: 'Copy and edit inputs').click
+ else
+ find('button', text: 'Run now').click
+ end
+ end
+ else
+ find('a,button', text: 'Re-run with latest').click
+ end
+
+ # Verify that the newly created instance is created in the right project.
+ # For the project_viewer user, who cannot write to the project, the
+ # pipeline should have been created in the user's Home project instead.
+ rerun_instance_path = current_path
+ assert_not_equal instance_path, rerun_instance_path, 'Rerun instance path expected to be different'
+ assert page.has_text? 'Home'
+ if in_aproject && (user != 'project_viewer')
+ assert page.has_text? 'A Project'
+ else
+ assert page.has_no_text? 'A Project'
+ end
+ end
+ end
+
+ # Create and run a pipeline from 'Two Part Pipeline Template', in 'A Project' or in Home depending on in_aproject
+ def create_and_run_pipeline_in_aproject in_aproject
+ # create a pipeline instance
+ find('.btn', text: 'Run a pipeline').click
+ within('.modal-dialog') do
+ find('.selectable', text: 'Two Part Pipeline Template').click
+ find('.btn', text: 'Next: choose inputs').click
+ end
+
+ assert find('p', text: 'Provide a value')
+
+ find('div.form-group', text: 'Foo/bar pair').
+ find('.btn', text: 'Choose').
+ click
+
+ within('.modal-dialog') do
+ if in_aproject
+ assert_selector 'button.dropdown-toggle', text: 'A Project'
+ wait_for_ajax
+ else
+ assert_selector 'button.dropdown-toggle', text: 'Home'
+ wait_for_ajax
+ click_button "Home"
+ click_link "A Project"
+ wait_for_ajax
+ end
+ first('span', text: 'foo_tag').click
+ find('button', text: 'OK').click
+ end
+ wait_for_ajax
+
+ # "Run" button present and enabled
+ page.assert_no_selector 'a.disabled,button.disabled', text: 'Run'
+ first('a,button', text: 'Run').click
+
+ # Pipeline is running. We have a "Pause" button instead now.
+ page.assert_no_selector 'a,button', text: 'Run'
+ page.assert_no_selector 'a.disabled,button.disabled', text: 'Resume'
+ page.assert_selector 'a,button', text: 'Pause'
+
+ # Since it is test env, no jobs are created to run. So, graph not visible
+ assert_not page.has_text? 'Graph'
+ end
+
+ [
+ [0, 0], # run time 0 minutes
+ [9, 17*60*60 + 51*60], # run time 17 hours and 51 minutes
+ ].each do |index, run_time|
+ test "pipeline start and finish time display #{index}" do
+ visit page_with_token("user1_with_load", "/pipeline_instances/zzzzz-d1hrv-10pipelines0#{index.to_s.rjust(3, '0')}")
+
+ assert page.has_text? 'This pipeline started at'
+ page_text = page.text
+
+ match = /This pipeline started at (.*)\. It failed after (.*) seconds at (.*)\. Check the Log/.match page_text
+ assert_not_nil(match, 'Did not find text - This pipeline started at . . . ')
+
+ start_at = match[1]
+ finished_at = match[3]
+ assert_not_nil(start_at, 'Did not find start_at time')
+ assert_not_nil(finished_at, 'Did not find finished_at time')
+
+ # start and finished time display is of the format '2:20 PM 10/20/2014'
+ start_time = DateTime.strptime(start_at, '%H:%M %p %m/%d/%Y').to_time
+ finished_time = DateTime.strptime(finished_at, '%H:%M %p %m/%d/%Y').to_time
+ assert_equal(run_time, finished_time-start_time,
+ "Time difference did not match for start_at #{start_at}, finished_at #{finished_at}, ran_for #{match[2]}")
+ end
+ end
end
find('#project_sharing').all('tr')
end
- def add_share_and_check(share_type, name)
+ def add_share_and_check(share_type, name, obj=nil)
assert(page.has_no_text?(name), "project is already shared with #{name}")
start_share_count = share_rows.size
click_on("Share with #{share_type}")
find(".selectable", text: name).click
assert(has_no_selector?(".modal-dialog-preview-pane"),
"preview pane available in sharing dialog")
+ if share_type == 'users' and obj and obj['email']
+ assert(page.has_text?(obj['email']), "Did not find user's email")
+ end
assert_raises(Capybara::ElementNotFound,
"Projects pulldown available from sharing dialog") do
click_on "All projects"
show_project_using("active")
click_on "Sharing"
- add_share_and_check("users", new_name)
+ add_share_and_check("users", new_name, add_user)
modify_share_and_check(new_name)
end
end
end
+ [
+ ['project with 10 collections', 10],
+ ['project with 201 collections', 201], # two pages of data
+ ].each do |project_name, amount|
+ test "scroll collections tab for #{project_name} with #{amount} objects" do
+ headless = Headless.new
+ headless.start
+ Capybara.current_driver = :selenium
+
+ visit page_with_token 'user1_with_load'
+
+ find("#projects-menu").click
+ find(".dropdown-menu a", text: project_name).click
+
+ my_collections = []
+ for i in 1..amount
+ my_collections << "Collection_#{i}"
+ end
+
+ # verify Data collections scroll
+ assert(page.has_text?("Data collections (#{amount})"), "Number of collections did not match the input amount")
+
+ click_link 'Data collections'
+ begin
+ wait_for_ajax
+ rescue
+ end
+
+ verify_collections = my_collections.dup
+ unexpected_items = []
+ collections_count = 0
+ within('.arv-project-Data_collections') do
+ page.execute_script "window.scrollBy(0,999000)"
+ begin
+ wait_for_ajax
+ rescue
+ end
+
+ # Visit all rows. If not all expected collections are found, retry
+ found_collections = page.all('tr[data-kind="arvados#collection"]')
+ collections_count = found_collections.count
+
+ (0..collections_count-1).each do |i|
+ # Found row text would be of the format "Show Collection_#{n} "
+ collection_name = found_collections[i].text.split[1]
+ if !my_collections.include? collection_name
+ unexpected_items << collection_name
+ else
+ verify_collections.delete collection_name
+ end
+ end
+
+ assert_equal true, unexpected_items.empty?, "Found unexpected items #{unexpected_items.inspect}"
+ assert_equal amount, collections_count, "Found different number of collections"
+ assert_equal true, verify_collections.empty?, "Did not find all the collections"
+ end
+ end
+ end
+
+ [
+ ['project with 10 pipelines', 10, 0],
+# ['project with 200 jobs and 10 pipelines', 2, 200],
+ ['project with 25 pipelines', 25, 0],
+ ].each do |project_name, num_pipelines, num_jobs|
+ test "scroll pipeline instances tab for #{project_name} with #{num_pipelines} pipelines and #{num_jobs} jobs" do
+ headless = Headless.new
+ headless.start
+ Capybara.current_driver = :selenium
+
+ visit page_with_token 'user1_with_load'
+
+ find("#projects-menu").click
+ find(".dropdown-menu a", text: project_name).click
+
+ my_pipelines = []
+ (0..num_pipelines-1).each do |i|
+ name = "pipeline_#{i}"
+ my_pipelines << name
+ end
+
+ # verify Jobs and pipelines tab scroll
+ assert(page.has_text?("Jobs and pipelines (#{num_pipelines+num_jobs})"), "Number of objects did not match the input counts")
+ click_link 'Jobs and pipelines'
+ begin
+ wait_for_ajax
+ rescue
+ end
+
+ verify_pipelines = my_pipelines.dup
+ unexpected_items = []
+ object_count = 0
+ within('.arv-project-Jobs_and_pipelines') do
+ page.execute_script "window.scrollBy(0,999000)"
+ begin
+ wait_for_ajax
+ rescue
+ end
+
+ # Visit all rows. Repeat if not all expected my_pipelines are found (infinite scrolling should kick in)
+ pipelines_found = page.all('tr[data-kind="arvados#pipelineInstance"]')
+ found_pipeline_count = pipelines_found.count
+ (0..found_pipeline_count-1).each do |i|
+ name = pipelines_found[i].text.split[1]
+ if !my_pipelines.include? name
+ unexpected_items << name
+ else
+ verify_pipelines.delete name
+ end
+
+ assert_equal true, unexpected_items.empty?, "Found unexpected items #{unexpected_items.inspect}"
+ end
+
+ jobs_found = page.all('tr[data-kind="arvados#job"]')
+ found_job_count = jobs_found.count
+
+ assert_equal num_pipelines, found_pipeline_count, "Found different number of pipelines"
+ assert_equal num_jobs, found_job_count, 'Did not find expected number of jobs'
+ assert_equal true, verify_pipelines.empty?, "Did not find all the pipelines"
+ end
+ end
+ end
+
+ # Move button accessibility
+ [
+ ['admin', true],
+ ['active', true], # project owner
+ ['project_viewer', false],
+ ].each do |user, can_move|
+ test "#{user} can move subproject under another user's Home #{can_move}" do
+ project = api_fixture('groups')['aproject']
+ collection = api_fixture('collections')['collection_to_move_around_in_aproject']
+
+ # verify the project move button
+ visit page_with_token user, "/projects/#{project['uuid']}"
+ if can_move
+ assert page.has_link? 'Move project...'
+ else
+ assert page.has_no_link? 'Move project...'
+ end
+ end
+ end
+
end
# verify that the new user showed up in the users page and find
# the new user's UUID
- new_user_uuid =
+ new_user_uuid =
find('tr[data-object-uuid]', text: 'foo@example.com')['data-object-uuid']
assert new_user_uuid, "Expected new user uuid not found"
find('tr', text: 'zzzzz-tpzed-xurymjxw79nv3jz').
find('a', text: 'Show').
click
+ user_url = page.current_url
# Setup user
click_link 'Admin'
click_button "Submit"
end
+ visit user_url
assert page.has_text? 'modified_by_client_uuid'
click_link 'Advanced'
click_button "Submit"
end
+ visit user_url
find '#Attributes', text: 'modified_by_client_uuid'
click_link 'Advanced'
find('tr', text: 'zzzzz-tpzed-xurymjxw79nv3jz').
find('a', text: 'Show').
click
+ user_url = page.current_url
# Verify that is_active is set
find('a,button', text: 'Attributes').click
click_button "Submit"
end
+ visit user_url
assert page.has_text? 'modified_by_client_uuid'
click_link 'Advanced'
teardown do
Thread.current[:arvados_api_token] = nil
+ Thread.current[:user] = nil
Thread.current[:reader_tokens] = nil
+ # Diagnostics suite doesn't run a server, so there's no cache to clear.
+ Rails.cache.clear unless (Rails.env == "diagnostics")
# Restore configuration settings changed during tests
$application_config.each do |k,v|
if k.match /^[^.]*$/
module ClassMethods
@@api_fixtures = {}
- def api_fixture(name)
+ def api_fixture(name, *keys)
# Returns the data structure from the named API server test fixture.
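+ # Extra keys walk into the fixture hash, e.g.
+ # api_fixture("collections", "foo_file", "uuid") returns that
+ # fixture's uuid.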
@@api_fixtures[name] ||= \
begin
path = File.join(ApiServerForTests::ARV_API_SERVER_DIR,
'test', 'fixtures', "#{name}.yml")
- YAML.load(IO.read(path))
+ file = IO.read(path)
+ trim_index = file.index('# Test Helper trims the rest of the file')
+ file = file[0, trim_index] if trim_index
+ YAML.load(file)
end
+ keys.inject(@@api_fixtures[name]) { |hash, key| hash[key] }
end
end
- def api_fixture name
- self.class.api_fixture name
+ def api_fixture(name, *keys)
+ self.class.api_fixture(name, *keys)
+ end
+
+ def find_fixture(object_class, name)
+ object_class.find(api_fixture(object_class.to_s.pluralize.underscore,
+ name, "uuid"))
end
end
get_files_tree('multilevel_collection_2'),
"Collection file tree was malformed")
end
+
+ test "portable_data_hash never editable" do
+ refute(Collection.new.attribute_editable?("portable_data_hash", :ever))
+ end
+
+ test "admin can edit name" do
+ use_token :admin
+ assert(find_fixture(Collection, "foo_file").attribute_editable?("name"),
+ "admin not allowed to edit collection name")
+ end
+
+ test "project owner can edit name" do
+ use_token :active
+ assert(find_fixture(Collection, "foo_collection_in_aproject")
+ .attribute_editable?("name"),
+ "project owner not allowed to edit collection name")
+ end
+
+ test "project admin can edit name" do
+ use_token :subproject_admin
+ assert(find_fixture(Collection, "baz_file_in_asubproject")
+ .attribute_editable?("name"),
+ "project admin not allowed to edit collection name")
+ end
+
+ test "project viewer cannot edit name" do
+ use_token :project_viewer
+ refute(find_fixture(Collection, "foo_collection_in_aproject")
+ .attribute_editable?("name"),
+ "project viewer allowed to edit collection name")
+ end
end
assert_nil user.owner_uuid
end
end
+
+ test "project editable by its admin" do
+ use_token :subproject_admin
+ project = Group.find(api_fixture("groups")["asubproject"]["uuid"])
+ assert(project.editable?, "project not editable by admin")
+ end
+
+ test "project not editable by reader" do
+ use_token :project_viewer
+ project = Group.find(api_fixture("groups")["aproject"]["uuid"])
+ refute(project.editable?, "project editable by reader")
+ end
end
require 'test_helper'
class JobTest < ActiveSupport::TestCase
- # test "the truth" do
- # assert true
- # end
+ test "admin can edit description" do
+ use_token :admin
+ assert(find_fixture(Job, "job_in_subproject")
+ .attribute_editable?("description"),
+ "admin not allowed to edit job description")
+ end
+
+ test "project owner can edit description" do
+ use_token :active
+ assert(find_fixture(Job, "job_in_subproject")
+ .attribute_editable?("description"),
+ "project owner not allowed to edit job description")
+ end
+
+ test "project admin can edit description" do
+ use_token :subproject_admin
+ assert(find_fixture(Job, "job_in_subproject")
+ .attribute_editable?("description"),
+ "project admin not allowed to edit job description")
+ end
+
+ test "project viewer cannot edit description" do
+ use_token :project_viewer
+ refute(find_fixture(Job, "job_in_subproject")
+ .attribute_editable?("description"),
+ "project viewer allowed to edit job description")
+ end
end
require 'test_helper'
class PipelineInstanceTest < ActiveSupport::TestCase
- # test "the truth" do
- # assert true
- # end
+ test "admin can edit name" do
+ use_token :admin
+ assert(find_fixture(PipelineInstance, "new_pipeline_in_subproject")
+ .attribute_editable?("name"),
+ "admin not allowed to edit pipeline instance name")
+ end
+
+ test "project owner can edit name" do
+ use_token :active
+ assert(find_fixture(PipelineInstance, "new_pipeline_in_subproject")
+ .attribute_editable?("name"),
+ "project owner not allowed to edit pipeline instance name")
+ end
+
+ test "project admin can edit name" do
+ use_token :subproject_admin
+ assert(find_fixture(PipelineInstance, "new_pipeline_in_subproject")
+ .attribute_editable?("name"),
+ "project admin not allowed to edit pipeline instance name")
+ end
+
+ test "project viewer cannot edit name" do
+ use_token :project_viewer
+ refute(find_fixture(PipelineInstance, "new_pipeline_in_subproject")
+ .attribute_editable?("name"),
+ "project viewer allowed to edit pipeline instance name")
+ end
end
import os
import glob
+import stat
class SubstitutionError(Exception):
pass
return None
def sub_file(v):
- return os.path.join(os.environ['TASK_KEEPMOUNT'], v)
+ path = os.path.join(os.environ['TASK_KEEPMOUNT'], v)
+ st = os.stat(path)
+ if st and stat.S_ISREG(st.st_mode):
+ return path
+ else:
+ raise SubstitutionError("$(file {}) is not accessible or is not a regular file".format(path))
def sub_dir(v):
d = os.path.dirname(v)
if d == '':
d = v
- return os.path.join(os.environ['TASK_KEEPMOUNT'], d)
+ path = os.path.join(os.environ['TASK_KEEPMOUNT'], d)
+ st = os.stat(path)
+ if st and stat.S_ISDIR(st.st_mode):
+ return path
+ else:
+ raise SubstitutionError("$(dir {}) is not accessible or is not a directory".format(path))
def sub_basename(v):
return os.path.splitext(os.path.basename(v))[0]
def sub_glob(v):
l = glob.glob(v)
if len(l) == 0:
- raise SubstitutionError("$(glob): No match on '%s'" % v)
+ raise SubstitutionError("$(glob {}) no match fonud".format(v))
else:
return l[0]
import multiprocessing
import crunchutil.robust_put as robust_put
import crunchutil.vwd as vwd
+import argparse
+import json
+import tempfile
-os.umask(0077)
-
-t = arvados.current_task().tmpdir
+parser = argparse.ArgumentParser()
+parser.add_argument('--dry-run', action='store_true')
+parser.add_argument('--script-parameters', type=str, default="{}")
+args = parser.parse_args()
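+
+# With --dry-run, the script skips Arvados API calls and, instead of
+# creating job tasks, logs the command lines it would run, using the
+# JSON passed via --script-parameters. Hypothetical invocation (script
+# name illustrative):
+#   run-command --dry-run --script-parameters '{"command": ["echo", "hello"]}'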
-api = arvados.api('v1')
+os.umask(0077)
-os.chdir(arvados.current_task().tmpdir)
-os.mkdir("tmpdir")
-os.mkdir("output")
+if not args.dry_run:
+ api = arvados.api('v1')
+ t = arvados.current_task().tmpdir
+ os.chdir(arvados.current_task().tmpdir)
+ os.mkdir("tmpdir")
+ os.mkdir("output")
-os.chdir("output")
+ os.chdir("output")
-outdir = os.getcwd()
+ outdir = os.getcwd()
-taskp = None
-jobp = arvados.current_job()['script_parameters']
-if len(arvados.current_task()['parameters']) > 0:
- taskp = arvados.current_task()['parameters']
+ taskp = None
+ jobp = arvados.current_job()['script_parameters']
+ if len(arvados.current_task()['parameters']) > 0:
+ taskp = arvados.current_task()['parameters']
+else:
+ outdir = "/tmp"
+ jobp = json.loads(args.script_parameters)
+ os.environ['JOB_UUID'] = 'zzzzz-8i9sb-1234567890abcde'
+ os.environ['TASK_UUID'] = 'zzzzz-ot0gb-1234567890abcde'
+ os.environ['CRUNCH_SRC'] = '/tmp/crunch-src'
+ if 'TASK_KEEPMOUNT' not in os.environ:
+ os.environ['TASK_KEEPMOUNT'] = '/keep'
links = []
sp.send_signal(signum)
self.sig = signum
+def add_to_group(gr, match):
+ m = match.groups()
+ if m not in gr:
+ gr[m] = []
+ gr[m].append(match.group(0))
+
def expand_item(p, c):
if isinstance(c, dict):
if "foreach" in c and "command" in c:
params[var] = i
r.extend(expand_list(params, c["command"]))
return r
+ if "list" in c and "index" in c and "command" in c:
+ var = c["list"]
+ items = get_items(p, p[var])
+ params = copy.copy(p)
+ params[var] = items[int(c["index"])]
+ return expand_list(params, c["command"])
+ if "regex" in c:
+ pattern = re.compile(c["regex"])
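+ # For example (hypothetical values), with items
+ # ["a_1.txt", "a_2.txt", "b_1.txt"] and regex "(.*)_\d\.txt":
+ #   "filter"  keeps just the matching items, unchanged;
+ #   "group"   yields [["a_1.txt", "a_2.txt"], ["b_1.txt"]];
+ #   "extract" yields [["a"], ["a"], ["b"]].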
+ if "filter" in c:
+ items = get_items(p, p[c["filter"]])
+ return [i for i in items if pattern.match(i)]
+ elif "group" in c:
+ items = get_items(p, p[c["group"]])
+ groups = {}
+ for i in items:
+ match = pattern.match(i)
+ if match:
+ add_to_group(groups, match)
+ return [groups[k] for k in groups]
+ elif "extract" in c:
+ items = get_items(p, p[c["extract"]])
+ r = []
+ for i in items:
+ match = pattern.match(i)
+ if match:
+ r.append(list(match.groups()))
+ return r
elif isinstance(c, list):
return expand_list(p, c)
- elif isinstance(c, str) or isinstance(c, unicode):
+ elif isinstance(c, basestring):
return [subst.do_substitution(p, c)]
return []
def expand_list(p, l):
- return [exp for arg in l for exp in expand_item(p, arg)]
+ if isinstance(l, basestring):
+ return expand_item(p, l)
+ else:
+ return [exp for arg in l for exp in expand_item(p, arg)]
def get_items(p, value):
+ if isinstance(value, dict):
+ return expand_item(p, value)
+
if isinstance(value, list):
return expand_list(p, value)
prefix = fn[len(os.environ['TASK_KEEPMOUNT'])+1:]
if mode is not None:
if stat.S_ISDIR(mode):
- items = ["$(dir %s/%s/)" % (prefix, l) for l in os.listdir(fn)]
+ items = [os.path.join(fn, l) for l in os.listdir(fn)]
elif stat.S_ISREG(mode):
with open(fn) as f:
- items = [line for line in f]
+ items = [line.rstrip("\r\n") for line in f]
return items
else:
return None
stdinfile = None
rcode = 1
-try:
- if "task.foreach" in jobp:
- if arvados.current_task()['sequence'] == 0:
- var = jobp["task.foreach"]
- items = get_items(jobp, jobp[var])
- logger.info("parallelizing on %s with items %s" % (var, items))
- if items is not None:
- for i in items:
- params = copy.copy(jobp)
- params[var] = i
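+# Create a new task (or, with --dry-run, log the expanded command) for each
+# combination of items in the task.foreach parameter lists.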
+def recursive_foreach(params, fvars):
+ var = fvars[0]
+ fvars = fvars[1:]
+ items = get_items(params, params[var])
+ logger.info("parallelizing on %s with items %s" % (var, items))
+ if items is not None:
+ for i in items:
+ params = copy.copy(params)
+ params[var] = i
+ if len(fvars) > 0:
+ recursive_foreach(params, fvars)
+ else:
+ if not args.dry_run:
arvados.api().job_tasks().create(body={
'job_uuid': arvados.current_job()['uuid'],
'created_by_job_task_uuid': arvados.current_task()['uuid'],
'sequence': 1,
'parameters': params
- }
- ).execute()
+ }).execute()
+ else:
+ logger.info(expand_list(params, params["command"]))
+ else:
+ logger.error("parameter %s with value %s in task.foreach yielded no items" % (var, params[var]))
+ sys.exit(1)
+
+try:
+ if "task.foreach" in jobp:
+ if args.dry_run or arvados.current_task()['sequence'] == 0:
+ # This is the first task to start the other tasks and exit
+ fvars = jobp["task.foreach"]
+ if isinstance(fvars, basestring):
+ fvars = [fvars]
+ if not isinstance(fvars, list) or len(fvars) == 0:
+ logger.error("value of task.foreach must be a string or non-empty list")
+ sys.exit(1)
+ recursive_foreach(jobp, fvars)
+ if not args.dry_run:
if "task.vwd" in jobp:
- # Base vwd collection will be merged with output fragments from
- # the other tasks by crunch.
+ # Set output of the first task to the base vwd collection so it
+ # will be merged with output fragments from the other tasks by
+ # crunch.
arvados.current_task().set_output(subst.do_substitution(jobp, jobp["task.vwd"]))
else:
arvados.current_task().set_output(None)
- sys.exit(0)
- else:
- sys.exit(1)
+ sys.exit(0)
else:
+ # This is the only task so taskp/jobp are the same
taskp = jobp
- if "task.vwd" in taskp:
- # Populate output directory with symlinks to files in collection
- vwd.checkout(subst.do_substitution(taskp, taskp["task.vwd"]), outdir)
+ if not args.dry_run:
+ if "task.vwd" in taskp:
+ # Populate output directory with symlinks to files in collection
+ vwd.checkout(subst.do_substitution(taskp, taskp["task.vwd"]), outdir)
- if "task.cwd" in taskp:
- os.chdir(subst.do_substitution(taskp, taskp["task.cwd"]))
+ if "task.cwd" in taskp:
+ os.chdir(subst.do_substitution(taskp, taskp["task.cwd"]))
cmd = expand_list(taskp, taskp["command"])
- if "task.stdin" in taskp:
- stdinname = subst.do_substitution(taskp, taskp["task.stdin"])
- stdinfile = open(stdinname, "rb")
+ if not args.dry_run:
+ if "task.stdin" in taskp:
+ stdinname = subst.do_substitution(taskp, taskp["task.stdin"])
+ stdinfile = open(stdinname, "rb")
- if "task.stdout" in taskp:
- stdoutname = subst.do_substitution(taskp, taskp["task.stdout"])
- stdoutfile = open(stdoutname, "wb")
+ if "task.stdout" in taskp:
+ stdoutname = subst.do_substitution(taskp, taskp["task.stdout"])
+ stdoutfile = open(stdoutname, "wb")
logger.info("{}{}{}".format(' '.join(cmd), (" < " + stdinname) if stdinname is not None else "", (" > " + stdoutname) if stdoutname is not None else ""))
+
+ if args.dry_run:
+ sys.exit(0)
except subst.SubstitutionError as e:
logger.error(str(e))
logger.error("task parameters were:")
- user/tutorials/tutorial-firstscript.html.textile.liquid
- user/tutorials/tutorial-submit-job.html.textile.liquid
- user/topics/tutorial-parallel.html.textile.liquid
+ - user/topics/arv-docker.html.textile.liquid
+ - Reference:
+ - user/topics/run-command.html.textile.liquid
+ - user/reference/job-pipeline-ref.html.textile.liquid
- user/examples/crunch-examples.html.textile.liquid
- Query the metadata database:
- user/topics/tutorial-trait-search.html.textile.liquid
--- /dev/null
+{
+ "name": "Example using R in a custom Docker image",
+ "components": {
+ "Rscript": {
+ "script": "run-command",
+ "script_version": "master",
+ "repository": "arvados",
+ "script_parameters": {
+ "command": [
+ "Rscript",
+ "$(glob $(file $(myscript))/*.r)",
+ "$(glob $(dir $(mydata))/*.csv)"
+ ],
+ "myscript": {
+ "required": true,
+ "dataclass": "Collection"
+ },
+ "mydata": {
+ "required": true,
+ "dataclass": "Collection"
+ }
+ },
+ "runtime_constraints": {
+ "docker_image": "arvados/jobs-with-r"
+ }
+ }
+ }
+}
--- /dev/null
+{
+ "name":"run-command example pipeline",
+ "components":{
+ "bwa-mem": {
+ "script": "run-command",
+ "script_version": "master",
+ "repository": "arvados",
+ "script_parameters": {
+ "command": [
+ "bwa",
+ "mem",
+ "-t",
+ "$(node.cores)",
+ "$(glob $(dir $(reference_collection))/*.fasta)",
+ {
+ "foreach": "read_pair",
+ "command": "$(read_pair)"
+ }
+ ],
+ "task.stdout": "$(basename $(glob $(dir $(sample))/*_1.fastq)).sam",
+ "task.foreach": ["sample_subdir", "read_pair"],
+ "reference_collection": {
+ "required": true,
+ "dataclass": "Collection"
+ },
+ "sample": {
+ "required": true,
+ "dataclass": "Collection"
+ },
+ "sample_subdir": "$(dir $(samples))",
+ "read_pair": {
+ "value": {
+ "group": "sample_subdir",
+ "regex": "(.*)_[12]\\.fastq(\\.gz)?$"
+ }
+ }
+ }
+ }
+ }
+}
--- /dev/null
+{
+ "name":"run-command example pipeline",
+ "components":{
+ "bwa-mem": {
+ "script": "run-command",
+ "script_version": "master",
+ "repository": "arvados",
+ "script_parameters": {
+ "command": [
+ "bwa",
+ "mem",
+ "-t",
+ "$(node.cores)",
+ "$(glob $(dir $(reference_collection))/*.fasta)",
+ "$(glob $(dir $(sample))/*_1.fastq)",
+ "$(glob $(dir $(sample))/*_2.fastq)"
+ ],
+ "task.stdout": "$(basename $(glob $(dir $(sample))/*_1.fastq)).sam",
+ "reference_collection": {
+ "required": true,
+ "dataclass": "Collection"
+ },
+ "sample": {
+ "required": true,
+ "dataclass": "Collection"
+ }
+ }
+ }
+ }
+}
--- /dev/null
+<div class="alert alert-block alert-info">
+ <button type="button" class="close" data-dismiss="alert">×</button>
+ <h4>Note!</h4>
+ <p>The SSO server codebase currently uses OpenID 2.0 to talk to Google's authentication service. Google <a href="https://developers.google.com/accounts/docs/OpenID2">has deprecated that protocol</a>. This means that new clients will not be allowed to talk to Google's authentication services anymore over OpenID 2.0, and Google will phase out the use of OpenID 2.0 completely in the coming months. We are working on upgrading the SSO server codebase to a newer protocol. That work should be complete by the end of November 2014. In the meantime, anyone is free to use the existing Curoverse SSO server for any local Arvados installation. Instructions to do so are provided on the "API server":install-api-server.html page.</p>
+ <p><strong>Recommendation: skip this step</strong></p>
+</div>
--- /dev/null
+{
+ "name":"My md5 pipeline",
+ "components":{
+ "do_hash":{
+ "repository":"$USER",
+ "script":"hash.py",
+ "script_version":"master",
+ "runtime_constraints":{
+ "docker_image":"arvados/jobs-java-bwa-samtools"
+ },
+ "script_parameters":{
+ "input":{
+ "required": true,
+ "dataclass": "Collection"
+ }
+ }
+ }
+ }
+}
table(table table-bordered table-condensed).
|_. Argument |_. Type |_. Description |_. Location |_. Example |
-|node|object||query||
+{background:#ccffcc}.|node|object||query||
h2. delete
|group_class|string|Type of group. This does not affect behavior, but determines how the group is presented in the user interface. For example, @project@ indicates that the group should be displayed by Workbench and arv-mount as a project for organizing and naming objects.|@"project"@
null|
|description|text|||
+|writable_by|array|List of UUID strings identifying Users and other Groups that have write permission for this Group. Only users who are allowed to administer the Group will receive a full list. Other users will receive a partial list that includes the Group's owner_uuid and (if applicable) their own user UUID.||
|prefs|hash|||
|default_owner_uuid|string|||
|is_active|boolean|||
+|writable_by|array|List of UUID strings identifying Groups and other Users that can modify this User object. This will include the user's owner_uuid and, for administrators and users requesting their own User object, the requesting user's UUID.||
zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
</code></pre></notextile>
+If you want access control on your Keep server(s), you should set @blob_signing_key@ to the same value as the permission key you provided to your "Keep server(s)":install-keep.html.
+
Put it in @config/application.yml@ in the production or common section:
<notextile>
<pre><code>~/arvados/services/api$ <span class="userinput">cp -i config/initializers/omniauth.rb.example config/initializers/omniauth.rb
</code></pre></notextile>
-Edit @config/initializers/omniauth.rb@. Set @APP_SECRET@ to the value of @app_secret@ from "installing the single sign on server":install-sso.html .
+Edit @config/initializers/omniauth.rb@, and tell your api server to use the Curoverse SSO server for authentication:
+
+<notextile>
+<pre><code>APP_ID = 'local_docker_installation'
+APP_SECRET = 'yohbai4eecohshoo1Yoot7tea9zoca9Eiz3Tajahweo9eePaeshaegh9meiye2ph'
+CUSTOM_PROVIDER_URL = 'https://auth.curoverse.com'
+</code></pre></notextile>
+
+<div class="alert alert-block alert-info">
+ <button type="button" class="close" data-dismiss="alert">×</button>
+ <h4>Note!</h4>
+ <p>You can also run your own SSO server. However, the SSO server codebase currently uses OpenID 2.0 to talk to Google's authentication service. Google <a href="https://developers.google.com/accounts/docs/OpenID2">has deprecated that protocol</a>. This means that new clients will not be allowed to talk to Google's authentication services anymore over OpenID 2.0, and Google will phase out the use of OpenID 2.0 completely in the coming months. We are working on upgrading the SSO server codebase to a newer protocol. That work should be complete by the end of November 2014. In the meantime, anyone is free to use the existing Curoverse SSO server for any local Arvados installation.</p>
+</div>
You can now run the development server:
~$ <span class="userinput">echo "deb http://apt.arvados.org/ wheezy main" >> /etc/apt/sources.list.d/apt.arvados.org.list</span>
~$ <span class="userinput">/usr/bin/apt-key adv --keyserver pgp.mit.edu --recv 1078ECD7</span>
~$ <span class="userinput">/usr/bin/apt-get update</span>
-~$ <span class="userinput">/usr/bin/apt-get install keep</span>
+~$ <span class="userinput">/usr/bin/apt-get install keepstore</span>
</code></pre>
</notextile>
Verify that Keep is functional:
<notextile>
-<pre><code>~$ <span class="userinput">keep -h</span>
-keep -h
+<pre><code>~$ <span class="userinput">keepstore -h</span>
2014/07/24 15:38:27 Keep started: pid 13606
-Usage of keep:
+Usage of keepstore:
-data-manager-token-file="": File with the API token used by the Data Manager. All DELETE requests or GET /index requests must carry this token.
-enforce-permissions=false: Enforce permission signatures on requests.
-listen=":25107": Interface on which to listen for requests, in the format ipaddr:port. e.g. -listen=10.0.1.24:8000. Use -listen=:port to listen on all network interfaces.
+ -never-delete=false: If set, nothing will be deleted. HTTP 405 will be returned for valid DELETE requests.
-permission-key-file="": File containing the secret key for generating and verifying permission signatures.
-permission-ttl=1209600: Expiration time (in seconds) for newly generated permission signatures.
-pid="": Path to write pid file
</code></pre>
</notextile>
+If you want access control on your Keep server(s), you should provide a permission key. The @-permission-key-file@ argument should contain the path to a file that contains a single line with a long random alphanumeric string. It should be the same as the @blob_signing_key@ that can be set in the "API server":install-api-server.html config/application.yml file.
+
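+For example, a minimal sketch (substitute the path to your own key file):
+
+<notextile>
+<pre><code>~$ <span class="userinput">keepstore -enforce-permissions -permission-key-file=/path/to/permission.key</span>
+</code></pre>
+</notextile>
+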
Prepare one or more volumes for Keep to use. Simply create a /keep directory on all the partitions you would like Keep to use, and then start Keep. For example, using 2 tmpfs volumes:
<notextile>
-<pre><code>~$ <span class="userinput">keep</span>
+<pre><code>~$ <span class="userinput">keepstore</span>
2014/07/24 11:41:37 Keep started: pid 20736
2014/07/24 11:41:37 adding Keep volume: /tmp/tmp.vwSCtUCyeH/keep
2014/07/24 11:41:37 adding Keep volume: /tmp/tmp.Lsn4w8N3Xv/keep
title: Install Single Sign On (SSO) server
...
+{% include 'skip_sso_server_install' %}
+
+h2(#dependencies). Install dependencies
+
+You need to have Ruby 2.1 or higher and the @bundler@ gem installed.
+
+One way to install those dependencies is:
+
+<notextile>
+<pre><code>~$ <span class="userinput">\curl -sSL https://get.rvm.io | bash -s stable --ruby=2.1</span>
+~$ <span class="userinput">gem install bundler</span>
+</code></pre></notextile>
+
+h2(#install). Install SSO server
+
<notextile>
<pre><code>~$ <span class="userinput">cd $HOME</span> # (or wherever you want to install)
~$ <span class="userinput">git clone https://github.com/curoverse/sso-devise-omniauth-provider.git</span>
Copy @config/application.yml.example@ to @config/application.yml@ and edit it appropriately for your environment.
* Set @secret_token@ to the string you generated with @rake secret@.
-* Point @arvados_login_base@ and @arvados_v1_base@ at your "API server":install-api-server.html
+* Point @arvados_login_base@ and @arvados_v1_base@ at your "API server":install-api-server.html, like this:
+
+<notextile>
+<pre><code>arvados_login_base: https://your.host:3030/login
+arvados_v1_base: https://your.host:3030/arvados/v1
+</code></pre>
+</notextile>
+
* @site_name@ can be any string to identify this Workbench.
* If the SSL certificate you use for development isn't signed by a CA, make sure @arvados_insecure_https@ is @true@.
--- /dev/null
+---
+layout: default
+navsection: userguide
+title: "Pipeline template reference"
+...
+
+Pipeline template options are described on the "pipeline template schema page.":{{site.baseurl}}/api/schema/PipelineTemplate.html
--- /dev/null
+---
+layout: default
+navsection: userguide
+title: "Customizing Crunch environment using Docker"
+...
+
+This page describes how to use "Docker":https://www.docker.com/ to customize the runtime environment (e.g. the programs, libraries, and other dependencies needed to run a job) in which a crunch script runs. Docker is a tool for building and running containers that isolate applications from other applications running on the same node. For detailed information about Docker, see the "Docker User Guide.":https://docs.docker.com/userguide/
+
+This page will demonstrate how to:
+
+# Fetch the arvados/jobs Docker image
+# Manually install additional software into the container
+# Create a new custom image
+# Upload that image to Arvados for use by Crunch jobs
+# Share your image with others
+
+{% include 'tutorial_expectations' %}
+
+You also need to ensure that "Docker is installed,":https://docs.docker.com/installation/ the Docker daemon is running, and you have permission to access Docker. You can test this by running @docker version@. If you receive a permission denied error, your user account may need to be added to the @docker@ group. If you have root access, you can add yourself to the @docker@ group using @$ sudo addgroup $USER docker@, then log out and log back in again; otherwise consult your local sysadmin.
+
+h2. Fetch a starting image
+
+The easiest way to begin is to start from the "arvados/jobs" image, which already has the Arvados SDK installed along with other configuration required for use with Crunch.
+
+Download the latest "arvados/jobs" image from the Docker registry:
+
+<notextile>
+<pre><code>$ <span class="userinput">docker pull arvados/jobs</span>
+Pulling repository arvados/jobs
+3132168f2acb: Download complete
+a42b7f2c59b6: Download complete
+e5afdf26a7ae: Download complete
+5cae48636278: Download complete
+7a4f91b70558: Download complete
+a04a275c1fd6: Download complete
+c433ff206a22: Download complete
+b2e539b45f96: Download complete
+073b2581c6be: Download complete
+593915af19dc: Download complete
+32260b35005e: Download complete
+6e5b860c1cde: Download complete
+95f0bfb43d4d: Download complete
+c7fd77eedb96: Download complete
+0d7685aafd00: Download complete
+</code></pre>
+</notextile>
+
+h2. Install new packages
+
+Next, enter the container using @docker run@, providing the arvados/jobs image and the program you want to run (in this case the bash shell).
+
+<notextile>
+<pre><code>$ <span class="userinput">docker run --interactive --tty --user root arvados/jobs /bin/bash</span>
+root@a0e8299b59aa:/#
+</code></pre>
+</notextile>
+
+Next, update the package list using @apt-get update@.
+
+<notextile>
+<pre><code>root@a0e8299b59aa:/# <span class="userinput">apt-get update</span>
+Get:1 http://apt.arvados.org wheezy Release.gpg [490 B]
+Get:2 http://apt.arvados.org wheezy Release [1568 B]
+Get:3 http://apt.arvados.org wheezy/main amd64 Packages [34.6 kB]
+Get:4 http://ftp.us.debian.org wheezy Release.gpg [1655 B]
+Get:5 http://ftp.us.debian.org wheezy-updates Release.gpg [836 B]
+Get:6 http://ftp.us.debian.org wheezy Release [168 kB]
+Ign http://apt.arvados.org wheezy/main Translation-en
+Get:7 http://security.debian.org wheezy/updates Release.gpg [836 B]
+Get:8 http://security.debian.org wheezy/updates Release [102 kB]
+Get:9 http://ftp.us.debian.org wheezy-updates Release [124 kB]
+Get:10 http://ftp.us.debian.org wheezy/main amd64 Packages [5841 kB]
+Get:11 http://security.debian.org wheezy/updates/main amd64 Packages [218 kB]
+Get:12 http://security.debian.org wheezy/updates/main Translation-en [123 kB]
+Hit http://ftp.us.debian.org wheezy/main Translation-en
+Hit http://ftp.us.debian.org wheezy-updates/main amd64 Packages/DiffIndex
+Hit http://ftp.us.debian.org wheezy-updates/main Translation-en/DiffIndex
+Fetched 6617 kB in 5s (1209 kB/s)
+Reading package lists... Done
+</code></pre>
+</notextile>
+
+In this example, we will install the "R" statistical language Debian package "r-base-core". Use @apt-get install@:
+
+<notextile>
+<pre><code>root@a0e8299b59aa:/# <span class="userinput">apt-get install r-base-core</span>
+Reading package lists... Done
+Building dependency tree
+Reading state information... Done
+The following extra packages will be installed:
+ [...]
+libxv1 libxxf86dga1 libxxf86vm1 r-base-core r-base-dev r-base-html r-cran-boot r-cran-class r-cran-cluster r-cran-codetools
+ [...]
+Suggested packages:
+ [...]
+The following NEW packages will be installed:
+ [...]
+ libxv1 libxxf86dga1 libxxf86vm1 r-base r-base-core r-base-dev r-base-html r-cran-boot r-cran-class r-cran-cluster
+ [...]
+0 upgraded, 107 newly installed, 0 to remove and 9 not upgraded.
+Need to get 88.2 MB of archives.
+After this operation, 219 MB of additional disk space will be used.
+Do you want to continue [Y/n]? y
+[...]
+Get:85 http://ftp.us.debian.org/debian/ wheezy/main r-base-core amd64 2.15.1-4 [20.6 MB]
+Get:86 http://ftp.us.debian.org/debian/ wheezy/main r-base-dev all 2.15.1-4 [3882 B]
+Get:87 http://ftp.us.debian.org/debian/ wheezy/main r-cran-boot all 1.3-5-1 [472 kB]
+[...]
+Fetched 88.2 MB in 2min 17s (642 kB/s)
+Extracting templates from packages: 100%
+Preconfiguring packages ...
+[...]
+Unpacking r-base-core (from .../r-base-core_2.15.1-4_amd64.deb) ...
+Selecting previously unselected package r-base-dev.
+Unpacking r-base-dev (from .../r-base-dev_2.15.1-4_all.deb) ...
+Selecting previously unselected package r-cran-boot.
+Unpacking r-cran-boot (from .../r-cran-boot_1.3-5-1_all.deb) ...
+[...]
+Setting up r-base-core (2.15.1-4) ...
+Setting R_PAPERSIZE_USER default to 'a4'
+
+Creating config file /etc/R/Renviron with new version
+Setting up r-base-dev (2.15.1-4) ...
+Setting up r-cran-boot (1.3-5-1) ...
+[...]
+</code></pre>
+</notextile>
+
+Now we can verify that "R" is installed:
+
+<notextile>
+<pre><code>root@a0e8299b59aa:/# <span class="userinput">R</span>
+
+R version 2.15.1 (2012-06-22) -- "Roasted Marshmallows"
+Copyright (C) 2012 The R Foundation for Statistical Computing
+ISBN 3-900051-07-0
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+>
+</code></pre>
+</notextile>
+
+Note that you are not limited to installing Debian packages. You may compile programs or libraries from source and install them, edit systemwide configuration files, use other package managers such as @pip@ or @gem@, and perform any other customization necessary to run your program.
+
+h2. Create a new image
+
+We're now ready to create a new Docker image. First, quit the container, then use @docker commit@ to create a new image from the stopped container. The container id can be found in the default hostname of the container displayed in the prompt, in this case @a0e8299b59aa@:
+
+<notextile>
+<pre><code>root@a0e8299b59aa:/# <span class="userinput">exit</span>
+$ <span class="userinput">docker commit a0e8299b59aa arvados/jobs-with-r</span>
+33ea6b87792364cb9989a149c36a31e5a9c8cf96694ba05f66545ad7b842522e
+$ <span class="userinput">docker images</span>
+REPOSITORY TAG IMAGE ID CREATED VIRTUAL SIZE
+arvados/jobs-with-r latest 33ea6b877923 43 seconds ago 1.607 GB
+arvados/jobs latest 3132168f2acb 22 hours ago 1.314 GB
+</code></pre>
+</notextile>
+
+h2. Upload your image
+
+Finally, we are ready to upload the new Docker image to Arvados. Use @arv keep docker@ with the image repository name to upload the image. Without arguments, @arv keep docker@ will print out the list of Docker images in Arvados that are available to you.
+
+<notextile>
+<pre><code>$ <span class="userinput">arv keep docker arvados/jobs-with-r</span>
+1591M / 1591M 100.0%
+Collection saved as 'Docker image arvados/jobs-with-r:latest 33ea6b877923'
+qr1hi-4zz18-3fk2px2ji25nst2
+$ <span class="userinput">arv keep docker</span>
+REPOSITORY TAG IMAGE ID COLLECTION CREATED
+arvados/jobs-with-r latest 33ea6b877923 qr1hi-4zz18-3fk2px2ji25nst2 Thu Oct 16 13:58:53 2014
+</code></pre>
+</notextile>
+
+You are now able to specify the runtime environment for your program using the @docker_image@ field of the @runtime_constraints@ section of your pipeline components:
+
+<notextile>
+{% code 'example_docker' as javascript %}
+</notextile>
+
+* The @docker_image@ field can be one of: the Docker repository name (as shown above), the Docker image hash, the Arvados collection UUID, or the Arvados collection portable data hash.
+
+h2. Share Docker images
+
+Docker images are subject to normal Arvados permissions. If you wish to share your Docker image with others (or wish to share a pipeline template that uses your Docker image), you will need to use @arv keep docker@ with the @--project-uuid@ option to upload the image to a shared project.
+
+<notextile>
+<pre><code>$ <span class="userinput">arv keep docker --project-uuid zzzzz-j7d0g-u7zg1qdaowykd8d arvados/jobs-with-r</span>
+</code></pre>
+</notextile>
--- /dev/null
+---
+layout: default
+navsection: userguide
+title: "run-command reference"
+...
+
+The @run-command@ crunch script enables you to run command line programs.
+
+h1. Using run-command
+
+The basic @run-command@ process evaluates its inputs and builds a command line, executes the command, and saves the contents of the output directory back to Keep. For large datasets, @run-command@ can schedule concurrent tasks to execute the wrapped program over a range of inputs (see @task.foreach@ below.)
+
+@run-command@ is controlled through the @script_parameters@ section of a pipeline component. @script_parameters@ is a JSON object consisting of key-value pairs. There are three categories of keys that are meaningful to @run-command@:
+* The @command@ section, which defines the template used to build the task's command line
+* Special processing directives such as @task.foreach@, @task.cwd@, @task.vwd@, @task.stdin@, and @task.stdout@
+* User-defined parameters (everything else)
+
+In the following examples, you can use "dry run mode" to determine the command line that @run-command@ will use without actually running the command. For example:
+
+<notextile>
+<pre><code>~$ <span class="userinput">./run-command --dry-run --script-parameters '{
+ "command": ["echo", "hello world"]
+}'</span>
+run-command: echo hello world
+</code></pre>
+</notextile>
+
+h2. Command template
+
+The value of the "command" key is a list. The first item of the list is the actual program to invoke, followed by its arguments. The simplest @run-command@ invocation runs a program with static parameters. In this example, run "echo" with the first argument "hello world":
+
+<pre>
+{
+ "command": ["echo", "hello world"]
+}
+</pre>
+
+Running this job will print "hello world" to the job log.
+
+By default, the command will start with the current working directory set to the output directory. Anything written to the output directory will be saved to Keep when the command is finished. You can change the default working directory using @task.cwd@ and get the path to the output directory using @$(task.outdir)@ as explained below.
+
+Items in the "command" list may include lists and objects in addition to strings. Lists are flattened to produce the final command line. JSON objects are evaluated as list item functions (see below). For example, the following evaluates to @["echo", "hello", "world"]@:
+
+<pre>
+{
+ "command": ["echo", ["hello", "world"]]
+}
+</pre>
+
+h2. Parameter substitution
+
+The "command" list can include parameter substitutions. Substitutions are enclosed in "$(...)" and may contain the name of a user-defined parameter. In the following example, the value of "a" is "hello world"; so when "command" is evaluated, it will substitute "hello world" for "$(a)":
+
+<pre>
+{
+ "command": ["echo", "$(file $(a))"],
+ "a": "c1bad4b39ca5a924e481008009d94e32+210/var-GS000016015-ASM.tsv.bz2"
+}
+</pre>
+
+table(table table-bordered table-condensed).
+|_. Function|_. Action|
+|$(file ...) | Takes a reference to a file within an Arvados collection and evaluates to a file path on the local file system where that file can be accessed by your command. Will raise an error if the file is not accessible.|
+|$(dir ...) | Takes a reference to an Arvados collection or directory within an Arvados collection and evaluates to a directory path on the local file system where that directory can be accessed by your command. The path may include a file name, in which case it will evaluate to the parent directory of the file. Uses Python's os.path.dirname(), so "/foo/bar" will evaluate to "/foo" but "/foo/bar/" will evaluate to "/foo/bar". Will raise an error if the directory is not accessible. |
+|$(basename ...) | Strip leading directory and trailing file extension from the path provided. For example, $(basename /foo/bar.baz.txt) will evaluate to "bar.baz".|
+|$(glob ...) | Take a Unix shell path pattern (supports @*@ @?@ and @[]@) and search the local filesystem, returning the first match found. Use together with $(dir ...) to get a local filesystem path for Arvados collections. For example: $(glob $(dir $(mycollection))/*.bam) will find the first .bam file in the collection specified by the user parameter "mycollection". If there is more than one match, which one is returned is undefined. Will raise an error if no matches are found.|
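+
+These functions can be nested. For example, the following sketch (assuming a hypothetical collection parameter @mydata@ that contains a @.txt@ file) counts the lines of the first @.txt@ file in the collection:
+
+<pre>
+{
+ "command": ["wc", "-l", "$(glob $(dir $(mydata))/*.txt)"],
+ "mydata": {
+ "required": true,
+ "dataclass": "Collection"
+ }
+}
+</pre>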
+
+h2. List context
+
+When a parameter is evaluated in a list context, that means its value should evaluate to a list instead of a string. Parameter values can be a static list (as demonstrated above), a path to a file, a path to a directory, or a JSON object describing a list context function.
+
+If the value is a static list, each list item is evaluated for parameter substitution and list functions.
+
+If the value is a string, it is interpreted as a path. If the path specifies a regular file, that file will be opened as a text file, producing a list with one item per line in the file (end-of-line characters are stripped). If the path specifies a directory, the result is a list containing all of the entries in the directory. Note that parameter expansion is not performed on lists produced this way.
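+
+For example, in the following sketch (assuming a hypothetical collection parameter @mylist@ containing a @.txt@ file with one name per line), each line of the file becomes a separate command argument:
+
+<pre>
+{
+ "command": ["echo", {"foreach": "lines", "command": "$(lines)"}],
+ "lines": "$(glob $(dir $(mylist))/*.txt)",
+ "mylist": {
+ "required": true,
+ "dataclass": "Collection"
+ }
+}
+</pre>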
+
+If the value is a JSON object, it is evaluated as a list function described below.
+
+h2. List functions
+
+When @run-command@ is evaluating a list (such as "command"), in addition to string parameter substitution, you can use list item functions. Note: in the following functions, you specify the name of a user parameter to act on; you cannot provide the list value directly inline.
+
+h3. foreach
+
+The @foreach@ list item function (not to be confused with the @task.foreach@ directive) expands a command template for each item in the specified user parameter (the value of the user parameter is evaluated in a list context, as described above). The following example will evaluate "command" to @["echo", "--something", "alice", "--something", "bob"]@:
+
+<pre>
+{
+ "command": ["echo", {"foreach": "a", "command": ["--something", "$(a)"]}],
+ "a": ["alice", "bob"]
+}
+</pre>
+
+h3. index
+
+This function extracts a single item from a list. The value of @index@ is zero-based (i.e. the first item is at index 0, the second item at index 1, and so on). The following example will evaluate "command" to @["echo", "--something", "bob"]@:
+
+<pre>
+{
+ "command": ["echo", {"list": "a", "index": 1, "command": ["--something", "$(a)"]}],
+ "a": ["alice", "bob"]
+}
+</pre>
+
+h3. filter
+
+Filter the list so that it only includes items that match a regular expression. The following example will evaluate to @["echo", "bob"]@:
+
+<pre>
+{
+ "command": ["echo", {"filter": "a", "regex": "b.*"}],
+ "a": ["alice", "bob"]
+}
+</pre>
+
+h3. group
+
+Generate a list of lists, where items are grouped on common subexpression match. Items which don't match the regular expression are excluded. The following example evaluates to @["echo", "--group", "alice", "carol", "dave", "--group", "bob"]@:
+
+<pre>
+{
+ "command": ["echo", {"foreach": "b", "command":["--group", {"foreach": "b", "command":"$(b)"}]}],
+ "a": ["alice", "bob", "carol", "dave"],
+ "b": {"group": "a", "regex": "[^a]*(a?).*"}
+}
+</pre>
+
+h3. extract
+
+Generate a list of lists, where items are split by subexpression match. Items which don't match the regular expression are excluded. The following example evaluates to @["echo", "c", "a", "rol", "d", "a", "ve"]@:
+
+<pre>
+{
+ "command": ["echo", {"foreach": "b", "command":[{"foreach": "b", "command":"$(b)"}]}],
+ "a": ["alice", "bob", "carol", "dave"],
+ "b": {"extract": "a", "regex": "(.+)(a)(.*)"}
+}
+</pre>
+
+h2. Directives
+
+Directives alter the behavior of @run-command@. All directives are optional.
+
+h3. task.cwd
+
+This directive sets the initial current working directory that your command will run in. If @task.cwd@ is not specified, the default current working directory is @task.outdir@.
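+
+A minimal sketch, assuming a hypothetical collection parameter @mydata@ (here the command runs inside the input collection directory instead of the output directory):
+
+<pre>
+{
+ "command": ["ls", "-l"],
+ "task.cwd": "$(dir $(mydata))",
+ "mydata": {
+ "required": true,
+ "dataclass": "Collection"
+ }
+}
+</pre>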
+
+h3. task.stdin and task.stdout
+
+Provide standard input and standard output redirection.
+
+@task.stdin@ must evaluate to a path to a file to be bound to the command's standard input stream.
+
+@task.stdout@ specifies the desired file name in the output directory to save the content of standard output.
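+
+A minimal sketch, reusing the file reference from the parameter substitution example above (the command's standard output is saved to @sorted.txt@ in the output directory):
+
+<pre>
+{
+ "command": ["sort"],
+ "task.stdin": "$(file $(input))",
+ "task.stdout": "sorted.txt",
+ "input": "c1bad4b39ca5a924e481008009d94e32+210/var-GS000016015-ASM.tsv.bz2"
+}
+</pre>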
+
+h3. task.vwd
+
+Background: Keep collections are read-only, which does not play well with certain tools that expect to be able to write their outputs alongside their inputs (such as tools that generate indexes that are closely associated with the original file). @run-command@'s solution to this is the "virtual working directory".
+
+@task.vwd@ specifies a Keep collection with the starting contents of the directory. @run-command@ will then populate @task.outdir@ with directories and symlinks to mirror the contents of the @task.vwd@ collection. Your command will then be able to both access its input files and write its output files in @task.outdir@. When the command completes, the output collection will merge the output of your command with the contents of the starting collection. Note that files in the starting collection remain read-only and cannot be altered or deleted.
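+
+A minimal sketch, assuming a hypothetical collection parameter @indexme@ containing a @reads.bam@ file (here @samtools index@, used purely as an example tool, writes @reads.bam.bai@ into the output directory next to the read-only symlink to the input):
+
+<pre>
+{
+ "command": ["samtools", "index", "reads.bam"],
+ "task.vwd": "$(indexme)",
+ "indexme": {
+ "required": true,
+ "dataclass": "Collection"
+ }
+}
+</pre>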
+
+h3. task.foreach
+
+Using @task.foreach@, you can run your command concurrently over large datasets.
+
+@task.foreach@ takes the names of one or more user-defined parameters. The values of these parameters are evaluated in a list context. @run-command@ then generates tasks based on the Cartesian product (i.e. all combinations) of the input lists. The outputs of all tasks are merged to create the final output collection. Note that if two tasks output a file in the same directory with the same name, that file will be concatenated in the final output. In the following example, three tasks will be created for the "echo" command, based on the contents of user parameter "a":
+
+<pre>
+{
+ "command": ["echo", "$(a)"],
+ "task.foreach": "a",
+ "a": ["alice", "bob", "carol"]
+}
+</pre>
+
+This evaluates to the commands:
+<notextile>
+<pre>
+["echo", "alice"]
+["echo", "bob"]
+["echo", "carol"]
+</pre>
+</notextile>
+
+You can also specify multiple parameters:
+
+<pre>
+{
+ "command": ["echo", "$(a)", "$(b)"],
+ "task.foreach": ["a", "b"],
+ "a": ["alice", "bob"],
+ "b": ["carol", "dave"]
+}
+</pre>
+
+This evaluates to the commands:
+
+<pre>
+["echo", "alice", "carol"]
+["echo", "alice", "dave"]
+["echo", "bob", "carol"]
+["echo", "bob", "dave"]
+</pre>
+
+h1. Examples
+
+The following is a single task pipeline using @run-command@ to run the bwa alignment tool to align a single paired-end read fastq sample. The input to this pipeline is the reference genome and a collection consisting of two fastq files for the read pair.
+
+<notextile>{% code 'run_command_simple_example' as javascript %}</notextile>
+
+The following is a concurrent task pipeline using @run-command@ to run the bwa alignment tool to align a set of fastq reads over multiple samples. The input to this pipeline is the reference genome and a collection consisting of subdirectories for each sample, with each subdirectory containing pairs of fastq files for each set of reads.
+
+<notextile>{% code 'run_command_foreach_example' as javascript %}</notextile>
{% include 'tutorial_expectations' %}
-Use the following command to create a new empty template using @arv pipeline_template create@:
+Use the following command to create an empty template using @arv create pipeline_template@:
<notextile>
-<pre><code>~$ <span class="userinput">arv edit $(arv --format=uuid pipeline_template create --pipeline-template '{}') name components </span></code></pre>
+<pre><code>~$ <span class="userinput">arv create pipeline_template</span></code></pre>
</notextile>
-* @--format=uuid@ option prints out just the unique identifier for the new template, instead of the entire template record (default)
-
-This will open the template record in an interactive text editor (as specified by $EDITOR or $VISUAL, otherwise defaults to @nano@) using @arv edit@. Now add the following content:
+This will open the template record in an interactive text editor (as specified by $EDITOR or $VISUAL, otherwise defaults to @nano@). Now, update the contents of the editor with the following content:
<notextile>{% code 'tutorial_bwa_sortsam_pipeline' as javascript %}</notextile>
When using @run-command@, the tool should write its output to the current working directory. The output will be automatically uploaded to Keep when the job completes.
+See the "run-command reference":{{site.baseurl}}/user/topics/run-command.html for more information about using @run-command@.
+
h2. Running your pipeline
Your new pipeline template should appear at the top of the Workbench "pipeline templates":https://{{ site.arvados_workbench_host }}/pipeline_templates page. You can run your pipeline "using Workbench":tutorial-pipeline-workbench.html or the "command line.":{{site.baseurl}}/user/topics/running-pipeline-command-line.html
* It is easy for existing tools to access files in Keep.
* Data is downloaded on demand. It is not necessary to download an entire file or collection to start processing.
-The default mode permits browsing any collection in Arvados as a subdirectory under the mount directory. To avoid having to fetch a potentially large list of all collections, collection directories only come into existence when explicitly accessed by their keep locator.
+The default mode permits browsing any collection in Arvados as a subdirectory under the mount directory. To avoid having to fetch a potentially large list of all collections, collection directories only come into existence when explicitly accessed by their Keep locator. For instance, a collection may be found by its content hash in the @keep/by_id@ directory.
<notextile>
<pre><code>~$ <span class="userinput">mkdir -p keep</span>
~$ <span class="userinput">arv-mount keep</span>
-~$ <span class="userinput">cd keep/c1bad4b39ca5a924e481008009d94e32+210</span>
-~/keep/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">ls</span>
+~$ <span class="userinput">cd keep/by_id/c1bad4b39ca5a924e481008009d94e32+210</span>
+~/keep/by_id/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">ls</span>
var-GS000016015-ASM.tsv.bz2
-~/keep/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">md5sum var-GS000016015-ASM.tsv.bz2</span>
+~/keep/by_id/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">md5sum var-GS000016015-ASM.tsv.bz2</span>
44b8ae3fde7a8a88d2f7ebd237625b4f var-GS000016015-ASM.tsv.bz2
-~/keep/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">cd ../..</span>
+~/keep/by_id/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">cd ../..</span>
~$ <span class="userinput">fusermount -u keep</span>
</code></pre>
</notextile>
+
+The last line unmounts Keep. Subdirectories will no longer be accessible.
+
+Within each collection directory in the Keep mount there is a hidden @.arvados#collection@ file that does not show up in @ls@ listings. Its contents include, for instance, the @portable_data_hash@, which is the same as the Keep locator.
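+
+For example, from inside the mount used above (a sketch; output omitted):
+
+<notextile>
+<pre><code>~/keep/by_id/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">cat ".arvados#collection"</span>
+</code></pre>
+</notextile>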
+++ /dev/null
----
-layout: default
-navsection: userguide
-title: "Writing a multi-step pipeline"
-...
-
-A pipeline in Arvados is a collection of crunch scripts, in which the output from one script may be used as the input to another script.
-
-{% include 'tutorial_expectations' %}
-
-
-First, use @arv pipeline_template create@ to create a new empty template. The @--format=uuid@ option will print out the unique identifier for the new template:
-
-<notextile>
-<pre><code>~$ <span class="userinput">arv --format=uuid pipeline_template create --pipeline-template '{}'</span>
-qr1hi-p5p6p-wt1vdhkezgx7g2k
-</span></code></pre>
-</notextile>
-
-Next, use @arv edit@ to edit the template. This will open the template record in an interactive text editor (as specified by $EDITOR or $VISUAL, otherwise defaults to @nano@). Replace the empty fields with the following content:
-
-<notextile>{% code 'tutorial_bwa_pipeline' as javascript %}</notextile>
-
-
-Your new pipeline template will appear on the Workbench "Pipeline templates":https://{{ site.arvados_workbench_host }}/pipeline_templates page.
-
-For more information and examples for writing pipelines, see the "pipeline template reference":{{site.baseurl}}/api/schema/PipelineTemplate.html
title: "Running a pipeline using Workbench"
...
-A "pipeline" (sometimes called a "workflow" in other systems) is a sequence of steps that apply various programs or tools to transform input data to output data. Pipelines are the principal means of performing computation with Arvados. This tutorial demonstrates how to run a single-stage pipeline to take a small data set of paired-end reads from a sample "exome":https://en.wikipedia.org/wiki/Exome in "FASTQ":https://en.wikipedia.org/wiki/FASTQ_format format and align them to "Chromosome 19":https://en.wikipedia.org/wiki/Chromosome_19_%28human%29 using the "bwa mem":http://bio-bwa.sourceforge.net/ tool, producing a "Sequence Alignment/Map (SAM)":https://samtools.github.io/ file. This will introduce the following Arvados features:
+A "pipeline" (sometimes called a "workflow" in other systems) is a sequence of steps that apply various programs or tools to transform input data to output data. Pipelines are the principal means of performing computation with Arvados. This tutorial demonstrates how to run a single-stage pipeline to take a small data set of paired-end reads from a sample "exome":https://en.wikipedia.org/wiki/Exome in "FASTQ":https://en.wikipedia.org/wiki/FASTQ_format format and align them to "Chromosome 19":https://en.wikipedia.org/wiki/Chromosome_19_%28human%29 using the "bwa mem":http://bio-bwa.sourceforge.net/ tool, producing a "Sequence Alignment/Map (SAM)":https://samtools.github.io/ file. This tutorial will introduce the following Arvados features:
<div class="inside-list">
-* How to create a project.
-* How to browse available pipeline templates and create a new pipeline from an existing template.
+* How to create a new pipeline from an existing template.
* How to browse and select input data for the pipeline and submit the pipeline to run on the Arvados cluster.
* How to access your pipeline results.
</div>
notextile. <div class="spaced-out">
# Start from the *Workbench Dashboard*. You can access the Dashboard by clicking on *<i class="fa fa-lg fa-fw fa-dashboard"></i> Dashboard* in the upper left corner of any Workbench page.
-# In the *My projects* panel, click the <span class="btn btn-sm btn-primary" > <i class="fa fa-fw fa-plus"></i> Add new project</span> button. The new project will be created immediately, and your browser opens the new project's page for you to customize it.
-# On the new project page, click on the pencil icon <i class="fa fa-fw fa-pencil"></i> next to *New project* to pop up a text box and change the project title to *Tutorial output*. Click the <span class="btn btn-xs btn-primary" ><i class="glyphicon glyphicon-ok"></i></span> checkbox button to save the new name.
# Click on the <span class="btn btn-sm btn-primary"><i class="fa fa-fw fa-gear"></i> Run a pipeline...</span> button. This will open a dialog box titled *Choose a pipeline to run*.
# Click to open the *All projects <span class="caret"></span>* menu. Under the *Projects shared with me* header, select *<i class="fa fa-fw fa-share-alt"></i> Arvados Tutorial*.
# Select *<i class="fa fa-fw fa-gear"></i> Tutorial align using bwa mem* and click the <span class="btn btn-sm btn-primary" >Next: choose inputs <i class="fa fa-fw fa-arrow-circle-right"></i></span> button. This will load a new page where you will supply the inputs for the pipeline.
# Once again, open the *All projects <span class="caret"></span>* menu and select *<i class="fa fa-fw fa-share-alt"></i> Arvados Tutorial*. Select *<i class="fa fa-fw fa-archive"></i> Tutorial chromosome 19 reference* and click the <span class="btn btn-sm btn-primary" >OK</span> button.
# Repeat the previous two steps to set the *Input genome (fastq)* parameter to *<i class="fa fa-fw fa-archive"></i> Tutorial sample exome*.
# Click on the <span class="btn btn-sm btn-primary" >Run <i class="fa fa-fw fa-play"></i></span> button. The page updates to show you that the pipeline has been submitted to run on the Arvados cluster.
-# After the pipeline starts running, you can track the progress by watching log messages from jobs. This page refreshes automatically. You will see a <span class="label label-success">success</span> label under the *job* the column when the pipeline completes successfully.
-# Click on *<i class="fa fa-fw fa-archive"></i> Show output files* to see the results of the job. This will load a new page listing the output files from this pipeline. You'll see the output SAM file from the alignment tool under the *Files* tab.
+# After the pipeline starts running, you can track the progress by watching log messages from jobs. This page refreshes automatically. You will see a <span class="label label-success">complete</span> label under the *job* column when the pipeline completes successfully.
+# Click on the *Output* link to see the results of the job. This will load a new page listing the output files from this pipeline. You'll see the output SAM file from the alignment tool under the *Files* tab.
# Click on the <span class="btn btn-sm btn-info"><i class="fa fa-download"></i></span> download button to the right of the SAM file to download your results.
notextile. </div>
h2. Create a pipeline template
-Next, create a file that contains the pipeline definition:
+Next, create a new template using @arv create pipeline_template@:
<notextile>
-<pre><code>~/$USER/crunch_scripts$ <span class="userinput">cd ~</span>
-~$ <span class="userinput">cat >the_pipeline <<EOF
-{
- "name":"My md5 pipeline",
- "components":{
- "do_hash":{
- "script":"hash.py",
- "script_parameters":{
- "input":{
- "required": true,
- "dataclass": "Collection"
- }
- },
- "repository":"$USER",
- "script_version":"master",
- "runtime_constraints":{
- "docker_image":"arvados/jobs-java-bwa-samtools"
- }
- }
- }
-}
-EOF
-</span></code></pre>
+<pre><code>~$ <span class="userinput">arv create pipeline_template</span></code></pre>
</notextile>
+In the editor, enter the following template:
+
+<notextile> {% code 'tutorial_submit_job' as javascript %} </notextile>
+
* @"repository"@ is the name of a git repository to search for the script version. You can access a list of available git repositories on the Arvados Workbench under "Code repositories":https://{{site.arvados_workbench_host}}/repositories.
* @"script_version"@ specifies the version of the script that you wish to run. This can be in the form of an explicit Git revision hash, a tag, or a branch (in which case it will use the HEAD of the specified branch). Arvados logs the script version that was used in the run, enabling you to go back and re-run any past job with the guarantee that the exact same code will be used as was used in the previous run.
* @"script"@ specifies the filename of the script to run. Crunch expects to find this in the @crunch_scripts/@ subdirectory of the Git repository.
-
-Now, use @arv pipeline_template create@ to register your pipeline template in Arvados:
-
-<notextile>
-<pre><code>~$ <span class="userinput">arv pipeline_template create --pipeline-template "$(cat the_pipeline)"</span>
-</code></pre>
-</notextile>
+* @"runtime_constraints"@ describes the runtime environment required to run the job. These are described in the "job record schema":{{site.baseurl}}/api/schema/Job.html
h2. Running your pipeline
require 'oj'
require 'active_support/inflector'
require 'yaml'
+ require 'tempfile'
rescue LoadError
abort <<-EOS
end
end
-subcommands = %w(keep pipeline tag ws edit)
+subcommands = %w(create edit keep pipeline tag ws)
def check_subcommands client, arvados, subcommand, global_opts, remaining_opts
case subcommand
+ when 'create'
+ arv_create client, arvados, global_opts, remaining_opts
+ when 'edit'
+ arv_edit client, arvados, global_opts, remaining_opts
when 'keep'
@sub = remaining_opts.shift
if ['get', 'put', 'ls', 'normalize'].index @sub then
exec `which arv-tag`.strip, *remaining_opts
when 'ws'
exec `which arv-ws`.strip, *remaining_opts
- when 'edit'
- arv_edit client, arvados, global_opts, remaining_opts
end
end
puts "Saved contents to " + tmp.path + ".saved"
end
+def command_exists?(command)
+ ENV['PATH'].split(':').any? {|folder| File.executable?(File.join(folder, command))}
+end
+
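+# Open tmp_file in the user's preferred editor, then parse the edited content
+# as JSON or YAML (per global_opts[:format]). On a parse error, offer to
+# re-edit. Returns the parsed object.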
+def run_editor tmp_file, global_opts
+ need_edit = true
+ while need_edit
+ pid = Process::fork
+ if pid.nil?
+ editor = nil
+ [ENV["VISUAL"], ENV["EDITOR"], "nano", "vi"].each do |e|
+ editor ||= e if e and command_exists? e
+ end
+ if editor.nil?
+ puts "Could not find any editor to use, please set $VISUAL or $EDITOR to your desired editor."
+ exit 1
+ end
+ exec editor, tmp_file.path
+ else
+ Process.wait pid
+ end
+
+ if $?.exitstatus == 0
+ tmp_file.open
+ newcontent = tmp_file.read()
+
+ newobj = {}
+ begin
+ case global_opts[:format]
+ when 'json'
+ newobj = Oj.load(newcontent)
+ when 'yaml'
+ newobj = YAML.load(newcontent)
+ end
+ need_edit = false
+ rescue Exception => e
+ n = 1
+ newcontent.each_line do |line|
+ puts "#{n.to_s.rjust 4} #{line}"
+ n += 1
+ end
+ puts "Parse error! " + e.to_s
+ puts "\nTry again (y/n)? "
+ yn = "X"
+ while not ["y", "Y", "n", "N"].include?(yn)
+ yn = $stdin.read 1
+ end
+ if yn == 'n' or yn == 'N'
+ arv_edit_save_tmp tmp_file
+ abort
+ end
+ end
+ else
+ puts "Editor exited with status #{$?.exitstatus}"
+ exit $?.exitstatus
+ end
+ end
+
+ newobj
+end
+
def arv_edit client, arvados, global_opts, remaining_opts
uuid = remaining_opts.shift
if uuid.nil? or uuid == "-h" or uuid == "--help"
content = results.to_yaml
end
- require 'tempfile'
-
- tmp = Tempfile.new([uuid, "." + global_opts[:format]])
- tmp.write(content)
- tmp.close
-
- need_edit = true
-
- while need_edit
- pid = Process::fork
- if pid.nil?
- editor ||= ENV["VISUAL"]
- editor ||= ENV["EDITOR"]
- editor ||= "nano"
- exec editor, tmp.path
- else
- Process.wait pid
- end
-
- if $?.exitstatus == 0
- tmp.open
- newcontent = tmp.read()
+ tmp_file = Tempfile.new([uuid, "." + global_opts[:format]])
+ tmp_file.write(content)
+ tmp_file.close
- newobj = {}
- begin
- case global_opts[:format]
- when 'json'
- newobj = Oj.load(newcontent)
- when 'yaml'
- newobj = YAML.load(newcontent)
- end
- need_edit = false
- rescue Exception => e
- puts "Parse error! " + e.to_s
- n = 1
- newcontent.each_line do |line|
- puts "#{n.to_s.rjust 4} #{line}"
- n += 1
- end
- puts "\nTry again (y/n)? "
- yn = "X"
- while not ["y", "Y", "n", "N"].include?(yn)
- yn = $stdin.read 1
- end
- if yn == 'n' or yn == 'N'
- arv_edit_save_tmp tmp
- abort
- end
- end
- else
- puts "Editor exited with status #{$?.exitstatus}"
- exit $?.exitstatus
- end
- end
+ newobj = run_editor tmp_file, global_opts
begin
if newobj != results
puts "Error communicating with server, error was #{e}"
puts "Update body was:"
puts dumped
- arv_edit_save_tmp tmp
+ arv_edit_save_tmp tmp_file
abort
end
begin
results = JSON.parse result.body
rescue JSON::ParserError => e
+ arv_edit_save_tmp tmp_file
abort "Failed to parse server response:\n" + e.to_s
end
puts "Update failed. Server responded #{result.response.status}: #{results['errors']} "
puts "Update body was:"
puts dumped
- arv_edit_save_tmp tmp
+ arv_edit_save_tmp tmp_file
abort
end
else
puts "Object is unchanged, did not update."
end
ensure
- tmp.close(true)
+ tmp_file.close(true)
+ end
+
+ exit 0
+end
+
+def arv_create client, arvados, global_opts, remaining_opts
+ types = resource_types(arvados.discovery_document)
+ create_opts = Trollop::options do
+ opt :project_uuid, "Project uuid in which to create the object", :type => :string
+ stop_on types
+ end
+
+ object_type = remaining_opts.shift
+ if object_type.nil?
+ abort "Missing resource type, must be one of #{types.join ', '}"
+ end
+
+ rsc = arvados.discovery_document["resources"].keys.select { |k| object_type == k.singularize }
+ if rsc.empty?
+ abort "Could not determine resource type #{object_type}"
+ end
+ rsc = rsc.first
+
+ discovered_params = arvados.discovery_document["resources"][rsc]["methods"]["create"]["parameters"]
+ method_opts = Trollop::options do
+ banner head_banner
+ banner "Usage: arv create [--project-uuid] #{object_type} [create parameters]"
+ banner ""
+ banner "This method supports the following parameters:"
+ banner ""
+ discovered_params.each do |k,v|
+ opts = Hash.new()
+ opts[:type] = v["type"].to_sym if v.include?("type")
+ if [:datetime, :text, :object, :array].index opts[:type]
+ opts[:type] = :string # else trollop bork
+ end
+ opts[:default] = v["default"] if v.include?("default")
+ opts[:default] = v["default"].to_i if opts[:type] == :integer
+ opts[:default] = to_boolean(v["default"]) if opts[:type] == :boolean
+ opts[:required] = true if v.include?("required") and v["required"]
+ description = ''
+ description = ' ' + v["description"] if v.include?("description")
+ opt k.to_sym, description, opts
+ end
+ end
+
+
+ newobj = {}
+ if create_opts[:project_uuid]
+ newobj["owner_uuid"] = create_opts[:project_uuid]
+ end
+
+ case global_opts[:format]
+ when 'json'
+ content = Oj.dump(newobj, :indent => 1)
+ when 'yaml'
+ content = newobj.to_yaml
+ end
+
+ tmp_file = Tempfile.new(["", ".#{global_opts[:format]}"])
+ tmp_file.write(content)
+ tmp_file.close
+
+ newobj = run_editor tmp_file, global_opts
+
+ begin
+ api_method = 'arvados.' + rsc + '.create'
+ dumped = Oj.dump(newobj)
+
+ result = client.execute(:api_method => eval(api_method),
+ :parameters => method_opts,
+ :body_object => {object_type => newobj},
+ :authenticated => false,
+ :headers => {
+ authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN']
+ })
+
+ begin
+ results = JSON.parse result.body
+ rescue JSON::ParserError => e
+ arv_edit_save_tmp tmp_file
+ abort "Failed to parse server response:\n" + e.to_s
+ end
+
+ if result.response.status != 200
+ puts "Create failed. Server responded #{result.response.status}: #{results['errors']} "
+ puts "Create body was:"
+ puts dumped
+ arv_edit_save_tmp tmp_file
+ abort
+ end
+
+ begin
+ puts "Created object #{results['uuid']}"
+ rescue
+ arv_edit_save_tmp tmp_file
+ abort "Unexpected response:\n#{results}"
+ end
+ ensure
+ tmp_file.close(true)
end
exit 0
exit 255
end
-def parse_arguments(discovery_document, subcommands)
+def resource_types discovery_document
resource_types = Array.new()
discovery_document["resources"].each do |k,v|
resource_types << k.singularize
end
+ resource_types
+end
- resource_types += subcommands
+def parse_arguments(discovery_document, subcommands)
+ resources_and_subcommands = resource_types(discovery_document) + subcommands
option_parser = Trollop::Parser.new do
version __FILE__
banner "Additional options:"
conflicts :short, :format
- stop_on resource_types
+ stop_on resources_and_subcommands
end
global_opts = Trollop::with_standard_exception_handling option_parser do
resource = ARGV.shift
if not subcommands.include? resource
- if not resource_types.include?(resource)
+ if not resources_and_subcommands.include?(resource)
puts "Resource or subcommand '#{resource}' is not recognized.\n\n" if !resource.nil?
help_resources(option_parser, discovery_document, resource)
end
if value.nil? and
![false,'false',0,'0'].index parameter[:required]
if parameter[:output_of]
+ if not @components[parameter[:output_of].intern]
+ errors << [componentname, parametername, "output_of refers to nonexistent component '#{parameter[:output_of]}'"]
+ end
next
end
errors << [componentname, parametername, "required parameter is missing"]
end
debuglog "parameter #{componentname}::#{parametername} == #{value}"
- component[:script_parameters][parametername] = value
+
+ component[:script_parameters][parametername] =
+ parameter.dup.merge(value: value)
end
end
if !errors.empty?
my_submit_id = "instance #{@instance[:uuid]} rand #{rand(2**64).to_s(36)}"
job = JobCache.create(@instance, cname, {
:script => c[:script],
- :script_parameters => c[:script_parameters],
+ :script_parameters => Hash[c[:script_parameters].map do |key, spec|
+ [key, spec[:value]]
+ end],
:script_version => c[:script_version],
:repository => c[:repository],
:nondeterministic => c[:nondeterministic],
c2[:script_parameters].each do |pname, p|
if p.is_a? Hash and p[:output_of] == cname.to_s
debuglog "parameter #{c2name}::#{pname} == #{c[:job][:output]}"
- c2[:script_parameters][pname] = c[:job][:output]
+ c2[:script_parameters][pname] = {value: c[:job][:output]}
moretodo = true
end
end
Obtain job details from Arvados, run tasks on compute nodes (typically
invoked by scheduler on controller):
- crunch-job --job x-y-z
+ crunch-job --job x-y-z --git-dir /path/to/repo/.git
Obtain job details from command line, run tasks on local machine
(typically invoked by application or developer on VM):
- crunch-job --job '{"script_version":"/path/to/tree","script":"scriptname",...}'
+ crunch-job --job '{"script_version":"/path/to/working/tree","script":"scriptname",...}'
+
+ crunch-job --job '{"repository":"https://github.com/curoverse/arvados.git","script_version":"master","script":"scriptname",...}'
=head1 OPTIONS
=item --git-dir
-Path to .git directory where the specified commit is found.
+Path to a .git directory (or a git URL) where the commit given in the
+job's C<script_version> attribute is to be found. If this is I<not>
+given, the job's C<repository> attribute will be used.
=item --job-api-token
setup. This can speed up development and debugging when running jobs
locally.
+=item --job
+
+UUID of the job to run, or a JSON-encoded job resource without a
+UUID. If the latter is given, a new job object will be created.
+
=back
=head1 RUNNING JOBS LOCALLY
use IO::Select;
use File::Temp;
use Fcntl ':flock';
-use File::Path qw( make_path );
+use File::Path qw( make_path remove_tree );
use constant EX_TEMPFAIL => 75;
}
my $have_slurm = exists $ENV{SLURM_JOBID} && exists $ENV{SLURM_NODELIST};
-my $job_has_uuid = $jobspec =~ /^[-a-z\d]+$/;
-my $local_job = !$job_has_uuid;
+my $local_job = 0;
$SIG{'USR1'} = sub
my $arv = Arvados->new('apiVersion' => 'v1');
-my $User = $arv->{'users'}->{'current'}->execute;
-
-my $Job = {};
+my $Job;
my $job_id;
my $dbh;
my $sth;
-if ($job_has_uuid)
+my @jobstep;
+
+my $User = retry_op(sub { $arv->{'users'}->{'current'}->execute; });
+
+if ($jobspec =~ /^[-a-z\d]+$/)
{
- $Job = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
+ # $jobspec is an Arvados UUID, not a JSON job specification
+ $Job = retry_op(sub {
+ $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
+ });
if (!$force_unlock) {
# Claim this job, and make sure nobody else does
- eval {
+ eval { retry_op(sub {
# lock() sets is_locked_by_uuid and changes state to Running.
$arv->{'jobs'}->{'lock'}->execute('uuid' => $Job->{'uuid'})
- };
+ }); };
if ($@) {
Log(undef, "Error while locking job, exiting ".EX_TEMPFAIL);
exit EX_TEMPFAIL;
$Job->{'is_locked_by_uuid'} = $User->{'uuid'};
$Job->{'started_at'} = gmtime;
+ $Job->{'state'} = 'Running';
- $Job = $arv->{'jobs'}->{'create'}->execute('job' => $Job);
-
- $job_has_uuid = 1;
+ $Job = retry_op(sub { $arv->{'jobs'}->{'create'}->execute('job' => $Job); });
}
$job_id = $Job->{'uuid'};
$ENV{"JOB_UUID"} = $job_id;
-my @jobstep;
my @jobstep_todo = ();
my @jobstep_done = ();
my @jobstep_tomerge = ();
}
else
{
- my $first_task = $arv->{'job_tasks'}->{'create'}->execute('job_task' => {
- 'job_uuid' => $Job->{'uuid'},
- 'sequence' => 0,
- 'qsequence' => 0,
- 'parameters' => {},
- });
+ my $first_task = retry_op(sub {
+ $arv->{'job_tasks'}->{'create'}->execute('job_task' => {
+ 'job_uuid' => $Job->{'uuid'},
+ 'sequence' => 0,
+ 'qsequence' => 0,
+ 'parameters' => {},
+ });
+ });
push @jobstep, { 'level' => 0,
'failures' => 0,
'arvados_task' => $first_task,
my $build_script;
+do {
+ local $/ = undef;
+ $build_script = <DATA>;
+};
+my $nodelist = join(",", @node);
+if (!defined $no_clear_tmp) {
+ # Clean out crunch_tmp/work, crunch_tmp/opt, crunch_tmp/src*
+ Log (undef, "Clean work dirs");
-$ENV{"CRUNCH_SRC_COMMIT"} = $Job->{script_version};
-
-my $skip_install = ($local_job && $Job->{script_version} =~ m{^/});
-if ($skip_install)
-{
- if (!defined $no_clear_tmp) {
- my $clear_tmp_cmd = 'rm -rf $JOB_WORK $CRUNCH_TMP/opt $CRUNCH_TMP/src*';
- system($clear_tmp_cmd) == 0
- or croak ("`$clear_tmp_cmd` failed: ".($?>>8));
- }
- $ENV{"CRUNCH_SRC"} = $Job->{script_version};
- for my $src_path ("$ENV{CRUNCH_SRC}/arvados/sdk/python") {
- if (-d $src_path) {
- system("virtualenv", "$ENV{CRUNCH_TMP}/opt") == 0
- or croak ("virtualenv $ENV{CRUNCH_TMP}/opt failed: exit ".($?>>8));
- system ("cd $src_path && ./build.sh && \$CRUNCH_TMP/opt/bin/python setup.py install")
- == 0
- or croak ("setup.py in $src_path failed: exit ".($?>>8));
- }
+ my $cleanpid = fork();
+ if ($cleanpid == 0)
+ {
+ srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
+ ['bash', '-c', 'if mount | grep -q $JOB_WORK/; then for i in $JOB_WORK/*keep; do /bin/fusermount -z -u $i; done; fi; sleep 1; rm -rf $JOB_WORK $CRUNCH_TMP/opt $CRUNCH_TMP/src*']);
+ exit (1);
}
+ while (1)
+ {
+ last if $cleanpid == waitpid (-1, WNOHANG);
+ freeze_if_want_freeze ($cleanpid);
+ select (undef, undef, undef, 0.1);
+ }
+ Log (undef, "Cleanup command exited ".exit_status_s($?));
}
-else
-{
- do {
- local $/ = undef;
- $build_script = <DATA>;
- };
- Log (undef, "Install revision ".$Job->{script_version});
- my $nodelist = join(",", @node);
-
- if (!defined $no_clear_tmp) {
- # Clean out crunch_tmp/work, crunch_tmp/opt, crunch_tmp/src*
- my $cleanpid = fork();
- if ($cleanpid == 0)
- {
- srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
- ['bash', '-c', 'if mount | grep -q $JOB_WORK/; then for i in $JOB_WORK/*keep; do /bin/fusermount -z -u $i; done; fi; sleep 1; rm -rf $JOB_WORK $CRUNCH_TMP/opt $CRUNCH_TMP/src*']);
- exit (1);
- }
- while (1)
- {
- last if $cleanpid == waitpid (-1, WNOHANG);
- freeze_if_want_freeze ($cleanpid);
- select (undef, undef, undef, 0.1);
- }
- Log (undef, "Clean-work-dir exited $?");
- }
- # Install requested code version
+my $git_archive;
+if (!defined $git_dir && $Job->{'script_version'} =~ m{^/}) {
+ # If script_version looks like an absolute path, *and* the --git-dir
+ # argument was not given -- which implies we were not invoked by
+ # crunch-dispatch -- we will use the given path as a working
+ # directory instead of resolving script_version to a git commit (or
+ # doing anything else with git).
+ $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{'script_version'};
+ $ENV{"CRUNCH_SRC"} = $Job->{'script_version'};
+}
+else {
+ # Resolve the given script_version to a git commit sha1. Also, if
+ # the repository is remote, clone it into our local filesystem: this
+ # ensures "git archive" will work, and is necessary to reliably
+ # resolve a symbolic script_version like "master^".
+ $ENV{"CRUNCH_SRC"} = "$ENV{CRUNCH_TMP}/src";
- my @execargs;
- my @srunargs = ("srun",
- "--nodelist=$nodelist",
- "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
+ Log (undef, "Looking for version ".$Job->{script_version}." from repository ".$Job->{repository});
$ENV{"CRUNCH_SRC_COMMIT"} = $Job->{script_version};
- $ENV{"CRUNCH_SRC"} = "$ENV{CRUNCH_TMP}/src";
-
- my $commit;
- my $git_archive;
- my $treeish = $Job->{'script_version'};
- # If we're running under crunch-dispatch, it will have pulled the
- # appropriate source tree into its own repository, and given us that
- # repo's path as $git_dir. If we're running a "local" job, and a
- # script_version was specified, it's up to the user to provide the
- # full path to a local repository in Job->{repository}.
+ # If we're running under crunch-dispatch, it will have already
+ # pulled the appropriate source tree into its own repository, and
+ # given us that repo's path as $git_dir.
#
- # TODO: Accept URLs too, not just local paths. Use git-ls-remote and
- # git-archive --remote where appropriate.
+ # If we're running a "local" job, we might have to fetch content
+ # from a remote repository.
#
- # TODO: Accept a locally-hosted Arvados repository by name or
- # UUID. Use arvados.v1.repositories.list or .get to figure out the
- # appropriate fetch-url.
- my $repo = $git_dir || $ENV{'CRUNCH_DEFAULT_GIT_DIR'} || $Job->{'repository'};
-
+ # (Currently crunch-dispatch gives a local path with --git-dir, but
+ # we might as well accept URLs there too in case it changes its
+ # mind.)
+ my $repo = $git_dir || $Job->{'repository'};
+
+ # Repository can be remote or local. If remote, we'll need to fetch it
+ # to a local dir before doing `git log` et al.
+ my $repo_location;
+
+ if ($repo =~ m{://|^[^/]*:}) {
+ # $repo is a git url we can clone, like git:// or https:// or
+ # file:/// or [user@]host:repo.git. Note "user/name@host:foo" is
+ # not recognized here because distinguishing that from a local
+ # path is too fragile. If you really need something strange here,
+ # use the ssh:// form.
+ $repo_location = 'remote';
+ } elsif ($repo =~ m{^\.*/}) {
+ # $repo is a local path to a git index. We'll also resolve ../foo
+ # to ../foo/.git if the latter is a directory. To help
+ # disambiguate local paths from named hosted repositories, this
+ # form must be given as ./ or ../ if it's a relative path.
+ if (-d "$repo/.git") {
+ $repo = "$repo/.git";
+ }
+ $repo_location = 'local';
+ } else {
+ # $repo is none of the above. It must be the name of a hosted
+ # repository.
+ my $arv_repo_list = retry_op(sub {
+ $arv->{'repositories'}->{'list'}->execute(
+ 'filters' => [['name','=',$repo]]);
+ });
+ my @repos_found = @{$arv_repo_list->{'items'}};
+ my $n_found = $arv_repo_list->{'serverResponse'}->{'items_available'};
+ if ($n_found > 0) {
+ Log(undef, "Repository '$repo' -> "
+ . join(", ", map { $_->{'uuid'} } @repos_found));
+ }
+ if ($n_found != 1) {
+ croak("Error: Found $n_found repositories with name '$repo'.");
+ }
+ $repo = $repos_found[0]->{'fetch_url'};
+ $repo_location = 'remote';
+ }
+ Log(undef, "Using $repo_location repository '$repo'");
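
As an aside, the three-way branch above is easier to audit when restated outside Perl. A minimal Python sketch of the same classification (the function name is ours, not part of the patch):

    import re

    def classify_repo(repo):
        # git://, https://, file:///, or [user@]host:repo.git
        if re.search(r'://', repo) or re.match(r'[^/]*:', repo):
            return 'remote'
        # /abs/path, ./rel/path, or ../rel/path to a git index
        elif re.match(r'\.*/', repo):
            return 'local'
        # anything else is treated as the name of a hosted repository
        else:
            return 'hosted'
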
$ENV{"CRUNCH_SRC_URL"} = $repo;
- if (-d "$repo/.git") {
- # We were given a working directory, but we are only interested in
- # the index.
- $repo = "$repo/.git";
- }
+ # Resolve given script_version (we'll call that $treeish here) to a
+ # commit sha1 ($commit).
+ my $treeish = $Job->{'script_version'};
+ my $commit;
+ if ($repo_location eq 'remote') {
+ # We minimize excess object-fetching by re-using the same bare
+ # repository in CRUNCH_TMP/.git for multiple crunch-jobs -- we
+ # just keep adding remotes to it as needed.
+ my $local_repo = $ENV{'CRUNCH_TMP'}."/.git";
+ my $gitcmd = "git --git-dir=\Q$local_repo\E";
+
+ # Set up our local repo for caching remote objects, making
+ # archives, etc.
+ if (!-d $local_repo) {
+ make_path($local_repo) or croak("Error: could not create $local_repo");
+ }
+ # This works (exits 0 and doesn't delete fetched objects) even
+ # if $local_repo is already initialized:
+ `$gitcmd init --bare`;
+ if ($?) {
+ croak("Error: $gitcmd init --bare exited ".exit_status_s($?));
+ }
+
+ # If $treeish looks like a hash (or abbrev hash) we look it up in
+ # our local cache first, since that's cheaper. (We don't want to
+ # do that with tags/branches though -- those change over time, so
+ # they should always be resolved by the remote repo.)
+ if ($treeish =~ /^[0-9a-f]{7,40}$/s) {
+ # Hide stderr because it's normal for this to fail:
+ my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E 2>/dev/null`;
+ if ($? == 0 &&
+ # Careful not to resolve a branch named abcdeff to commit 1234567:
+ $sha1 =~ /^$treeish/ &&
+ $sha1 =~ /^([0-9a-f]{40})$/s) {
+ $commit = $1;
+ Log(undef, "Commit $commit already present in $local_repo");
+ }
+ }
+
+ if (!defined $commit) {
+ # If $treeish isn't just a hash or abbrev hash, or isn't here
+ # yet, we need to fetch the remote to resolve it correctly.
- # If this looks like a subversion r#, look for it in git-svn commit messages
+ # First, remove all local heads. This prevents a name that does
+ # not exist on the remote from resolving to (or colliding with)
+ # a previously fetched branch or tag (possibly from a different
+ # remote).
+ remove_tree("$local_repo/refs/heads", {keep_root => 1});
- if ($treeish =~ m{^\d{1,4}$}) {
- my $gitlog = `git --git-dir=\Q$repo\E log --pretty="format:%H" --grep="git-svn-id:.*\@"\Q$treeish\E" " master`;
- chomp $gitlog;
- Log(undef, "git Subversion search exited $?");
- if (($? == 0) && ($gitlog =~ /^[a-f0-9]{40}$/)) {
- $commit = $gitlog;
- Log(undef, "Using commit $commit for Subversion revision $treeish");
+ Log(undef, "Fetching objects from $repo to $local_repo");
+ `$gitcmd fetch --no-progress --tags ''\Q$repo\E \Q+refs/heads/*:refs/heads/*\E`;
+ if ($?) {
+ croak("Error: `$gitcmd fetch` exited ".exit_status_s($?));
+ }
}
+
+ # Now that the data is all here, we will use our local repo for
+ # the rest of our git activities.
+ $repo = $local_repo;
}
- # If that didn't work, try asking git to look it up as a tree-ish.
-
- if (!defined $commit) {
- my $found = `git --git-dir=\Q$repo\E rev-list -1 ''\Q$treeish\E`;
- chomp $found;
- Log(undef, "git rev-list exited $? with result '$found'");
- if (($? == 0) && ($found =~ /^[0-9a-f]{40}$/s)) {
- $commit = $found;
- Log(undef, "Using commit $commit for tree-ish $treeish");
- if ($commit ne $treeish) {
- # Make sure we record the real commit id in the database,
- # frozentokey, logs, etc. -- instead of an abbreviation or a
- # branch name which can become ambiguous or point to a
- # different commit in the future.
- $Job->{'script_version'} = $commit;
- !$job_has_uuid or
- $Job->update_attributes('script_version' => $commit) or
- croak("Error while updating job");
- }
+ my $gitcmd = "git --git-dir=\Q$repo\E";
+ my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E`;
+ unless ($? == 0 && $sha1 =~ /^([0-9a-f]{40})$/) {
+ croak("`$gitcmd rev-list` exited "
+ .exit_status_s($?)
+ .", '$treeish' not found. Giving up.");
+ }
+ $commit = $1;
+ Log(undef, "Version $treeish is commit $commit");
+
+ if ($commit ne $Job->{'script_version'}) {
+ # Record the real commit id in the database, frozentokey, logs,
+ # etc. -- instead of an abbreviation or a branch name which can
+ # become ambiguous or point to a different commit in the future.
+ if (!$Job->update_attributes('script_version' => $commit)) {
+ croak("Error: failed to update job's script_version attribute");
}
}
- if (defined $commit) {
- $ENV{"CRUNCH_SRC_COMMIT"} = $commit;
- @execargs = ("sh", "-c",
- "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
- $git_archive = `git --git-dir=\Q$repo\E archive ''\Q$commit\E`;
- croak("git archive failed: exit " . ($? >> 8)) if ($? != 0);
+ $ENV{"CRUNCH_SRC_COMMIT"} = $commit;
+ $git_archive = `$gitcmd archive ''\Q$commit\E`;
+ if ($?) {
+ croak("Error: $gitcmd archive exited ".exit_status_s($?));
}
- else {
- croak ("could not figure out commit id for $treeish");
+}
+
+if (!defined $git_archive) {
+ Log(undef, "Skip install phase (no git archive)");
+ if ($have_slurm) {
+ Log(undef, "Warning: This probably means workers have no source tree!");
}
+}
+else {
+ Log(undef, "Run install script on all workers");
+
+ my @srunargs = ("srun",
+ "--nodelist=$nodelist",
+ "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
+ my @execargs = ("sh", "-c",
+ "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
# Note: this section is almost certainly unnecessary if we're
# running tasks in docker containers.
freeze_if_want_freeze ($installpid);
select (undef, undef, undef, 0.1);
}
- Log (undef, "Install exited $?");
+ Log (undef, "Install script exited ".exit_status_s($?));
}
if (!$have_slurm)
}
if ($? != 0)
{
- croak("Installing Docker image from $docker_locator returned exit code $?");
+ croak("Installing Docker image from $docker_locator exited "
+ .exit_status_s($?));
}
}
while (my $manifest_line = <$orig_manifest>) {
$orig_manifest_text .= $manifest_line;
}
- my $output = $arv->{'collections'}->{'create'}->execute('collection' => {
- 'manifest_text' => $orig_manifest_text,
+ my $output = retry_op(sub {
+ $arv->{'collections'}->{'create'}->execute(
+ 'collection' => {'manifest_text' => $orig_manifest_text});
});
Log(undef, "output uuid " . $output->{uuid});
Log(undef, "output hash " . $output->{portable_data_hash});
- $Job->update_attributes('output' => $output->{portable_data_hash}) if $job_has_uuid;
+ $Job->update_attributes('output' => $output->{portable_data_hash});
};
if ($@) {
Log (undef, "Failed to register output manifest: $@");
save_meta();
-if ($job_has_uuid) {
- if ($collated_output && $main::success) {
- $Job->update_attributes('state' => 'Complete')
- } else {
- $Job->update_attributes('state' => 'Failed')
- }
+my $final_state;
+if ($collated_output && $main::success) {
+ $final_state = 'Complete';
+} else {
+ $final_state = 'Failed';
}
+$Job->update_attributes('state' => $final_state);
-exit ($Job->{'state'} != 'Complete' ? 1 : 0);
+exit (($final_state eq 'Complete') ? 0 : 1);
$Job->{'tasks_summary'}->{'todo'} = $todo;
$Job->{'tasks_summary'}->{'done'} = $done;
$Job->{'tasks_summary'}->{'running'} = $running;
- if ($job_has_uuid) {
- $Job->update_attributes('tasks_summary' => $Job->{'tasks_summary'});
- }
+ $Job->update_attributes('tasks_summary' => $Job->{'tasks_summary'});
Log (undef, "status: $done done, $running running, $todo todo");
$progress_is_dirty = 0;
}
my $childstatus = $?;
my $exitvalue = $childstatus >> 8;
- my $exitinfo = sprintf("exit %d signal %d%s",
- $exitvalue,
- $childstatus & 127,
- ($childstatus & 128 ? ' core dump' : ''));
+ my $exitinfo = "exit ".exit_status_s($childstatus);
$Jobstep->{'arvados_task'}->reload;
my $task_success = $Jobstep->{'arvados_task'}->{success};
$main::success = 0;
$main::please_freeze = 1;
}
- else {
- # Put this task back on the todo queue
- push @jobstep_todo, $jobstepid;
- }
+ # Put this task back on the todo queue
+ push @jobstep_todo, $jobstepid;
$Job->{'tasks_summary'}->{'failed'}++;
}
else
my $newtask_list = [];
my $newtask_results;
do {
- $newtask_results = $arv->{'job_tasks'}->{'list'}->execute(
- 'where' => {
- 'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
- },
- 'order' => 'qsequence',
- 'offset' => scalar(@$newtask_list),
- );
+ $newtask_results = retry_op(sub {
+ $arv->{'job_tasks'}->{'list'}->execute(
+ 'where' => {
+ 'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
+ },
+ 'order' => 'qsequence',
+ 'offset' => scalar(@$newtask_list),
+ );
+ });
push(@$newtask_list, @{$newtask_results->{items}});
} while (@{$newtask_results->{items}});
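
The do/while above pages through job_tasks with a growing offset until an empty page comes back. An equivalent Python loop, assuming a hypothetical `api` client and `task_uuid`:

    new_tasks = []
    while True:
        page = api.job_tasks().list(
            where={'created_by_job_task_uuid': task_uuid},
            order='qsequence',
            offset=len(new_tasks)).execute()
        if not page['items']:
            break
        new_tasks.extend(page['items'])
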
foreach my $arvados_task (@$newtask_list) {
my @stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
if (@stat && $stat[9] > $latest_refresh) {
$latest_refresh = scalar time;
- if ($job_has_uuid) {
- my $Job2 = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
- for my $attr ('cancelled_at',
- 'cancelled_by_user_uuid',
- 'cancelled_by_client_uuid',
- 'state') {
- $Job->{$attr} = $Job2->{$attr};
- }
- if ($Job->{'state'} ne "Running") {
- if ($Job->{'state'} eq "Cancelled") {
- Log (undef, "Job cancelled at " . $Job->{'cancelled_at'} . " by user " . $Job->{'cancelled_by_user_uuid'});
- } else {
- Log (undef, "Job state unexpectedly changed to " . $Job->{'state'});
- }
- $main::success = 0;
- $main::please_freeze = 1;
+ my $Job2 = retry_op(sub {
+ $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
+ });
+ for my $attr ('cancelled_at',
+ 'cancelled_by_user_uuid',
+ 'cancelled_by_client_uuid',
+ 'state') {
+ $Job->{$attr} = $Job2->{$attr};
+ }
+ if ($Job->{'state'} ne "Running") {
+ if ($Job->{'state'} eq "Cancelled") {
+ Log (undef, "Job cancelled at " . $Job->{'cancelled_at'} . " by user " . $Job->{'cancelled_by_user_uuid'});
+ } else {
+ Log (undef, "Job state unexpectedly changed to " . $Job->{'state'});
}
+ $main::success = 0;
+ $main::please_freeze = 1;
}
}
}
my ($child_out, $child_in);
my $pid = open2($child_out, $child_in, 'arv-put', '--raw',
- '--retries', put_retry_count());
+ '--retries', retry_count());
my $joboutput;
for (@jobstep)
{
if ($s->can_read(120)) {
sysread($child_out, $joboutput, 64 * 1024 * 1024);
chomp($joboutput);
+ # TODO: Ensure exit status == 0.
} else {
Log (undef, "timed out reading from 'arv-put'");
}
}
+ # TODO: kill $pid instead of waiting, now that we've decided to
+ # ignore further output.
waitpid($pid, 0);
return $joboutput;
waitpid($log_pipe_pid, 0);
$log_pipe_pid = $log_pipe_in = $log_pipe_out = undef;
if ($?) {
- Log("log_writer_finish: arv-put returned error $?")
+ Log("log_writer_finish: arv-put exited ".exit_status_s($?))
}
return $arv_put_output;
freeze() if @jobstep_todo;
collate_output() if @jobstep_todo;
cleanup();
- save_meta() if log_writer_is_active();
+ save_meta();
die;
}
sub cleanup
{
- return if !$job_has_uuid;
+ return unless $Job;
if ($Job->{'state'} eq 'Cancelled') {
$Job->update_attributes('finished_at' => scalar gmtime);
} else {
{
my $justcheckpoint = shift; # false if this will be the last meta saved
return if $justcheckpoint; # checkpointing is not relevant post-Warehouse.pm
+ return unless log_writer_is_active();
my $loglocator = log_writer_finish();
Log (undef, "log manifest is $loglocator");
$Job->{'log'} = $loglocator;
- $Job->update_attributes('log', $loglocator) if $job_has_uuid;
+ $Job->update_attributes('log', $loglocator);
}
collate_output();
cleanup();
save_meta();
- exit 0;
+ exit 1;
}
}
# If not, return undef for both values.
my $locator = shift;
my ($streamname, $filename);
- if (my $image = $arv->{collections}->{get}->execute(uuid => $locator)) {
+ my $image = retry_op(sub {
+ $arv->{collections}->{get}->execute(uuid => $locator);
+ });
+ if ($image) {
foreach my $line (split(/\n/, $image->{manifest_text})) {
my @tokens = split(/\s+/, $line);
next if (!@tokens);
}
}
-sub put_retry_count {
- # Calculate a --retries argument for arv-put that will have it try
- # approximately as long as this Job has been running.
- my $stoptime = shift || time;
- my $starttime = $jobstep[0]->{starttime};
- my $timediff = defined($starttime) ? ($stoptime - $starttime) : 1;
- my $retries = 0;
- while ($timediff >= 2) {
- $retries++;
- $timediff /= 2;
+sub retry_count {
+ # Calculate the number of times an operation should be retried,
+ # assuming exponential backoff, and that we're willing to retry as
+ # long as tasks have been running. Enforce a minimum of 3 retries.
+ my ($starttime, $endtime, $timediff, $retries);
+ if (@jobstep) {
+ $starttime = $jobstep[0]->{starttime};
+ $endtime = $jobstep[-1]->{finishtime};
+ }
+ if (!defined($starttime)) {
+ $timediff = 0;
+ } elsif (!defined($endtime)) {
+ $timediff = time - $starttime;
+ } else {
+ $timediff = ($endtime - $starttime) - (time - $endtime);
+ }
+ if ($timediff > 0) {
+ $retries = int(log($timediff) / log(2));
+ } else {
+ $retries = 1; # Use the minimum.
}
return ($retries > 3) ? $retries : 3;
}
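
To make the retry budget concrete: with the code above, tasks that have been running for 300 seconds yield int(log(300)/log(2)) == 8 retries, and retry_op's backoff (below) sleeps at most 2**0 + 2**1 + ... + 2**7 == 255 seconds before the final attempt. A quick check:

    import math

    timediff = 300
    retries = max(int(math.log(timediff, 2)), 3)
    assert retries == 8
    assert sum(2 ** i for i in range(retries)) == 255
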
+sub retry_op {
+ # Given a function reference, call it with the remaining arguments. If
+ # it dies, retry it with exponential backoff until it succeeds, or until
+ # the current retry_count is exhausted.
+ my $operation = shift;
+ my $retries = retry_count();
+ foreach my $try_count (0..$retries) {
+ my $next_try = time + (2 ** $try_count);
+ my $result = eval { $operation->(@_); };
+ if (!$@) {
+ return $result;
+ } elsif ($try_count < $retries) {
+ my $sleep_time = $next_try - time;
+ sleep($sleep_time) if ($sleep_time > 0);
+ }
+ }
+ # Ensure the error message ends in a newline, so Perl doesn't add
+ # retry_op's line number to it.
+ chomp($@);
+ die($@ . "\n");
+}
+
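
The same retry wrapper, sketched in Python for comparison (it assumes a retry_count() like the one above; note the backoff clock starts before each attempt, so time spent inside the operation counts against the sleep):

    import time

    def retry_op(operation, *args):
        retries = retry_count()
        for try_count in range(retries + 1):
            next_try = time.time() + 2 ** try_count
            try:
                return operation(*args)
            except Exception:
                if try_count == retries:
                    raise
                delay = next_try - time.time()
                if delay > 0:
                    time.sleep(delay)
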
+sub exit_status_s {
+ # Given a $?, return a human-readable exit code string like "0" or
+ # "1" or "0 with signal 1" or "1 with signal 11".
+ my $exitcode = shift;
+ my $s = $exitcode >> 8;
+ if ($exitcode & 0x7f) {
+ $s .= " with signal " . ($exitcode & 0x7f);
+ }
+ if ($exitcode & 0x80) {
+ $s .= " with core dump";
+ }
+ return $s;
+}
+
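
The same wait-status decoding works anywhere the POSIX layout applies (high byte = exit code, low 7 bits = terminating signal, bit 0x80 = core dumped). In Python:

    def exit_status_s(exitcode):
        s = str(exitcode >> 8)
        if exitcode & 0x7f:
            s += " with signal %d" % (exitcode & 0x7f)
        if exitcode & 0x80:
            s += " with core dump"
        return s

    assert exit_status_s(0) == "0"
    assert exit_status_s(256) == "1"
    assert exit_status_s(11) == "0 with signal 11"
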
__DATA__
#!/usr/bin/perl
import types
import apiclient
-import apiclient.discovery
-import apiclient.errors
+from apiclient import discovery as apiclient_discovery
+from apiclient import errors as apiclient_errors
import config
import errors
import util
# Monkey patch discovery._cast() so objects and arrays get serialized
# with json.dumps() instead of str().
-_cast_orig = apiclient.discovery._cast
+_cast_orig = apiclient_discovery._cast
def _cast_objects_too(value, schema_type):
global _cast_orig
if (type(value) != type('') and
return json.dumps(value)
else:
return _cast_orig(value, schema_type)
-apiclient.discovery._cast = _cast_objects_too
+apiclient_discovery._cast = _cast_objects_too
# Convert apiclient's HttpErrors into our own API error subclass for better
# error reporting.
-# Reassigning apiclient.errors.HttpError is not sufficient because most of the
+# Reassigning apiclient_errors.HttpError is not sufficient because most of the
# apiclient submodules import the class into their own namespace.
def _new_http_error(cls, *args, **kwargs):
- return super(apiclient.errors.HttpError, cls).__new__(
+ return super(apiclient_errors.HttpError, cls).__new__(
errors.ApiError, *args, **kwargs)
-apiclient.errors.HttpError.__new__ = staticmethod(_new_http_error)
+apiclient_errors.HttpError.__new__ = staticmethod(_new_http_error)
def http_cache(data_type):
path = os.environ['HOME'] + '/.cache/arvados/' + data_type
* insecure: If True, ignore SSL certificate validation errors.
Additional keyword arguments will be passed directly to
- `apiclient.discovery.build` if a new Resource object is created.
+ `apiclient_discovery.build` if a new Resource object is created.
If the `discoveryServiceUrl` or `http` keyword arguments are
missing, this function will set default values for them, based on
the current Arvados configuration settings.
credentials = CredentialsFromToken(api_token=token)
kwargs['http'] = credentials.authorize(kwargs['http'])
- svc = apiclient.discovery.build('arvados', version, **kwargs)
+ svc = apiclient_discovery.build('arvados', version, **kwargs)
svc.api_token = token
kwargs['http'].cache = None
if cache:
import sys
import tarfile
import tempfile
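+# Pre-import _strptime so time.strptime() can be called safely from
+# worker threads later: importing it lazily from several threads at
+# once is not thread-safe in CPython.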
+import _strptime
from collections import namedtuple
from stat import *
return getattr(image_file, 'name', image_file) + '.stat'
def pull_image(image_name, image_tag):
- check_docker(popen_docker(['pull', '-t', image_tag, image_name]), "pull")
+ check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
+ "pull")
def save_image(image_hash, image_file):
# Save the specified Docker image to image_file, then try to save its
# TODO:
# --md5sum - display md5 of each file as read from disk
-import apiclient.errors
import argparse
import arvados
import base64
import socket
import sys
import tempfile
+from apiclient import errors as apiclient_errors
import arvados.commands._util as arv_cmd
def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
global api_client
- if api_client is None:
- api_client = arvados.api('v1')
- status = 0
args = parse_arguments(arguments)
+ status = 0
+ if api_client is None:
+ api_client = arvados.api('v1')
# Determine the name to use
if args.name:
try:
project_uuid = desired_project_uuid(api_client, args.project_uuid,
args.retries)
- except (apiclient.errors.Error, ValueError) as error:
+ except (apiclient_errors.Error, ValueError) as error:
print >>stderr, error
sys.exit(1)
else:
output = collection['uuid']
- except apiclient.errors.Error as error:
+ except apiclient_errors.Error as error:
print >>stderr, (
"arv-put: Error creating Collection on project: {}.".format(
error))
--- /dev/null
+#!/usr/bin/env python
+
+import sys
+import logging
+import argparse
+import arvados
+import json
+from arvados.events import subscribe
+import signal
+
+def main(arguments=None):
+ logger = logging.getLogger('arvados.arv-ws')
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-u', '--uuid', type=str, default="", help="Filter events on object_uuid")
+ parser.add_argument('-f', '--filters', type=str, default="", help="Arvados query filter to apply to log events (JSON encoded)")
+
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument('--poll-interval', default=15, type=int, help="If websockets are not available, fall back to polling at this interval (default: every 15 seconds)")
+ group.add_argument('--no-poll', action='store_false', dest='poll_interval', help="Do not poll if websockets are not available, just fail")
+
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument('-p', '--pipeline', type=str, default="", help="Supply a pipeline uuid; print log output from the pipeline and its jobs")
+ group.add_argument('-j', '--job', type=str, default="", help="Supply a job uuid; print log output from that job")
+
+ args = parser.parse_args(arguments)
+
+ global filters
+ global known_component_jobs
+ global ws
+
+ filters = []
+ known_component_jobs = set()
+ ws = None
+
+ def update_subscribed_components(components):
+ global known_component_jobs
+ global filters
+ pipeline_jobs = set()
+ for c in components:
+ if "job" in components[c]:
+ pipeline_jobs.add(components[c]["job"]["uuid"])
+ if known_component_jobs != pipeline_jobs:
+ ws.unsubscribe(filters)
+ filters = [['object_uuid', 'in', [args.pipeline] + list(pipeline_jobs)]]
+ ws.subscribe([['object_uuid', 'in', [args.pipeline] + list(pipeline_jobs)]])
+ known_component_jobs = pipeline_jobs
+
+ api = arvados.api('v1', cache=False)
+
+ if args.uuid:
+ filters += [ ['object_uuid', '=', args.uuid] ]
+
+ if args.filters:
+ filters += json.loads(args.filters)
+
+ if args.job:
+ filters += [ ['object_uuid', '=', args.job] ]
+
+ if args.pipeline:
+ filters += [ ['object_uuid', '=', args.pipeline] ]
+
+ def on_message(ev):
+ global filters
+ global ws
+
+ logger.debug(ev)
+ if 'event_type' in ev and (args.pipeline or args.job):
+ if ev['event_type'] in ('stderr', 'stdout'):
+ sys.stdout.write(ev["properties"]["text"])
+ elif ev["event_type"] in ("create", "update"):
+ if ev["object_kind"] == "arvados#pipelineInstance":
+ update_subscribed_components(ev["properties"]["new_attributes"]["components"])
+ elif 'status' in ev and ev['status'] == 200:
+ pass
+ else:
+ print json.dumps(ev)
+
+ try:
+ ws = subscribe(arvados.api('v1', cache=False), filters, on_message, poll_fallback=args.poll_interval)
+ if ws:
+ if args.pipeline:
+ c = api.pipeline_instances().get(uuid=args.pipeline).execute()
+ update_subscribed_components(c["components"])
+
+ while True:
+ signal.pause()
+ except KeyboardInterrupt:
+ pass
+ except Exception as e:
+ logger.error(e)
+ finally:
+ if ws:
+ ws.close()
import re
_settings = None
-default_config_file = os.environ['HOME'] + '/.config/arvados/settings.conf'
+if os.environ.get('HOME') is not None:
+ default_config_file = os.environ['HOME'] + '/.config/arvados/settings.conf'
+else:
+ default_config_file = ''
EMPTY_BLOCK_LOCATOR = 'd41d8cd98f00b204e9800998ecf8427e+0'
# errors.py - Arvados-specific exceptions.
-import apiclient.errors
import json
+from apiclient import errors as apiclient_errors
-class ApiError(apiclient.errors.HttpError):
+class ApiError(apiclient_errors.HttpError):
def _get_reason(self):
try:
return '; '.join(json.loads(self.content)['errors'])
from ws4py.client.threadedclient import WebSocketClient
-import thread
+import threading
import json
import os
import time
import re
import config
import logging
+import arvados
_logger = logging.getLogger('arvados.events')
ssl_options={'cert_reqs': ssl.CERT_NONE}
else:
ssl_options={'cert_reqs': ssl.CERT_REQUIRED}
-
- super(EventClient, self).__init__(url, ssl_options)
+ super(EventClient, self).__init__(url, ssl_options=ssl_options)
self.filters = filters
self.on_event = on_event
def opened(self):
- self.send(json.dumps({"method": "subscribe", "filters": self.filters}))
+ self.subscribe(self.filters)
def received_message(self, m):
self.on_event(json.loads(str(m)))
except:
pass
-def subscribe(api, filters, on_event):
+ def subscribe(self, filters, last_log_id=None):
+ m = {"method": "subscribe", "filters": filters}
+ if last_log_id is not None:
+ m["last_log_id"] = last_log_id
+ self.send(json.dumps(m))
+
+ def unsubscribe(self, filters):
+ self.send(json.dumps({"method": "unsubscribe", "filters": filters}))
+
+class PollClient(threading.Thread):
+ def __init__(self, api, filters, on_event, poll_time):
+ super(PollClient, self).__init__()
+ self.api = api
+ if filters:
+ self.filters = [filters]
+ else:
+ self.filters = [[]]
+ self.on_event = on_event
+ self.poll_time = poll_time
+ self.stop = threading.Event()
+
+ def run(self):
+ self.id = 0
+ for f in self.filters:
+ items = self.api.logs().list(limit=1, order="id desc", filters=f).execute()['items']
+ if items:
+ if items[0]['id'] > self.id:
+ self.id = items[0]['id']
+
+ self.on_event({'status': 200})
+
+ while not self.stop.isSet():
+ max_id = self.id
+ for f in self.filters:
+ items = self.api.logs().list(order="id asc", filters=f+[["id", ">", str(self.id)]]).execute()['items']
+ for i in items:
+ if i['id'] > max_id:
+ max_id = i['id']
+ self.on_event(i)
+ self.id = max_id
+ self.stop.wait(self.poll_time)
+
+ def close(self):
+ self.stop.set()
+ self.join()
+
+ def subscribe(self, filters):
+ self.on_event({'status': 200})
+ self.filters.append(filters)
+
+ def unsubscribe(self, filters):
+ del self.filters[self.filters.index(filters)]
+
+
+def subscribe(api, filters, on_event, poll_fallback=15):
+ '''
+ api: Must be a newly created API client object from arvados.api(cache=False), not shared with the caller, as it may be used by a background thread.
+ filters: Initial subscription filters.
+ on_event: The callback when a message is received
+ poll_fallback: If websockets are not available, fall back to polling every N seconds; if poll_fallback=False, return None instead.
+ '''
ws = None
- try:
- url = "{}?api_token={}".format(api._rootDesc['websocketUrl'], config.get('ARVADOS_API_TOKEN'))
- ws = EventClient(url, filters, on_event)
- ws.connect()
- return ws
- except Exception:
- if (ws):
- ws.close_connection()
- raise
+ if 'websocketUrl' in api._rootDesc:
+ try:
+ url = "{}?api_token={}".format(api._rootDesc['websocketUrl'], api.api_token)
+ ws = EventClient(url, filters, on_event)
+ ws.connect()
+ return ws
+ except Exception as e:
+ _logger.warn("Got exception %s trying to connect to websockets at %s" % (e, api._rootDesc['websocketUrl']))
+ if ws:
+ ws.close_connection()
+ if poll_fallback:
+ _logger.warn("Websockets not available, falling back to log table polling")
+ p = PollClient(api, filters, on_event, poll_fallback)
+ p.start()
+ return p
+ else:
+ _logger.error("Websockets not available")
+ return None
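
A minimal caller of the new subscribe() interface might look like the following (the filter and callback are illustrative):

    import arvados
    import arvados.events

    def on_message(ev):
        print(ev)

    ws = arvados.events.subscribe(
        arvados.api('v1', cache=False),   # fresh client, not shared
        [['object_uuid', 'is_a', 'arvados#job']],
        on_message,
        poll_fallback=15)  # poll every 15s if websockets are unavailable
    if ws is not None:
        ws.close()
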
#!/usr/bin/env python
-import sys
-import logging
-import argparse
-import arvados
-from arvados.events import subscribe
-
-logger = logging.getLogger('arvados.arv-ws')
-
-parser = argparse.ArgumentParser()
-parser.add_argument('-u', '--uuid', type=str, default="")
-args = parser.parse_args()
-
-filters = []
-if len(args.uuid)>0: filters = [ ['object_uuid', '=', args.uuid] ]
-
-api = arvados.api('v1', cache=False)
-
-def on_message(ev):
- print "\n", ev
-
-ws = None
-try:
- ws = subscribe(api, filters, lambda ev: on_message(ev))
- ws.run_forever()
-except Exception:
- logger.exception('')
- if (ws):
- ws.close_connection()
+from arvados.commands.ws import main
+main()
'''load a fixture yaml file'''
with open(os.path.join(SERVICES_SRC_DIR, 'api', "test", "fixtures",
fix + ".yml")) as f:
- return yaml.load(f.read())
+ yaml_file = f.read()
+ try:
+ trim_index = yaml_file.index("# Test Helper trims the rest of the file")
+ yaml_file = yaml_file[0:trim_index]
+ except ValueError:
+ pass
+ return yaml.load(yaml_file)
def authorize_with(token):
'''token is the symbolic name of the token from the api_client_authorizations fixture'''
#!/usr/bin/env python
-import apiclient.errors
import arvados
import httplib2
import json
import os
import run_test_server
import unittest
+from apiclient import errors as apiclient_errors
+from apiclient import http as apiclient_http
-from apiclient.http import RequestMockBuilder
from arvados_testutil import fake_httplib2_response
if not mimetypes.inited:
'arvados.humans.list': (None, json.dumps(
{'items_available': 0, 'items': []})),
}
- req_builder = RequestMockBuilder(mock_responses)
+ req_builder = apiclient_http.RequestMockBuilder(mock_responses)
cls.api = arvados.api('v1', cache=False,
host=os.environ['ARVADOS_API_HOST'],
token='discovery-doc-only-no-token-needed',
self.assertEqual(answer['items_available'], len(answer['items']))
def test_exceptions_include_errors(self):
- with self.assertRaises(apiclient.errors.HttpError) as err_ctx:
+ with self.assertRaises(apiclient_errors.HttpError) as err_ctx:
self.api.humans().get(uuid='xyz-xyz-abcdef').execute()
err_s = str(err_ctx.exception)
for msg in ["Bad UUID format", "Bad output format"]:
self.assertIn(msg, err_s)
def test_exceptions_without_errors_have_basic_info(self):
- with self.assertRaises(apiclient.errors.HttpError) as err_ctx:
+ with self.assertRaises(apiclient_errors.HttpError) as err_ctx:
self.api.humans().delete(uuid='xyz-xyz-abcdef').execute()
self.assertIn("500", str(err_ctx.exception))
import unittest
import arvados
import arvados.events
-import time
-
-class WebsocketTest(run_test_server.TestCaseWithServers):
- MAIN_SERVER = {'websockets': True}
+import threading
+class EventTestBase(object):
def on_event(self, ev):
if self.state == 1:
self.assertEqual(200, ev['status'])
self.state = 2
+ self.subscribed.set()
elif self.state == 2:
self.assertEqual(self.h[u'uuid'], ev[u'object_uuid'])
self.state = 3
+ self.done.set()
elif self.state == 3:
self.fail()
def runTest(self):
+ self.ws = None
self.state = 1
+ self.subscribed = threading.Event()
+ self.done = threading.Event()
run_test_server.authorize_with("admin")
api = arvados.api('v1', cache=False)
- arvados.events.subscribe(api, [['object_uuid', 'is_a', 'arvados#human']], lambda ev: self.on_event(ev))
- time.sleep(1)
+ self.ws = arvados.events.subscribe(arvados.api('v1', cache=False), [['object_uuid', 'is_a', 'arvados#human']], self.on_event, poll_fallback=2)
+ self.assertIsInstance(self.ws, self.WS_TYPE)
+ self.subscribed.wait(10)
self.h = api.humans().create(body={}).execute()
- time.sleep(1)
+ self.done.wait(10)
+ self.assertEqual(3, self.state)
+
+class WebsocketTest(run_test_server.TestCaseWithServers, EventTestBase):
+ MAIN_SERVER = {'websockets': True}
+ WS_TYPE = arvados.events.EventClient
+
+ def tearDown(self):
+ if self.ws:
+ self.ws.close()
+ super(WebsocketTest, self).tearDown()
+
+
+class PollClientTest(run_test_server.TestCaseWithServers, EventTestBase):
+ MAIN_SERVER = {}
+ WS_TYPE = arvados.events.PollClient
+
+ def tearDown(self):
+ if self.ws:
+ self.ws.close()
+ super(PollClientTest, self).tearDown()
gem 'themes_for_rails'
gem 'arvados', '>= 0.1.20140919104705'
-gem 'arvados-cli', '>= 0.1.20140919104705'
+gem 'arvados-cli', '>= 0.1.20141014201516'
# pg_power lets us use partial indexes in schema.rb in Rails 3
gem 'pg_power'
google-api-client (~> 0.6.3)
json (>= 1.7.7)
jwt (>= 0.1.5, < 1.0.0)
- arvados-cli (0.1.20140919104705)
+ arvados-cli (0.1.20141014201516)
activesupport (~> 3.2, >= 3.2.13)
andand (~> 1.3, >= 1.3.3)
- arvados (~> 0.1.0)
+ arvados (~> 0.1, >= 0.1.0)
curb (~> 0.8)
- google-api-client (~> 0.6.3)
+ google-api-client (~> 0.6, >= 0.6.3)
json (~> 1.7, >= 1.7.7)
jwt (>= 0.1.5, < 1.0.0)
oj (~> 2.0, >= 2.0.3)
acts_as_api
andand
arvados (>= 0.1.20140919104705)
- arvados-cli (>= 0.1.20140919104705)
+ arvados-cli (>= 0.1.20141014201516)
coffee-rails (~> 3.2.0)
database_cleaner
factory_girl_rails
def cancel
reload_object_before_update
- @object.update_attributes! cancelled_at: Time.now
+ @object.update_attributes! state: Job::Cancelled
show
end
skip_before_filter :find_object_by_uuid, :only => :ping
skip_before_filter :render_404_if_no_object, :only => :ping
- def create
- @object = Node.new
- @object.save!
- @object.start!(lambda { |h| ping_arvados_v1_node_url(h) })
- show
- end
-
def update
if resource_attrs[:job_uuid]
@object.job_readable = readable_job_uuids(resource_attrs[:job_uuid]).any?
unless (owner_uuid == current_user.uuid or
current_user.is_admin or
(current_user.groups_i_can(:manage) & [uuid, owner_uuid]).any?)
- if current_user.groups_i_can(:write).index(uuid)
+ if ((current_user.groups_i_can(:write) + [current_user.uuid]) &
+ [uuid, owner_uuid]).any?
return [owner_uuid, current_user.uuid]
else
return [owner_uuid]
log = Log.new(event_type: event_type).fill_object(self)
yield log
log.save!
- connection.execute "NOTIFY logs, '#{log.id}'"
log_start_state
end
before_create :ensure_unique_submit_id
after_commit :trigger_crunch_dispatch_if_cancelled, :on => :update
before_validation :set_priority
- before_validation :update_timestamps_when_state_changes
before_validation :update_state_from_old_state_attrs
validate :ensure_script_version_is_commit
validate :find_docker_image_locator
validate :validate_status
validate :validate_state_change
+ before_save :update_timestamps_when_state_changes
has_many :commit_ancestors, :foreign_key => :descendant, :primary_key => :script_version
has_many(:nodes, foreign_key: :job_uuid, primary_key: :uuid)
def update_timestamps_when_state_changes
return if not (state_changed? or new_record?)
+
case state
when Running
self.started_at ||= Time.now
serialize :properties, Hash
before_validation :set_default_event_at
attr_accessor :object, :object_kind
+ after_save :send_notify
api_accessible :user, extend: :common do |t|
t.add :id
# logs can have references to deleted objects
end
+ def send_notify
+ connection.execute "NOTIFY logs, '#{self.id}'"
+ end
+
end
if o[:ec2_instance_id]
if !self.info['ec2_instance_id']
self.info['ec2_instance_id'] = o[:ec2_instance_id]
- if (Rails.configuration.compute_node_ec2_tag_enable rescue true)
- tag_cmd = ("ec2-create-tags #{o[:ec2_instance_id]} " +
- "--tag 'Name=#{self.uuid}'")
- `#{tag_cmd}`
- end
elsif self.info['ec2_instance_id'] != o[:ec2_instance_id]
logger.debug "Multiple nodes have credentials for #{self.uuid}"
raise "#{self.uuid} is already running at #{self.info['ec2_instance_id']} so rejecting ping from #{o[:ec2_instance_id]}"
raise "No available node slots" if try_slot == MAX_SLOTS
end while true
self.hostname = self.class.hostname_for_slot(self.slot_number)
- if info['ec2_instance_id']
- if (Rails.configuration.compute_node_ec2_tag_enable rescue true)
- `ec2-create-tags #{self.info['ec2_instance_id']} --tag 'hostname=#{self.hostname}'`
- end
- end
end
# Record other basic stats
save!
end
- def start!(ping_url_method)
- ensure_permission_to_save
- ping_url = ping_url_method.call({ id: self.uuid, ping_secret: self.info['ping_secret'] })
- if (Rails.configuration.compute_node_ec2run_args and
- Rails.configuration.compute_node_ami)
- ec2_args = ["--user-data '#{ping_url}'",
- "-t c1.xlarge -n 1",
- Rails.configuration.compute_node_ec2run_args,
- Rails.configuration.compute_node_ami
- ]
- ec2run_cmd = ["ec2-run-instances",
- "--client-token", self.uuid,
- ec2_args].flatten.join(' ')
- ec2spot_cmd = ["ec2-request-spot-instances",
- "-p #{Rails.configuration.compute_node_spot_bid} --type one-time",
- ec2_args].flatten.join(' ')
- else
- ec2run_cmd = ''
- ec2spot_cmd = ''
- end
- self.info['ec2_run_command'] = ec2run_cmd
- self.info['ec2_spot_command'] = ec2spot_cmd
- self.info['ec2_start_command'] = ec2spot_cmd
- logger.info "#{self.uuid} ec2_start_command= #{ec2spot_cmd.inspect}"
- result = `#{ec2spot_cmd} 2>&1`
- self.info['ec2_start_result'] = result
- logger.info "#{self.uuid} ec2_start_result= #{result.inspect}"
- result.match(/INSTANCE\s*(i-[0-9a-f]+)/) do |m|
- instance_id = m[1]
- self.info['ec2_instance_id'] = instance_id
- if (Rails.configuration.compute_node_ec2_tag_enable rescue true)
- `ec2-create-tags #{instance_id} --tag 'Name=#{self.uuid}'`
- end
- end
- result.match(/SPOTINSTANCEREQUEST\s*(sir-[0-9a-f]+)/) do |m|
- sir_id = m[1]
- self.info['ec2_sir_id'] = sir_id
- if (Rails.configuration.compute_node_ec2_tag_enable rescue true)
- `ec2-create-tags #{sir_id} --tag 'Name=#{self.uuid}'`
- end
- end
- self.save!
- end
-
protected
def ensure_ping_secret
t.add :is_admin
t.add :is_invited
t.add :prefs
+ t.add :writable_by
end
ALL_PERMISSIONS = {read: true, write: true, manage: true}
# crunch-job must be able to stat() it.
crunch_refresh_trigger: /tmp/crunch_refresh_trigger
- # Maximum number of log events that may be generated by a single job.
- crunch_limit_log_events_per_job: 65536
-
# These two settings control how frequently log events are flushed to the
# database. Log lines are buffered until either crunch_log_bytes_per_event
# has been reached or crunch_log_seconds_between_events has elapsed since
# Path to /etc/dnsmasq.d, or false = do not update dnsmasq data.
dnsmasq_conf_dir: false
- # Set to AMI id (ami-123456) to auto-start nodes. See app/models/node.rb
- compute_node_ami: false
- compute_node_ec2run_args: -g arvados-compute
- compute_node_spot_bid: 0.11
-
compute_node_domain: false
compute_node_nameservers:
- 192.168.1.1
- compute_node_ec2_tag_enable: false
# The version below is suitable for AWS.
# To use it, copy it to your application.yml, uncomment, and change <%# to <%=
:mount => "/websocket",
:websocket_only => (ENV['ARVADOS_WEBSOCKETS'] == "ws-only")
}
+ Rails.logger.info "Websockets #{ENV['ARVADOS_WEBSOCKETS']}, running at /websocket"
+ else
+ Rails.logger.info "Websockets disabled"
end
end
# Start with log rows readable by user, sorted in ascending order
logs = Log.readable_by(ws.user).order("id asc")
+ cond_id = nil
+ cond_out = []
+ param_out = []
+
if ws.last_log_id
# Client is only interested in log rows that are newer than the
# last log row seen by the client.
- logs = logs.where("logs.id > ?", ws.last_log_id)
+ cond_id = "logs.id > ?"
+ param_out << ws.last_log_id
elsif id
# No last log id, so only look at the most recently changed row
- logs = logs.where("logs.id = ?", id.to_i)
+ cond_id = "logs.id = ?"
+ param_out << id.to_i
else
return
end
# Now process filters provided by client
- cond_out = []
- param_out = []
ws.filters.each do |filter|
ft = record_filters filter.filters, Log
- cond_out += ft[:cond_out]
- param_out += ft[:param_out]
+ if ft[:cond_out].any?
+ # Join the clauses within a single subscription filter with AND
+ # so it is consistent with regular queries
+ cond_out << "(#{ft[:cond_out].join ') AND ('})"
+ param_out += ft[:param_out]
+ end
end
# Add filters to query
if cond_out.any?
- logs = logs.where('(' + cond_out.join(') OR (') + ')', *param_out)
+ # Join subscriptions with OR
+ logs = logs.where(cond_id + " AND ((#{cond_out.join ') OR ('}))", *param_out)
+ else
+ logs = logs.where(cond_id, *param_out)
end
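
The net effect is a single WHERE clause whose per-subscription filters are ANDed internally and ORed against each other. For two subscriptions of two clauses each, the generated condition has this shape (placeholders elided), checked here in Python:

    cond_out = ["(c1) AND (c2)", "(c3) AND (c4)"]
    cond_id = "logs.id > ?"
    where = cond_id + " AND ((%s))" % ") OR (".join(cond_out)
    assert where == "logs.id > ? AND (((c1) AND (c2)) OR ((c3) AND (c4)))"
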
# Finally execute query and actually send the matching log rows
ws.last_log_id = id.to_i
end
rescue Exception => e
- puts "Error publishing event: #{$!}"
- puts "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ Rails.logger.warn "Error publishing event: #{$!}"
+ Rails.logger.warn "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
ws.send ({status: 500, message: 'error'}.to_json)
ws.close
end
# Add a filter. This gets the :filters field which is the same
# format as used for regular index queries.
ws.filters << Filter.new(p)
- ws.send ({status: 200, message: 'subscribe ok'}.to_json)
+ ws.send ({status: 200, message: 'subscribe ok', filter: p}.to_json)
# Send any pending events
push_events ws
rescue Oj::Error => e
ws.send ({status: 400, message: "malformed request"}.to_json)
rescue Exception => e
- puts "Error handling message: #{$!}"
- puts "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ Rails.logger.warn "Error handling message: #{$!}"
+ Rails.logger.warn "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
ws.send ({status: 500, message: 'error'}.to_json)
ws.close
end
stderr_flushed_at: Time.new(0),
bytes_logged: 0,
events_logged: 0,
- log_throttle_timestamp: Time.new(0),
+ log_throttle_is_open: true,
+ log_throttle_reset_time: Time.now + Rails.configuration.crunch_log_throttle_period,
log_throttle_bytes_so_far: 0,
log_throttle_lines_so_far: 0,
log_throttle_bytes_skipped: 0,
# the log line should go to output or not. Modifies "line" in place to
# replace it with an error if a logging limit is tripped.
def rate_limit running_job, line
- if running_job[:bytes_logged] > Rails.configuration.crunch_limit_log_bytes_per_job
- # Don't log anything if the hard cap has already been exceeded
- return false
- end
-
- now = Time.now
- throttle_period = Rails.configuration.crunch_log_throttle_period
-
- if running_job[:log_throttle_bytes_skipped] > 0
- # We've skipped some log in the current time period already, so continue to
- # skip the log
- running_job[:log_throttle_bytes_skipped] += line.size
- return false
- end
-
- # Count lines and bytes logged in this period, and total bytes logged for the job
- running_job[:log_throttle_lines_so_far] += 1
- running_job[:log_throttle_bytes_so_far] += line.size
- running_job[:bytes_logged] += line.size
-
- if running_job[:log_throttle_bytes_so_far] > Rails.configuration.crunch_log_throttle_bytes or
- running_job[:log_throttle_lines_so_far] > Rails.configuration.crunch_log_throttle_lines
- # We've exceeded the per-period throttle, so start skipping
- running_job[:log_throttle_bytes_skipped] += line.size
-
- # Replace log line with a message about skipping the log
- remaining_time = throttle_period - (now - running_job[:log_throttle_timestamp])
- if running_job[:log_throttle_bytes_so_far] > Rails.configuration.crunch_log_throttle_bytes
- line.replace "Exceeded rate #{Rails.configuration.crunch_log_throttle_bytes} bytes per #{throttle_period} seconds (crunch_log_throttle_bytes), logging will be silenced for the next #{remaining_time.round} seconds\n"
- else
- line.replace "Exceeded rate #{Rails.configuration.crunch_log_throttle_lines} lines per #{throttle_period} seconds (crunch_log_throttle_lines), logging will be silenced for the next #{remaining_time.round} seconds\n"
+ message = false
+ linesize = line.size
+ if running_job[:log_throttle_is_open]
+ running_job[:log_throttle_lines_so_far] += 1
+ running_job[:log_throttle_bytes_so_far] += linesize
+ running_job[:bytes_logged] += linesize
+
+ if (running_job[:bytes_logged] >
+ Rails.configuration.crunch_limit_log_bytes_per_job)
+ message = "Exceeded log limit #{Rails.configuration.crunch_limit_log_bytes_per_job} bytes (crunch_limit_log_bytes_per_job). Log will be truncated."
+ running_job[:log_throttle_reset_time] = Time.now + 100.years
+ running_job[:log_throttle_is_open] = false
+
+ elsif (running_job[:log_throttle_bytes_so_far] >
+ Rails.configuration.crunch_log_throttle_bytes)
+ remaining_time = running_job[:log_throttle_reset_time] - Time.now
+ message = "Exceeded rate #{Rails.configuration.crunch_log_throttle_bytes} bytes per #{Rails.configuration.crunch_log_throttle_period} seconds (crunch_log_throttle_bytes). Logging will be silenced for the next #{remaining_time.round} seconds.\n"
+ running_job[:log_throttle_is_open] = false
+
+ elsif (running_job[:log_throttle_lines_so_far] >
+ Rails.configuration.crunch_log_throttle_lines)
+ remaining_time = running_job[:log_throttle_reset_time] - Time.now
+ message = "Exceeded rate #{Rails.configuration.crunch_log_throttle_lines} lines per #{Rails.configuration.crunch_log_throttle_period} seconds (crunch_log_throttle_lines), logging will be silenced for the next #{remaining_time.round} seconds.\n"
+ running_job[:log_throttle_is_open] = false
end
end
- if running_job[:bytes_logged] > Rails.configuration.crunch_limit_log_bytes_per_job
- # Replace log line with a message about truncating the log
- line.replace "Exceeded log limit #{Rails.configuration.crunch_limit_log_bytes_per_job} bytes (crunch_limit_log_bytes_per_job). Log will be truncated."
+ if not running_job[:log_throttle_is_open]
+ # Don't log anything if any limit has been exceeded. Just count lossage.
+ running_job[:log_throttle_bytes_skipped] += linesize
end
- true
+ if message
+ # Yes, write to logs, but use our "rate exceeded" message
+ # instead of the log message that exceeded the limit.
+ line.replace message
+ true
+ else
+ running_job[:log_throttle_is_open]
+ end
end
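
Abstracted away from the dispatcher, the throttle above is a fixed-window gate: counters accumulate until the window resets, and once any limit trips, the gate stays shut and merely counts skipped bytes. A compact Python sketch of the same pattern (the real rate_limit() additionally logs the limit-tripping line itself, replaced by a notice):

    import time

    class LogThrottle(object):
        def __init__(self, period, max_bytes, max_lines):
            self.period = period
            self.max_bytes = max_bytes
            self.max_lines = max_lines
            self.reset()

        def reset(self):
            self.window_end = time.time() + self.period
            self.bytes = self.lines = self.skipped = 0
            self.is_open = True

        def admit(self, line):
            if time.time() > self.window_end:
                self.reset()
            if self.is_open:
                self.lines += 1
                self.bytes += len(line)
                if (self.bytes > self.max_bytes or
                        self.lines > self.max_lines):
                    self.is_open = False
            if not self.is_open:
                self.skipped += len(line)
            return self.is_open
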
def read_pipes
job = j[:job]
now = Time.now
- if (now - j[:log_throttle_timestamp]) > Rails.configuration.crunch_log_throttle_period
- # It has been more than throttle_period seconds since the last checkpoint so reset the
- # throttle
+ if now > j[:log_throttle_reset_time]
+ # It has been more than throttle_period seconds since the last
+ # checkpoint so reset the throttle
if j[:log_throttle_bytes_skipped] > 0
- j[:stderr_buf_to_flush] << "Skipped #{j[:log_throttle_bytes_skipped]} bytes of log"
+ message = "#{job_uuid} ! Skipped #{j[:log_throttle_bytes_skipped]} bytes of log"
+ $stderr.puts message
+ j[:stderr_buf_to_flush] << "#{Time.now.ctime.to_s} #{message}\n"
end
- j[:log_throttle_timestamp] = now
+ j[:log_throttle_reset_time] = now + Rails.configuration.crunch_log_throttle_period
j[:log_throttle_bytes_so_far] = 0
j[:log_throttle_lines_so_far] = 0
j[:log_throttle_bytes_skipped] = 0
+ j[:log_throttle_is_open] = true
end
j[:buf].each do |stream, streambuf|
# Read some data from the child stream
- buf = false
+ buf = ''
begin
- buf = j[stream].read_nonblock(2**16)
+ # It's important to use a big enough buffer here. When we're
+ # being flooded with logs, we must read and discard many
+ # bytes at once. Otherwise, we can easily peg a CPU with
+ # time-checking and other loop overhead. (Quick tests show a
+ # 1MiB buffer working 2.5x as fast as a 64 KiB buffer.)
+ #
+ # So don't reduce this buffer size!
+ buf = j[stream].read_nonblock(2**20)
rescue Errno::EAGAIN, EOFError
end
- if buf
- # Add to the stream buffer
- streambuf << buf
-
- # Check for at least one complete line
- if streambuf.index "\n"
- lines = streambuf.lines("\n").to_a
-
- # check if the last line is partial or not
- streambuf.replace(if streambuf[-1] == "\n"
- '' # ends on a newline
- else
- lines.pop # Put the partial line back into the buffer
- end)
-
- # Now spool the lines to the log output buffer
- lines.each do |line|
- # rate_limit returns true or false as to whether to actually log
- # the line or not. It also modifies "line" in place to replace
- # it with an error if a logging limit is tripped.
- if rate_limit j, line
- $stderr.print "#{job_uuid} ! " unless line.index(job_uuid)
- $stderr.puts line
- pub_msg = "#{Time.now.ctime.to_s} #{line.strip} \n"
- j[:stderr_buf_to_flush] << pub_msg
- end
- # Send log output to the logs table
- write_log j
+ # Short circuit the counting code if we're just going to throw
+ # away the data anyway.
+ if not j[:log_throttle_is_open]
+ j[:log_throttle_bytes_skipped] += streambuf.size + buf.size
+ streambuf.replace ''
+ next
+ elsif buf == ''
+ next
+ end
+
+ # Append to incomplete line from previous read, if any
+ streambuf << buf
+
+ bufend = ''
+ streambuf.each_line do |line|
+ if not line.end_with? $/
+ if line.size > Rails.configuration.crunch_log_throttle_bytes
+ # Without a limit here, we'll use 2x an arbitrary amount
+ # of memory, and waste a lot of time copying strings
+ # around, all without providing any feedback to anyone
+ # about what's going on _or_ hitting any of our throttle
+ # limits.
+ #
+ # Here we leave "line" alone, knowing it will never be
+ # sent anywhere: rate_limit() will reach
+ # crunch_log_throttle_bytes immediately. However, we'll
+ # leave [...] in bufend: if the trailing end of the long
+ # line does end up getting sent anywhere, it will have
+ # some indication that it is incomplete.
+ bufend = "[...]"
+ else
+ # If line length is sane, we'll wait for the rest of the
+ # line to appear in the next read_pipes() call.
+ bufend = line
+ break
end
end
+ # rate_limit returns true or false as to whether to actually log
+ # the line or not. It also modifies "line" in place to replace
+ # it with an error if a logging limit is tripped.
+ if rate_limit j, line
+ $stderr.print "#{job_uuid} ! " unless line.index(job_uuid)
+ $stderr.puts line
+ pub_msg = "#{Time.now.ctime.to_s} #{line.strip}\n"
+ j[:stderr_buf_to_flush] << pub_msg
+ end
end
+
+ # Leave the trailing incomplete line (if any) in streambuf for
+ # next time.
+ streambuf.replace bufend
end
+ # Flush buffered logs to the logs table, if appropriate. We have
+ # to do this even if we didn't collect any new logs this time:
+ # otherwise, buffered data older than seconds_between_events
+ # won't get flushed until new data arrives.
+ write_log j
end
end
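
The buffering discipline above boils down to: append each nonblocking read to a per-stream buffer, hand complete lines onward, and carry the trailing fragment into the next read. In Python terms (the helper name is illustrative):

    def split_complete_lines(streambuf, chunk):
        streambuf += chunk
        lines = streambuf.splitlines(True)   # keep line endings
        if lines and not lines[-1].endswith('\n'):
            remainder = lines.pop()          # incomplete tail: keep it
        else:
            remainder = ''
        return [l.rstrip('\n') for l in lines], remainder

    # usage: complete, buf = split_complete_lines(buf, chunk)
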
$stderr.puts "dispatch: child #{pid_done} exit"
$stderr.puts "dispatch: job #{job_done.uuid} end"
- # Ensure every last drop of stdout and stderr is consumed
+ # Ensure every last drop of stdout and stderr is consumed.
read_pipes
- j_done[:stderr_flushed_at] = Time.new(0) # reset flush timestamp to make sure log gets written
- write_log j_done # write any remaining logs
+ # Reset flush timestamp to make sure log gets written.
+ j_done[:stderr_flushed_at] = Time.new(0)
+ # Write any remaining logs.
+ write_log j_done
j_done[:buf].each do |stream, streambuf|
if streambuf != ''
# send message to log table. we want these records to be transient
def write_log running_job
return if running_job[:stderr_buf_to_flush] == ''
- return if running_job[:events_logged] > Rails.configuration.crunch_limit_log_events_per_job
# Send out to log event if buffer size exceeds the bytes per event or if
# it has been at least crunch_log_seconds_between_events seconds since
if running_job[:stderr_buf_to_flush].size > Rails.configuration.crunch_log_bytes_per_event or
(Time.now - running_job[:stderr_flushed_at]) >= Rails.configuration.crunch_log_seconds_between_events
begin
- # Just reached crunch_limit_log_events_per_job so replace log with notification.
- if running_job[:events_logged] == Rails.configuration.crunch_limit_log_events_per_job
- running_job[:stderr_buf_to_flush] =
- "Exceeded live log limit #{Rails.configuration.crunch_limit_log_events_per_job} events (crunch_limit_log_events_per_job). Live log will be truncated."
- end
log = Log.new(object_uuid: running_job[:job].uuid,
event_type: 'stderr',
owner_uuid: running_job[:job].owner_uuid,
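
rate_limit above decides whether each line gets logged and replaces the line with a notice the moment a throttle limit trips. A rough Go sketch of that contract, reduced to a single byte budget (hypothetical names; the real Ruby implementation also tracks event counts and partial-line limits):

package main

import "fmt"

// logThrottle is an editor's sketch of the rate_limit contract: it
// returns whether a line may be logged, and rewrites the line with a
// notice when the byte budget is first exceeded.
type logThrottle struct {
	bytesLeft int64
	tripped   bool
}

func (t *logThrottle) rateLimit(line *string) bool {
	if t.tripped {
		return false // budget exhausted; drop silently from now on
	}
	t.bytesLeft -= int64(len(*line))
	if t.bytesLeft < 0 {
		t.tripped = true
		*line = "Exceeded log limit; logging will be silenced."
	}
	return true // log the line (possibly the replacement notice)
}

func main() {
	t := &logThrottle{bytesLeft: 10}
	for _, s := range []string{"short", "this one is too long", "dropped"} {
		line := s
		if t.rateLimit(&line) {
			fmt.Println(line)
		}
	}
}
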
api_token: projectviewertoken1234567890abcdefghijklmnopqrstuv
expires_at: 2038-01-01 00:00:00
+subproject_admin:
+ api_client: untrusted
+ user: subproject_admin
+ api_token: subprojectadmintoken1234567890abcdefghijklmnopqrst
+ expires_at: 2038-01-01 00:00:00
+
admin_vm:
api_client: untrusted
user: admin
user: user_foo_in_sharing_group
api_token: 2p1pou8p4ls208mcbedeewlotghppenobcyrmyhq8pyf51xd8u
expires_at: 2038-01-01 00:00:00
+
+user1_with_load:
+ api_client: untrusted
+ user: user1_with_load
+ api_token: 1234k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi
+ expires_at: 2038-01-01 00:00:00
owner_uuid: zzzzz-tpzed-81hsbo6mk8nl05c
created_at: 2014-02-03T17:22:54Z
name: collection_owned_by_foo
+
+collection_to_remove_from_subproject:
+ # The Workbench tests remove this from subproject.
+ uuid: zzzzz-4zz18-subprojgonecoll
+ portable_data_hash: 2386ca6e3fffd4be5e197a72c6c80fb2+51
+ manifest_text: ". 8258b505536a9ab47baa2f4281cb932a+9 0:9:missingno\n"
+ owner_uuid: zzzzz-j7d0g-axqo7eu9pwvna1x
+ created_at: 2014-10-15T10:45:00
+ name: Collection to remove from subproject
+
+collection_with_files_in_subdir:
+ uuid: zzzzz-4zz18-filesinsubdir00
+ name: collection_files_in_subdir
+ portable_data_hash: 85877ca2d7e05498dd3d109baf2df106+95
+ owner_uuid: zzzzz-tpzed-user1withloadab
+ created_at: 2014-02-03T17:22:54Z
+ modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+ modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+ modified_at: 2014-02-03T17:22:54Z
+ updated_at: 2014-02-03T17:22:54Z
+ manifest_text: ". 85877ca2d7e05498dd3d109baf2df106+95+A3a4e26a366ee7e4ed3e476ccf05354761be2e4ae@545a9920 0:95:file_in_subdir1\n./subdir2/subdir3 2bbc341c702df4d8f42ec31f16c10120+64+A315d7e7bad2ce937e711fc454fae2d1194d14d64@545a9920 0:32:file1_in_subdir3.txt 32:32:file2_in_subdir3.txt\n./subdir2/subdir3/subdir4 2bbc341c702df4d8f42ec31f16c10120+64+A315d7e7bad2ce937e711fc454fae2d1194d14d64@545a9920 0:32:file1_in_subdir4.txt 32:32:file2_in_subdir4.txt"
+
+# Test Helper trims the rest of the file
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
+
+# collections in project_with_10_collections
+<% for i in 1..10 do %>
+collection_<%=i%>_of_10:
+ name: Collection_<%= i %>
+ portable_data_hash: ea10d51bcf88862dbcc36eb292017dfd+45
+ manifest_text: ". 73feffa4b7f6bb68e44cf984c85f6e88+3 0:3:baz\n"
+ uuid: zzzzz-4zz18-10gneyn6brkx<%= i.to_s.rjust(3, '0') %>
+ owner_uuid: zzzzz-j7d0g-0010collections
+ created_at: <%= i.minute.ago.to_s(:db) %>
+<% end %>
+
+# collections in project_with_201_collections
+<% for i in 1..201 do %>
+collection_<%=i%>_of_201:
+ name: Collection_<%= i %>
+ portable_data_hash: ea10d51bcf88862dbcc36eb292017dfd+45
+ manifest_text: ". 73feffa4b7f6bb68e44cf984c85f6e88+3 0:3:baz\n"
+ uuid: zzzzz-4zz18-201gneyn6brd<%= i.to_s.rjust(3, '0') %>
+ owner_uuid: zzzzz-j7d0g-0201collections
+ created_at: <%= i.minute.ago.to_s(:db) %>
+<% end %>
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
name: Active user has can_manage
# Group for testing granting permission between users who share a group.
-#
group_for_sharing_tests:
uuid: zzzzz-j7d0g-t4ucgncwteul7zt
owner_uuid: zzzzz-tpzed-000000000000000
name: Group for sharing tests
description: Users who can share objects with each other
group_class: role
+
+project_with_10_collections:
+ uuid: zzzzz-j7d0g-0010collections
+ owner_uuid: zzzzz-tpzed-user1withloadab
+ created_at: 2014-04-21 15:37:48 -0400
+ modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+ modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+ modified_at: 2014-04-21 15:37:48 -0400
+ updated_at: 2014-04-21 15:37:48 -0400
+ name: project with 10 collections
+ description: This will result in one page in the display
+ group_class: project
+
+project_with_201_collections:
+ uuid: zzzzz-j7d0g-0201collections
+ owner_uuid: zzzzz-tpzed-user1withloadab
+ created_at: 2014-04-21 15:37:48 -0400
+ modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+ modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+ modified_at: 2014-04-21 15:37:48 -0400
+ updated_at: 2014-04-21 15:37:48 -0400
+ name: project with 201 collections
+ description: This will result in two pages in the display
+ group_class: project
+
+project_with_10_pipelines:
+ uuid: zzzzz-j7d0g-000010pipelines
+ owner_uuid: zzzzz-tpzed-user1withloadab
+ created_at: 2014-04-21 15:37:48 -0400
+ modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+ modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+ modified_at: 2014-04-21 15:37:48 -0400
+ updated_at: 2014-04-21 15:37:48 -0400
+ name: project with 10 pipelines
+ description: project with 10 pipelines
+ group_class: project
+
+project_with_2_pipelines_and_200_jobs:
+ uuid: zzzzz-j7d0g-nnjobspipelines
+ owner_uuid: zzzzz-tpzed-user1withloadab
+ created_at: 2014-04-21 15:37:48 -0400
+ modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+ modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+ modified_at: 2014-04-21 15:37:48 -0400
+ updated_at: 2014-04-21 15:37:48 -0400
+ name: project with 2 pipelines and 200 jobs
+ description: This will result in two pages in the display
+ group_class: project
+
+project_with_25_pipelines:
+ uuid: zzzzz-j7d0g-000025pipelines
+ owner_uuid: zzzzz-tpzed-user1withloadab
+ created_at: 2014-04-21 15:37:48 -0400
+ modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+ modified_by_user_uuid: zzzzz-tpzed-user1withloadab
+ modified_at: 2014-04-21 15:37:48 -0400
+ updated_at: 2014-04-21 15:37:48 -0400
+ name: project with 25 pipelines
+ description: project with 25 pipelines
+ group_class: project
log: 0b9a7787660e1fce4a93f33e01376ba6+81
script_version: 7def43a4d3f20789dda4700f703b5514cc3ed250
state: Complete
+
+cancelled:
+ uuid: zzzzz-8i9sb-4cf0abc123e809j
+ owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+ cancelled_at: <%= 1.minute.ago.to_s(:db) %>
+ cancelled_by_user_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+ cancelled_by_client_uuid: zzzzz-ozdt8-obw7foaks3qjyej
+ created_at: <%= 4.minute.ago.to_s(:db) %>
+ started_at: <%= 3.minute.ago.to_s(:db) %>
+ finished_at: ~
+ script_version: 1de84a854e2b440dc53bf42f8548afa4c17da332
+ running: false
+ success: ~
+ output: ~
+ priority: 0
+ log: ~
+ is_locked_by_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+ tasks_summary:
+ failed: 0
+ todo: 3
+ running: 1
+ done: 1
+ runtime_constraints: {}
+ state: Cancelled
+
+job_in_subproject:
+ uuid: zzzzz-8i9sb-subprojectjob01
+ created_at: 2014-10-15 12:00:00
+ owner_uuid: zzzzz-j7d0g-axqo7eu9pwvna1x
+ log: ~
+ repository: foo
+ script: hash
+ script_version: 4fe459abe02d9b365932b8f5dc419439ab4e2577
+ state: Complete
+
+# Test Helper trims the rest of the file
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
+
+# jobs in project_with_2_pipelines_and_200_jobs
+<% for i in 1..200 do %>
+job_<%=i%>_of_200:
+ uuid: zzzzz-8i9sb-0vsrcqi7whch<%= i.to_s.rjust(3, '0') %>
+ created_at: <%= i.minute.ago.to_s(:db) %>
+ owner_uuid: zzzzz-j7d0g-nnjobspipelines
+ script_version: 7def43a4d3f20789dda4700f703b5514cc3ed250
+ state: Complete
+<% end %>
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
head_uuid: zzzzz-j7d0g-v955i6s2oi1cbso
properties: {}
+subproject_admin_can_manage_subproject:
+ uuid: zzzzz-o0j2j-subprojadminlnk
+ owner_uuid: zzzzz-tpzed-000000000000000
+ created_at: 2014-10-15 10:00:00 -0000
+ modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+ modified_by_user_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+ modified_at: 2014-10-15 10:00:00 -0000
+ updated_at: 2014-10-15 10:00:00 -0000
+ tail_uuid: zzzzz-tpzed-subprojectadmin
+ link_class: permission
+ name: can_manage
+ head_uuid: zzzzz-j7d0g-axqo7eu9pwvna1x
+ properties: {}
+
foo_collection_tag:
uuid: zzzzz-o0j2j-eedahfaho8aphiv
owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
name: can_read
head_uuid: zzzzz-tpzed-n3oaj4sm5fcnwib
+user1-with-load_member_of_all_users_group:
+ uuid: zzzzz-o0j2j-user1-with-load
+ owner_uuid: zzzzz-tpzed-000000000000000
+ created_at: 2014-01-24 20:42:26 -0800
+ modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+ modified_by_user_uuid: zzzzz-tpzed-d9tiejq69daie8f
+ modified_at: 2014-01-24 20:42:26 -0800
+ updated_at: 2014-01-24 20:42:26 -0800
+ tail_uuid: zzzzz-tpzed-user1withloadab
+ link_class: permission
+ name: can_read
+ head_uuid: zzzzz-j7d0g-fffffffffffffff
+ properties: {}
owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
created_at: <%= 1.minute.ago.to_s(:db) %>
+new_pipeline_in_subproject:
+ state: New
+ uuid: zzzzz-d1hrv-subprojpipeline
+ owner_uuid: zzzzz-j7d0g-axqo7eu9pwvna1x
+ created_at: <%= 1.minute.ago.to_s(:db) %>
+
has_component_with_no_script_parameters:
state: Ready
uuid: zzzzz-d1hrv-1xfj6xkicf2muk2
required: true
dataclass: Collection
title: foo instance input
+
+# Test Helper trims the rest of the file
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
+
+# pipelines in project_with_10_pipelines
+<% for i in 0..9 do %>
+pipeline_<%=i%>_of_10:
+ name: pipeline_<%= i %>
+ state: Failed
+ uuid: zzzzz-d1hrv-10pipelines0<%= i.to_s.rjust(3, '0') %>
+ owner_uuid: zzzzz-j7d0g-000010pipelines
+ created_at: <%= (2*i).hour.ago.to_s(:db) %>
+ started_at: <%= (2*i).hour.ago.to_s(:db) %>
+ finished_at: <%= i.minute.ago.to_s(:db) %>
+ components:
+ foo:
+ script: foo
+ script_version: master
+ script_parameters:
+ input:
+ required: true
+ dataclass: Collection
+ title: foo instance input
+<% end %>
+
+# pipelines in project_with_2_pipelines_and_200_jobs
+<% for i in 0..1 do %>
+pipeline_<%=i%>_of_2_pipelines_and_200_jobs:
+ name: pipeline_<%= i %>
+ state: New
+ uuid: zzzzz-d1hrv-abcgneyn6brx<%= i.to_s.rjust(3, '0') %>
+ owner_uuid: zzzzz-j7d0g-nnjobspipelines
+ created_at: <%= i.minute.ago.to_s(:db) %>
+ components:
+ foo:
+ script: foo
+ script_version: master
+ script_parameters:
+ input:
+ required: true
+ dataclass: Collection
+ title: foo instance input
+<% end %>
+
+# pipelines in project_with_25_pipelines
+<% for i in 0..24 do %>
+pipeline_<%=i%>_of_25:
+ name: pipeline_<%=i%>
+ state: Failed
+ uuid: zzzzz-d1hrv-25pipelines0<%= i.to_s.rjust(3, '0') %>
+ owner_uuid: zzzzz-j7d0g-000025pipelines
+ created_at: <%= i.hour.ago.to_s(:db) %>
+ started_at: <%= i.hour.ago.to_s(:db) %>
+ finished_at: <%= i.minute.ago.to_s(:db) %>
+ components:
+ foo:
+ script: foo
+ script_version: master
+ script_parameters:
+ input:
+ required: true
+ dataclass: Collection
+ title: foo instance input
+<% end %>
+
+# Do not add your fixtures below this line as the rest of this file will be trimmed by test_helper
organization: example.com
role: Computational biologist
+subproject_admin:
+ owner_uuid: zzzzz-tpzed-000000000000000
+ uuid: zzzzz-tpzed-subprojectadmin
+ email: subproject-admin@arvados.local
+ first_name: Subproject
+ last_name: Admin
+ identity_url: https://subproject-admin.openid.local
+ is_active: true
+ is_admin: false
+ prefs:
+ profile:
+ organization: example.com
+ role: Computational biologist
+
spectator:
owner_uuid: zzzzz-tpzed-000000000000000
uuid: zzzzz-tpzed-l1s2piq4t4mps8r
identity_url: https://user_bar_in_sharing_group.openid.local
is_active: true
is_admin: false
+
+user1_with_load:
+ owner_uuid: zzzzz-tpzed-000000000000000
+ uuid: zzzzz-tpzed-user1withloadab
+ email: user1_with_load@arvados.local
+ first_name: user1_with_load
+ last_name: User
+ identity_url: https://user1_with_load.openid.local
+ is_active: true
+ is_admin: false
+ prefs:
+ profile:
+ organization: example.com
+ role: IT
'trigger file should be created when job is cancelled')
end
- test "cancelling a cancelled jobs stays cancelled" do
+ [
+ [:put, :update, {job:{cancelled_at: Time.now}}, :success],
+ [:put, :update, {job:{cancelled_at: nil}}, :unprocessable_entity],
+ [:put, :update, {job:{state: 'Cancelled'}}, :success],
+ [:put, :update, {job:{state: 'Queued'}}, :unprocessable_entity],
+ [:put, :update, {job:{state: 'Running'}}, :unprocessable_entity],
+ [:put, :update, {job:{state: 'Failed'}}, :unprocessable_entity],
+ [:put, :update, {job:{state: 'Complete'}}, :unprocessable_entity],
+ [:post, :cancel, {}, :success],
+ ].each do |http_method, action, params, expected_response|
+ test "cancelled job stays cancelled after #{[http_method, action, params].inspect}" do
+ # We need to verify that "cancel" creates a trigger file, so first
+ # let's make sure there is no stale trigger file.
+ begin
+ File.unlink(Rails.configuration.crunch_refresh_trigger)
+ rescue Errno::ENOENT
+ end
+
+ authorize_with :active
+ self.send http_method, action, { id: jobs(:cancelled).uuid }.merge(params)
+ assert_response expected_response
+ if expected_response == :success
+ job = json_response
+        assert_not_nil job['cancelled_at'], "job cancelled again using #{[http_method, action, params].inspect} did not have cancelled_at value"
+        assert_equal 'Cancelled', job['state'], "cancelled job state unexpectedly changed by #{[http_method, action, params].inspect}"
+ end
+ # Verify database record still says Cancelled
+ assert_equal 'Cancelled', Job.find(jobs(:cancelled).id).state, 'job was un-cancelled'
+ end
+ end
+
+ test "cancelled job updated to any other state change results in error" do
# We need to verify that "cancel" creates a trigger file, so first
# let's make sure there is no stale trigger file.
begin
test "create node" do
authorize_with :admin
- post :create
+ post :create, {node: {}}
assert_response :success
assert_not_nil json_response['uuid']
    assert json_response['info'].is_a?(Hash), 'info should be a hash'
assert_equal false, found_email, 'Expected no email after updating profile'
end
+ test "user API response includes writable_by" do
+ authorize_with :active
+ get :current
+ assert_response :success
+ assert_includes(json_response["writable_by"], users(:active).uuid,
+ "user's writable_by should include self")
+ assert_includes(json_response["writable_by"], users(:active).owner_uuid,
+ "user's writable_by should include its owner_uuid")
+ end
+
NON_ADMIN_USER_DATA = ["uuid", "kind", "is_active", "email", "first_name",
"last_name"].sort
assert_equal human.uuid, human_ev_uuid
end
+
+ test "connect, subscribe, compound filter" do
+ state = 1
+ t1 = nil
+
+ authorize_with :admin
+
+ ws_helper :admin do |ws|
+ ws.on :open do |event|
+      ws.send({method: 'subscribe', filters: [['object_uuid', 'is_a', 'arvados#trait'], ['event_type', '=', 'update']]}.to_json)
+ end
+
+ ws.on :message do |event|
+ d = Oj.load event.data
+ case state
+ when 1
+ assert_equal 200, d["status"]
+ t1 = Trait.create("name" => "foo")
+ t1.name = "bar"
+ t1.save!
+ state = 2
+ when 2
+ assert_equal 'update', d['event_type']
+ state = 3
+ ws.close
+ when 3
+ assert false, "Should not get any more events"
+ end
+ end
+
+ end
+
+ assert_equal 3, state
+ assert_not_nil t1
+ end
+
test "connect, subscribe, ask events starting at seq num" do
state = 1
human = nil
def check_counter action
@counter += 1
if @counter == 2
- # assert_equal 1, 2, "Multiple actions in functional test"
+ assert_equal 1, 2, "Multiple actions in functional test"
end
end
import (
"bufio"
+ "bytes"
+ "errors"
"flag"
"fmt"
"io"
"os"
"os/exec"
"os/signal"
+ "strconv"
"strings"
"syscall"
"time"
)
-func ReadLineByLine(inp io.ReadCloser, out chan string, finish chan bool) {
- s := bufio.NewScanner(inp)
+/*
+#include <unistd.h>
+#include <sys/types.h>
+#include <pwd.h>
+#include <stdlib.h>
+*/
+import "C"
+
+// The above block of magic allows us to look up user_hz via _SC_CLK_TCK.
+
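
For reference, user_hz is the kernel tick rate that cpuacct.stat counters are expressed in; the cgo block exists solely to read it via sysconf(). A standalone illustration of the lookup (hypothetical file, not part of the patch):

package main

// #include <unistd.h>
import "C"

import "fmt"

func main() {
	// Convert a cpuacct.stat tick count into seconds using _SC_CLK_TCK.
	userHz := float64(C.sysconf(C._SC_CLK_TCK))
	ticks := int64(250) // e.g. a "user 250" field from cpuacct.stat
	fmt.Printf("%d ticks = %.4f seconds\n", ticks, float64(ticks)/userHz)
}
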
+type Cgroup struct {
+ root string
+ parent string
+ cid string
+}
+
+func CopyPipeToChan(in io.Reader, out chan string, done chan<- bool) {
+ s := bufio.NewScanner(in)
for s.Scan() {
out <- s.Text()
}
- finish <- true
+ done <- true
}
-func OutputChannel(stdout chan string, stderr chan string) {
- for {
- select {
- case s, ok := <-stdout:
- if ok {
- fmt.Fprintln(os.Stdout, s)
- } else {
- return
- }
- case s, ok := <-stderr:
- if ok {
- fmt.Fprintln(os.Stderr, s)
- } else {
- return
- }
- }
+func CopyChanToPipe(in <-chan string, out io.Writer) {
+ for s := range in {
+ fmt.Fprintln(out, s)
}
}
-func FindStat(cgroup_root string, cgroup_parent string, container_id string, statgroup string, stat string) string {
- var path string
- path = fmt.Sprintf("%s/%s/%s/%s/%s.%s", cgroup_root, statgroup, cgroup_parent, container_id, statgroup, stat)
- if _, err := os.Stat(path); err == nil {
- return path
+var logChan chan string
+func LogPrintf(format string, args ...interface{}) {
+ if logChan == nil {
+ return
}
- path = fmt.Sprintf("%s/%s/%s/%s.%s", cgroup_root, cgroup_parent, container_id, statgroup, stat)
- if _, err := os.Stat(path); err == nil {
- return path
+ logChan <- fmt.Sprintf("crunchstat: " + format, args...)
+}
+
+func ReadAllOrWarn(in *os.File) ([]byte, error) {
+ content, err := ioutil.ReadAll(in)
+ if err != nil {
+ LogPrintf("read %s: %s", in.Name(), err)
}
- path = fmt.Sprintf("%s/%s/%s.%s", cgroup_root, statgroup, statgroup, stat)
- if _, err := os.Stat(path); err == nil {
- return path
+ return content, err
+}
+
+var reportedStatFile = map[string]string{}
+
+// Open the cgroup stats file in /sys/fs corresponding to the target
+// cgroup, and return an *os.File. If no stats file is available,
+// return nil.
+//
+// TODO: Instead of trying all options, choose a process in the
+// container, and read /proc/PID/cgroup to determine the appropriate
+// cgroup root for the given statgroup. (This will avoid falling back
+// to host-level stats during container setup and teardown.)
+func OpenStatFile(cgroup Cgroup, statgroup string, stat string) (*os.File, error) {
+ var paths = []string{
+ fmt.Sprintf("%s/%s/%s/%s/%s", cgroup.root, statgroup, cgroup.parent, cgroup.cid, stat),
+ fmt.Sprintf("%s/%s/%s/%s", cgroup.root, cgroup.parent, cgroup.cid, stat),
+ fmt.Sprintf("%s/%s/%s", cgroup.root, statgroup, stat),
+ fmt.Sprintf("%s/%s", cgroup.root, stat),
+ }
+ var path string
+ var file *os.File
+ var err error
+ for _, path = range paths {
+ file, err = os.Open(path)
+ if err == nil {
+ break
+ } else {
+ path = ""
+ }
}
- path = fmt.Sprintf("%s/%s.%s", cgroup_root, statgroup, stat)
- if _, err := os.Stat(path); err == nil {
- return path
+ if pathWas, ok := reportedStatFile[stat]; !ok || pathWas != path {
+ // Log whenever we start using a new/different cgroup
+ // stat file for a given statistic. This typically
+ // happens 1 to 3 times per statistic, depending on
+ // whether we happen to collect stats [a] before any
+ // processes have been created in the container and
+ // [b] after all contained processes have exited.
+ reportedStatFile[stat] = path
+ if path == "" {
+ LogPrintf("did not find stats file: stat %s, statgroup %s, cid %s, parent %s, root %s", stat, statgroup, cgroup.cid, cgroup.parent, cgroup.root)
+ } else {
+ LogPrintf("reading stats from %s", path)
+ }
}
- return ""
+ return file, err
}
-func PollCgroupStats(cgroup_root string, cgroup_parent string, container_id string, stderr chan string, poll int64) {
- //var last_usage int64 = 0
- var last_user int64 = 0
- var last_sys int64 = 0
- var last_cpucount int64 = 0
-
- type Disk struct {
- last_read int64
- next_read int64
- last_write int64
- next_write int64
+func GetContainerNetStats(cgroup Cgroup) (io.Reader, error) {
+ procsFile, err := OpenStatFile(cgroup, "cpuacct", "cgroup.procs")
+ if err != nil {
+ return nil, err
}
+ defer procsFile.Close()
+ reader := bufio.NewScanner(procsFile)
+ for reader.Scan() {
+ taskPid := reader.Text()
+ statsFilename := fmt.Sprintf("/proc/%s/net/dev", taskPid)
+ stats, err := ioutil.ReadFile(statsFilename)
+ if err != nil {
+ LogPrintf("read %s: %s", statsFilename, err)
+ continue
+ }
+ return strings.NewReader(string(stats)), nil
+ }
+	return nil, errors.New("could not read stats for any proc in container")
+}
- disk := make(map[string]*Disk)
-
- //cpuacct_usage := FindStat(cgroup_path, "cpuacct", "usage")
- cpuacct_stat := FindStat(cgroup_root, cgroup_parent, container_id, "cpuacct", "stat")
- blkio_io_service_bytes := FindStat(cgroup_root, cgroup_parent, container_id, "blkio", "io_service_bytes")
- cpuset_cpus := FindStat(cgroup_root, cgroup_parent, container_id, "cpuset", "cpus")
- memory_stat := FindStat(cgroup_root, cgroup_parent, container_id, "memory", "stat")
+type IoSample struct {
+ sampleTime time.Time
+ txBytes int64
+ rxBytes int64
+}
- if cpuacct_stat != "" {
- stderr <- fmt.Sprintf("crunchstat: reading stats from %s", cpuacct_stat)
- }
- if blkio_io_service_bytes != "" {
- stderr <- fmt.Sprintf("crunchstat: reading stats from %s", blkio_io_service_bytes)
+func DoBlkIoStats(cgroup Cgroup, lastSample map[string]IoSample) {
+ c, err := OpenStatFile(cgroup, "blkio", "blkio.io_service_bytes")
+ if err != nil {
+ return
}
- if cpuset_cpus != "" {
- stderr <- fmt.Sprintf("crunchstat: reading stats from %s", cpuset_cpus)
+ defer c.Close()
+ b := bufio.NewScanner(c)
+ var sampleTime = time.Now()
+ newSamples := make(map[string]IoSample)
+ for b.Scan() {
+ var device, op string
+ var val int64
+ if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &val); err != nil {
+ continue
+ }
+ var thisSample IoSample
+ var ok bool
+ if thisSample, ok = newSamples[device]; !ok {
+ thisSample = IoSample{sampleTime, -1, -1}
+ }
+ switch op {
+ case "Read":
+ thisSample.rxBytes = val
+ case "Write":
+ thisSample.txBytes = val
+ }
+ newSamples[device] = thisSample
}
- if memory_stat != "" {
- stderr <- fmt.Sprintf("crunchstat: reading stats from %s", memory_stat)
+ for dev, sample := range newSamples {
+ if sample.txBytes < 0 || sample.rxBytes < 0 {
+ continue
+ }
+ delta := ""
+ if prev, ok := lastSample[dev]; ok {
+ delta = fmt.Sprintf(" -- interval %.4f seconds %d write %d read",
+ sample.sampleTime.Sub(prev.sampleTime).Seconds(),
+ sample.txBytes-prev.txBytes,
+ sample.rxBytes-prev.rxBytes)
+ }
+ LogPrintf("blkio:%s %d write %d read%s", dev, sample.txBytes, sample.rxBytes, delta)
+ lastSample[dev] = sample
}
+}
- var elapsed int64 = poll
+type MemSample struct {
+ sampleTime time.Time
+ memStat map[string]int64
+}
- for {
- /*{
- c, _ := os.Open(cpuacct_usage)
- b, _ := ioutil.ReadAll(c)
- var next int64
- fmt.Sscanf(string(b), "%d", &next)
- if last_usage != 0 {
- stderr <- fmt.Sprintf("crunchstat: cpuacct.usage %v", (next-last_usage)/10000000)
- }
- //fmt.Printf("usage %d %d %d %d%%\n", last_usage, next, next-last_usage, (next-last_usage)/10000000)
- last_usage = next
- c.Close()
- }*/
- var cpus int64 = 0
- if cpuset_cpus != "" {
- c, _ := os.Open(cpuset_cpus)
- b, _ := ioutil.ReadAll(c)
- sp := strings.Split(string(b), ",")
- for _, v := range sp {
- var min, max int64
- n, _ := fmt.Sscanf(v, "%d-%d", &min, &max)
- if n == 2 {
- cpus += (max - min) + 1
- } else {
- cpus += 1
- }
- }
+func DoMemoryStats(cgroup Cgroup) {
+ c, err := OpenStatFile(cgroup, "memory", "memory.stat")
+ if err != nil {
+ return
+ }
+ defer c.Close()
+ b := bufio.NewScanner(c)
+ thisSample := MemSample{time.Now(), make(map[string]int64)}
+ wantStats := [...]string{"cache", "swap", "pgmajfault", "rss"}
+ for b.Scan() {
+ var stat string
+ var val int64
+ if _, err := fmt.Sscanf(string(b.Text()), "%s %d", &stat, &val); err != nil {
+ continue
+ }
+ thisSample.memStat[stat] = val
+ }
+ var outstat bytes.Buffer
+ for _, key := range wantStats {
+ if val, ok := thisSample.memStat[key]; ok {
+ outstat.WriteString(fmt.Sprintf(" %d %s", val, key))
+ }
+ }
+ LogPrintf("mem%s", outstat.String())
+}
- if cpus != last_cpucount {
- stderr <- fmt.Sprintf("crunchstat: cpuset.cpus %v", cpus)
- }
- last_cpucount = cpus
+func DoNetworkStats(cgroup Cgroup, lastSample map[string]IoSample) {
+ sampleTime := time.Now()
+ stats, err := GetContainerNetStats(cgroup)
+ if err != nil {
+ return
+ }
- c.Close()
+ scanner := bufio.NewScanner(stats)
+ for scanner.Scan() {
+ var ifName string
+ var rx, tx int64
+ words := strings.Fields(scanner.Text())
+ if len(words) != 17 {
+ // Skip lines with wrong format
+ continue
}
- if cpus == 0 {
- cpus = 1
+ ifName = strings.TrimRight(words[0], ":")
+ if ifName == "lo" || ifName == "" {
+ // Skip loopback interface and lines with wrong format
+ continue
}
- if cpuacct_stat != "" {
- c, _ := os.Open(cpuacct_stat)
- b, _ := ioutil.ReadAll(c)
- var next_user int64
- var next_sys int64
- fmt.Sscanf(string(b), "user %d\nsystem %d", &next_user, &next_sys)
- c.Close()
-
- if last_user != 0 {
- user_diff := next_user - last_user
- sys_diff := next_sys - last_sys
- // Assume we're reading stats based on 100
- // jiffies per second. Because the elapsed
- // time is in milliseconds, we need to boost
- // that to 1000 jiffies per second, then boost
- // it by another 100x to get a percentage, then
- // finally divide by the actual elapsed time
- // and the number of cpus to get average load
- // over the polling period.
- user_pct := (user_diff * 10 * 100) / (elapsed * cpus)
- sys_pct := (sys_diff * 10 * 100) / (elapsed * cpus)
-
- stderr <- fmt.Sprintf("crunchstat: cpuacct.stat user %v", user_pct)
- stderr <- fmt.Sprintf("crunchstat: cpuacct.stat sys %v", sys_pct)
- }
-
- /*fmt.Printf("user %d %d %d%%\n", last_user, next_user, next_user-last_user)
- fmt.Printf("sys %d %d %d%%\n", last_sys, next_sys, next_sys-last_sys)
- fmt.Printf("sum %d%%\n", (next_user-last_user)+(next_sys-last_sys))*/
- last_user = next_user
- last_sys = next_sys
+ if tx, err = strconv.ParseInt(words[9], 10, 64); err != nil {
+ continue
}
- if blkio_io_service_bytes != "" {
- c, _ := os.Open(blkio_io_service_bytes)
- b := bufio.NewScanner(c)
- var device, op string
- var next int64
- for b.Scan() {
- if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &next); err == nil {
- if disk[device] == nil {
- disk[device] = new(Disk)
- }
- if op == "Read" {
- disk[device].last_read = disk[device].next_read
- disk[device].next_read = next
- if disk[device].last_read > 0 && (disk[device].next_read != disk[device].last_read) {
- stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s read %v", device, disk[device].next_read-disk[device].last_read)
- }
- }
- if op == "Write" {
- disk[device].last_write = disk[device].next_write
- disk[device].next_write = next
- if disk[device].last_write > 0 && (disk[device].next_write != disk[device].last_write) {
- stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s write %v", device, disk[device].next_write-disk[device].last_write)
- }
- }
- }
- }
- c.Close()
+ if rx, err = strconv.ParseInt(words[1], 10, 64); err != nil {
+ continue
}
+ nextSample := IoSample{}
+ nextSample.sampleTime = sampleTime
+ nextSample.txBytes = tx
+ nextSample.rxBytes = rx
+ var delta string
+ if prev, ok := lastSample[ifName]; ok {
+ interval := nextSample.sampleTime.Sub(prev.sampleTime).Seconds()
+ delta = fmt.Sprintf(" -- interval %.4f seconds %d tx %d rx",
+ interval,
+ tx-prev.txBytes,
+ rx-prev.rxBytes)
+ }
+ LogPrintf("net:%s %d tx %d rx%s", ifName, tx, rx, delta)
+ lastSample[ifName] = nextSample
+ }
+}
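
Each data line of /proc/<pid>/net/dev has 17 whitespace-separated fields: the interface name (with a trailing colon) followed by 8 receive and 8 transmit counters, with received bytes in field 1 and transmitted bytes in field 9, as read above. A small sketch of that parse (editor's names, sample counters invented):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseNetDevLine extracts interface name, rx bytes, and tx bytes from
// one /proc/<pid>/net/dev line, skipping malformed lines and loopback.
func parseNetDevLine(line string) (string, int64, int64, bool) {
	words := strings.Fields(line)
	if len(words) != 17 {
		return "", 0, 0, false
	}
	name := strings.TrimRight(words[0], ":")
	rx, errRx := strconv.ParseInt(words[1], 10, 64)
	tx, errTx := strconv.ParseInt(words[9], 10, 64)
	if name == "lo" || errRx != nil || errTx != nil {
		return "", 0, 0, false
	}
	return name, rx, tx, true
}

func main() {
	line := "eth0: 12345 10 0 0 0 0 0 0 6789 7 0 0 0 0 0 0"
	if name, rx, tx, ok := parseNetDevLine(line); ok {
		fmt.Printf("net:%s %d tx %d rx\n", name, tx, rx)
	}
}
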
- if memory_stat != "" {
- c, _ := os.Open(memory_stat)
- b := bufio.NewScanner(c)
- var stat string
- var val int64
- for b.Scan() {
- if _, err := fmt.Sscanf(string(b.Text()), "%s %d", &stat, &val); err == nil {
- if stat == "rss" {
- stderr <- fmt.Sprintf("crunchstat: memory.stat rss %v", val)
- }
- }
- }
- c.Close()
+type CpuSample struct {
+ hasData bool // to distinguish the zero value from real data
+ sampleTime time.Time
+ user float64
+ sys float64
+ cpus int64
+}
+
+// Return the number of CPUs available in the container. Return 0 if
+// we can't figure out the real number of CPUs.
+func GetCpuCount(cgroup Cgroup) int64 {
+ cpusetFile, err := OpenStatFile(cgroup, "cpuset", "cpuset.cpus")
+ if err != nil {
+ return 0
+ }
+ defer cpusetFile.Close()
+	b, err := ReadAllOrWarn(cpusetFile)
+	if err != nil {
+		return 0
+	}
+ sp := strings.Split(string(b), ",")
+ cpus := int64(0)
+ for _, v := range sp {
+ var min, max int64
+ n, _ := fmt.Sscanf(v, "%d-%d", &min, &max)
+ if n == 2 {
+ cpus += (max - min) + 1
+ } else {
+ cpus += 1
}
+ }
+ return cpus
+}
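
cpuset.cpus lists comma-separated CPU ids and inclusive ranges, e.g. "0-3,6,8-9". A worked example of the counting logic above (editor's sketch, mirroring the Sscanf trick):

package main

import (
	"fmt"
	"strings"
)

// countCpus counts CPUs in a cpuset.cpus string: each entry is either a
// single id ("6") or an inclusive range ("0-3").
func countCpus(cpuset string) int64 {
	var cpus int64
	for _, v := range strings.Split(strings.TrimSpace(cpuset), ",") {
		var min, max int64
		if n, _ := fmt.Sscanf(v, "%d-%d", &min, &max); n == 2 {
			cpus += max - min + 1
		} else {
			cpus++
		}
	}
	return cpus
}

func main() {
	fmt.Println(countCpus("0-3,6,8-9")) // prints 7
}
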
- bedtime := time.Now()
- time.Sleep(time.Duration(poll) * time.Millisecond)
- morning := time.Now()
- elapsed = morning.Sub(bedtime).Nanoseconds() / int64(time.Millisecond)
+func DoCpuStats(cgroup Cgroup, lastSample *CpuSample) {
+ statFile, err := OpenStatFile(cgroup, "cpuacct", "cpuacct.stat")
+ if err != nil {
+ return
}
+ defer statFile.Close()
+ b, err := ReadAllOrWarn(statFile)
+ if err != nil {
+ return
+ }
+
+ nextSample := CpuSample{true, time.Now(), 0, 0, GetCpuCount(cgroup)}
+ var userTicks, sysTicks int64
+ fmt.Sscanf(string(b), "user %d\nsystem %d", &userTicks, &sysTicks)
+ user_hz := float64(C.sysconf(C._SC_CLK_TCK))
+ nextSample.user = float64(userTicks) / user_hz
+ nextSample.sys = float64(sysTicks) / user_hz
+
+ delta := ""
+ if lastSample.hasData {
+ delta = fmt.Sprintf(" -- interval %.4f seconds %.4f user %.4f sys",
+ nextSample.sampleTime.Sub(lastSample.sampleTime).Seconds(),
+ nextSample.user-lastSample.user,
+ nextSample.sys-lastSample.sys)
+ }
+ LogPrintf("cpu %.4f user %.4f sys %d cpus%s",
+ nextSample.user, nextSample.sys, nextSample.cpus, delta)
+ *lastSample = nextSample
}
-func main() {
+func PollCgroupStats(cgroup Cgroup, poll int64, stop_poll_chan <-chan bool) {
+ var lastNetSample = map[string]IoSample{}
+ var lastDiskSample = map[string]IoSample{}
+ var lastCpuSample = CpuSample{}
+
+ poll_chan := make(chan bool, 1)
+ go func() {
+ // Send periodic poll events.
+ poll_chan <- true
+ for {
+ time.Sleep(time.Duration(poll) * time.Millisecond)
+ poll_chan <- true
+ }
+ }()
+ for {
+ select {
+ case <-stop_poll_chan:
+ return
+ case <-poll_chan:
+ // Emit stats, then select again.
+ }
+ DoMemoryStats(cgroup)
+ DoCpuStats(cgroup, &lastCpuSample)
+ DoBlkIoStats(cgroup, lastDiskSample)
+ DoNetworkStats(cgroup, lastNetSample)
+ }
+}
+
+func run(logger *log.Logger) error {
var (
cgroup_root string
flag.Parse()
- logger := log.New(os.Stderr, "crunchstat: ", 0)
-
if cgroup_root == "" {
logger.Fatal("Must provide -cgroup-root")
}
- // Make output channel
- stdout_chan := make(chan string)
- stderr_chan := make(chan string)
+ logChan = make(chan string, 1)
+ defer close(logChan)
finish_chan := make(chan bool)
- defer close(stdout_chan)
- defer close(stderr_chan)
defer close(finish_chan)
- go OutputChannel(stdout_chan, stderr_chan)
+ go CopyChanToPipe(logChan, os.Stderr)
var cmd *exec.Cmd
logger.Print("Running ", flag.Args())
- // Child process will read from our stdin pipe (we
- // close our copy below)
+ // Child process will use our stdin and stdout pipes
+ // (we close our copies below)
cmd.Stdin = os.Stdin
+ cmd.Stdout = os.Stdout
// Forward SIGINT and SIGTERM to inner process
term := make(chan os.Signal, 1)
if cmd.Process != nil {
cmd.Process.Signal(catch)
}
- logger.Print("caught signal:", catch)
+ logger.Print("caught signal: ", catch)
}(term)
signal.Notify(term, syscall.SIGTERM)
signal.Notify(term, syscall.SIGINT)
- // Funnel stdout and stderr from subprocess to output channels
- stdout_pipe, err := cmd.StdoutPipe()
- if err != nil {
- logger.Fatal(err)
- }
- go ReadLineByLine(stdout_pipe, stdout_chan, finish_chan)
-
+ // Funnel stderr through our channel
stderr_pipe, err := cmd.StderrPipe()
if err != nil {
logger.Fatal(err)
}
- go ReadLineByLine(stderr_pipe, stderr_chan, finish_chan)
+ go CopyPipeToChan(stderr_pipe, logChan, finish_chan)
// Run subprocess
if err := cmd.Start(); err != nil {
logger.Fatal(err)
}
- }
- // Close standard input in this (parent) process
- os.Stdin.Close()
+ // Close stdin/stdout in this (parent) process
+ os.Stdin.Close()
+ os.Stdout.Close()
+ }
// Read the cid file
var container_id string
if cgroup_cidfile != "" {
// wait up to 'wait' seconds for the cid file to appear
+ ok := false
var i time.Duration
for i = 0; i < time.Duration(wait)*time.Second; i += (100 * time.Millisecond) {
- f, err := os.Open(cgroup_cidfile)
- if err == nil {
- cid, err2 := ioutil.ReadAll(f)
- if err2 == nil && len(cid) > 0 {
- container_id = string(cid)
- f.Close()
- break
- }
+ cid, err := ioutil.ReadFile(cgroup_cidfile)
+ if err == nil && len(cid) > 0 {
+ ok = true
+ container_id = string(cid)
+ break
}
time.Sleep(100 * time.Millisecond)
}
- if cgroup_root == "" {
+ if !ok {
logger.Printf("Could not read cid file %s", cgroup_cidfile)
}
}
- go PollCgroupStats(cgroup_root, cgroup_parent, container_id, stderr_chan, poll)
+ stop_poll_chan := make(chan bool, 1)
+ cgroup := Cgroup{cgroup_root, cgroup_parent, container_id}
+ go PollCgroupStats(cgroup, poll, stop_poll_chan)
- // Wait for each of stdout and stderr to drain
- <-finish_chan
+ // When the child exits, tell the polling goroutine to stop.
+ defer func() { stop_poll_chan <- true }()
+
+ // Wait for CopyPipeToChan to consume child's stderr pipe
<-finish_chan
- if err := cmd.Wait(); err != nil {
+ return cmd.Wait()
+}
+
+func main() {
+ logger := log.New(os.Stderr, "crunchstat: ", 0)
+ if err := run(logger); err != nil {
if exiterr, ok := err.(*exec.ExitError); ok {
// The program has exited with an exit code != 0
- // This works on both Unix and Windows. Although package
- // syscall is generally platform dependent, WaitStatus is
- // defined for both Unix and Windows and in both cases has
- // an ExitStatus() method with the same signature.
+ // This works on both Unix and
+ // Windows. Although package syscall is
+ // generally platform dependent, WaitStatus is
+ // defined for both Unix and Windows and in
+ // both cases has an ExitStatus() method with
+ // the same signature.
if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
os.Exit(status.ExitStatus())
}
--- /dev/null
+package main
+
+import (
+ "os"
+ "regexp"
+ "testing"
+)
+
+func TestReadAllOrWarnFail(t *testing.T) {
+ logChan = make(chan string)
+ go func() {
+ defer close(logChan)
+ // The special file /proc/self/mem can be opened for
+ // reading, but reading from byte 0 returns an error.
+ f, err := os.Open("/proc/self/mem")
+ if err != nil {
+ t.Fatalf("Opening /proc/self/mem: %s", err)
+ }
+ if x, err := ReadAllOrWarn(f); err == nil {
+ t.Fatalf("Expected error, got %v", x)
+ }
+ }()
+ if _, ok := <-logChan; !ok {
+		t.Fatalf("Expected a log message about the read error")
+ }
+ if msg, ok := <-logChan; ok {
+ t.Fatalf("Expected channel to close, got %s", msg)
+ }
+}
+
+func TestReadAllOrWarnSuccess(t *testing.T) {
+ logChan = make(chan string)
+ go func() {
+ defer close(logChan)
+ f, err := os.Open("./crunchstat_test.go")
+ if err != nil {
+ t.Fatalf("Opening ./crunchstat_test.go: %s", err)
+ }
+ data, err := ReadAllOrWarn(f)
+ if err != nil {
+ t.Fatalf("got error %s", err)
+ }
+ if matched, err := regexp.MatchString("^package main\n", string(data)); err != nil || !matched {
+ t.Fatalf("data failed regexp: %s", err)
+ }
+ }()
+ if msg, ok := <-logChan; ok {
+ t.Fatalf("Expected channel to close, got %s", msg)
+ }
+}
import json
import logging
import time
+import _strptime  # See <http://bugs.python.org/issue7980#msg221094>.
import calendar
import threading
from arvados.util import portable_data_hash_pattern, uuid_pattern, collection_uuid_pattern, group_uuid_pattern, user_uuid_pattern, link_uuid_pattern
if pidfile != "" {
f, err := os.Create(pidfile)
- if err == nil {
- fmt.Fprint(f, os.Getpid())
- f.Close()
- } else {
- log.Printf("Error writing pid file (%s): %s", pidfile, err.Error())
+ if err != nil {
+ log.Fatalf("Error writing pid file (%s): %s", pidfile, err.Error())
}
+ fmt.Fprint(f, os.Getpid())
+ f.Close()
+ defer os.Remove(pidfile)
}
kc.Want_replicas = default_replicas
s := <-sig
log.Println("caught signal:", s)
listener.Close()
+ listener = nil
}(term)
signal.Notify(term, syscall.SIGTERM)
signal.Notify(term, syscall.SIGINT)
- if pidfile != "" {
- f, err := os.Create(pidfile)
- if err == nil {
- fmt.Fprint(f, os.Getpid())
- f.Close()
- } else {
- log.Printf("Error writing pid file (%s): %s", pidfile, err.Error())
- }
- }
-
log.Printf("Arvados Keep proxy started listening on %v with server list %v", listener.Addr(), kc.ServiceRoots())
// Start listening for requests.
http.Serve(listener, MakeRESTRouter(!no_get, !no_put, &kc))
log.Println("shutting down")
-
- if pidfile != "" {
- os.Remove(pidfile)
- }
}
type ApiTokenCache struct {
return fmt.Sprintf("%s/../../sdk/python/tests", cwd)
}
+// Wait (up to 1 second) for keepproxy to listen on a port. This
+// avoids a race condition where we hit a "connection refused" error
+// because we start testing the proxy too soon.
+func waitForListener() {
+	const ms = 5
+ for i := 0; listener == nil && i < 1000; i += ms {
+ time.Sleep(ms * time.Millisecond)
+ }
+ if listener == nil {
+ log.Fatalf("Timed out waiting for listener to start")
+ }
+}
+
+func closeListener() {
+ if listener != nil {
+ listener.Close()
+ }
+}
+
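
waitForListener above polls a shared package variable to dodge the "connection refused" race. An alternative sketch that waits on the network itself instead of a shared variable (editor's sketch; the address is illustrative):

package main

import (
	"fmt"
	"net"
	"time"
)

// waitForTCP polls until a TCP listener accepts connections at addr, or
// gives up after timeout. Same race-avoidance idea, expressed by
// probing the socket rather than inspecting the server's state.
func waitForTCP(addr string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		if conn, err := net.DialTimeout("tcp", addr, 100*time.Millisecond); err == nil {
			conn.Close()
			return nil
		}
		time.Sleep(5 * time.Millisecond)
	}
	return fmt.Errorf("timed out waiting for %s", addr)
}

func main() {
	if err := waitForTCP("127.0.0.1:29951", time.Second); err != nil {
		fmt.Println(err)
	}
}
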
func (s *ServerRequiredSuite) SetUpSuite(c *C) {
cwd, _ := os.Getwd()
defer os.Chdir(cwd)
os.Setenv("ARVADOS_EXTERNAL_CLIENT", "")
log.Print("keepclient created")
- defer listener.Close()
+ waitForListener()
+ defer closeListener()
hash := fmt.Sprintf("%x", md5.Sum([]byte("foo")))
var hash2 string
log.Print("TestPutAndGet start")
kc := runProxy(c, []string{"keepproxy"}, "123abc", 29951)
- defer listener.Close()
+ waitForListener()
+ defer closeListener()
log.Print("keepclient created")
log.Print("TestGetDisabled start")
kc := runProxy(c, []string{"keepproxy", "-no-get"}, "4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h", 29952)
- defer listener.Close()
+ waitForListener()
+ defer closeListener()
hash := fmt.Sprintf("%x", md5.Sum([]byte("baz")))
log.Print("TestPutDisabled start")
kc := runProxy(c, []string{"keepproxy", "-no-put"}, "4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h", 29953)
- defer listener.Close()
+ waitForListener()
+ defer closeListener()
{
hash2, rep, err := kc.PutB([]byte("quux"))
}
// Test /index requests:
-// - enforce_permissions off | unauthenticated /index request
-// - enforce_permissions off | unauthenticated /index/prefix request
-// - enforce_permissions off | authenticated /index request | non-superuser
-// - enforce_permissions off | authenticated /index/prefix request | non-superuser
-// - enforce_permissions off | authenticated /index request | superuser
-// - enforce_permissions off | authenticated /index/prefix request | superuser
-// - enforce_permissions on | unauthenticated /index request
-// - enforce_permissions on | unauthenticated /index/prefix request
-// - enforce_permissions on | authenticated /index request | non-superuser
-// - enforce_permissions on | authenticated /index/prefix request | non-superuser
-// - enforce_permissions on | authenticated /index request | superuser
-// - enforce_permissions on | authenticated /index/prefix request | superuser
+// - unauthenticated /index request
+// - unauthenticated /index/prefix request
+// - authenticated /index request | non-superuser
+// - authenticated /index/prefix request | non-superuser
+// - authenticated /index request | superuser
+// - authenticated /index/prefix request | superuser
//
// The only /index requests that should succeed are those issued by the
-// superuser when enforce_permissions = true.
+// superuser. They should pass regardless of the value of enforce_permissions.
//
func TestIndexHandler(t *testing.T) {
defer teardown()
api_token: data_manager_token,
}
- // ----------------------------
- // enforce_permissions disabled
- // All /index requests should fail.
- enforce_permissions = false
+ // -------------------------------------------------------------
+ // Only the superuser should be allowed to issue /index requests.
+
+ // ---------------------------
+ // enforce_permissions enabled
+	// This setting should not change any of the outcomes below.
+ enforce_permissions = true
// unauthenticated /index request
- // => PermissionError
+ // => UnauthorizedError
response := IssueRequest(rest, unauthenticated_req)
ExpectStatusCode(t,
- "enforce_permissions off, unauthenticated request",
- PermissionError.HTTPCode,
+ "enforce_permissions on, unauthenticated request",
+ UnauthorizedError.HTTPCode,
response)
// unauthenticated /index/prefix request
- // => PermissionError
+ // => UnauthorizedError
response = IssueRequest(rest, unauth_prefix_req)
ExpectStatusCode(t,
- "enforce_permissions off, unauthenticated /index/prefix request",
- PermissionError.HTTPCode,
+ "permissions on, unauthenticated /index/prefix request",
+ UnauthorizedError.HTTPCode,
response)
// authenticated /index request, non-superuser
- // => PermissionError
+ // => UnauthorizedError
response = IssueRequest(rest, authenticated_req)
ExpectStatusCode(t,
- "enforce_permissions off, authenticated request, non-superuser",
- PermissionError.HTTPCode,
+ "permissions on, authenticated request, non-superuser",
+ UnauthorizedError.HTTPCode,
response)
// authenticated /index/prefix request, non-superuser
- // => PermissionError
+ // => UnauthorizedError
response = IssueRequest(rest, auth_prefix_req)
ExpectStatusCode(t,
- "enforce_permissions off, authenticated /index/prefix request, non-superuser",
- PermissionError.HTTPCode,
+ "permissions on, authenticated /index/prefix request, non-superuser",
+ UnauthorizedError.HTTPCode,
response)
- // authenticated /index request, superuser
- // => PermissionError
+ // superuser /index request
+ // => OK
response = IssueRequest(rest, superuser_req)
ExpectStatusCode(t,
- "enforce_permissions off, superuser request",
- PermissionError.HTTPCode,
- response)
-
- // superuser /index/prefix request
- // => PermissionError
- response = IssueRequest(rest, superuser_prefix_req)
- ExpectStatusCode(t,
- "enforce_permissions off, superuser /index/prefix request",
- PermissionError.HTTPCode,
- response)
-
- // ---------------------------
- // enforce_permissions enabled
- // Only the superuser should be allowed to issue /index requests.
- enforce_permissions = true
-
- // unauthenticated /index request
- // => PermissionError
- response = IssueRequest(rest, unauthenticated_req)
- ExpectStatusCode(t,
- "enforce_permissions on, unauthenticated request",
- PermissionError.HTTPCode,
- response)
-
- // unauthenticated /index/prefix request
- // => PermissionError
- response = IssueRequest(rest, unauth_prefix_req)
- ExpectStatusCode(t,
- "permissions on, unauthenticated /index/prefix request",
- PermissionError.HTTPCode,
- response)
-
- // authenticated /index request, non-superuser
- // => PermissionError
- response = IssueRequest(rest, authenticated_req)
- ExpectStatusCode(t,
- "permissions on, authenticated request, non-superuser",
- PermissionError.HTTPCode,
+ "permissions on, superuser request",
+ http.StatusOK,
response)
- // authenticated /index/prefix request, non-superuser
- // => PermissionError
- response = IssueRequest(rest, auth_prefix_req)
- ExpectStatusCode(t,
- "permissions on, authenticated /index/prefix request, non-superuser",
- PermissionError.HTTPCode,
- response)
+ // ----------------------------
+ // enforce_permissions disabled
+	// Valid requests should still pass.
+ enforce_permissions = false
// superuser /index request
// => OK
http.StatusOK,
response)
+
expected := `^` + TEST_HASH + `\+\d+ \d+\n` +
TEST_HASH_2 + `\+\d+ \d+\n$`
match, _ := regexp.MatchString(expected, response.Body.String())
// A HandleFunc to address /index and /index/{prefix} requests.
//
func IndexHandler(resp http.ResponseWriter, req *http.Request) {
- prefix := mux.Vars(req)["prefix"]
-
- // Only the data manager may issue /index requests,
- // and only if enforce_permissions is enabled.
- // All other requests return 403 Forbidden.
- api_token := GetApiToken(req)
- if !enforce_permissions ||
- api_token == "" ||
- data_manager_token != api_token {
- http.Error(resp, PermissionError.Error(), PermissionError.HTTPCode)
+ // Reject unauthorized requests.
+ if !IsDataManagerToken(GetApiToken(req)) {
+ http.Error(resp, UnauthorizedError.Error(), UnauthorizedError.HTTPCode)
+ log.Printf("%s %s: %s\n", req.Method, req.URL, UnauthorizedError.Error())
return
}
+
+ prefix := mux.Vars(req)["prefix"]
+
var index string
for _, vol := range KeepVM.Volumes() {
index = index + vol.Index(prefix)
"@" + timestamp_hex
}
+var signedLocatorRe = regexp.MustCompile(`^([[:xdigit:]]{32}).*\+A([[:xdigit:]]{40})@([[:xdigit:]]{8})`)
+
// VerifySignature returns true if the signature on the signed_locator
// can be verified using the given api_token.
func VerifySignature(signed_locator string, api_token string) bool {
- if re, err := regexp.Compile(`^([a-f0-9]{32}(\+[0-9]+)?).*\+A[[:xdigit:]]+@([[:xdigit:]]{8})`); err == nil {
- if matches := re.FindStringSubmatch(signed_locator); matches != nil {
- blob_locator := matches[1]
- timestamp_hex := matches[3]
- if expire_ts, err := ParseHexTimestamp(timestamp_hex); err == nil {
- // Fail signatures with expired timestamps.
- if expire_ts.Before(time.Now()) {
- return false
- }
- return signed_locator == SignLocator(blob_locator, api_token, expire_ts)
- }
- }
+ matches := signedLocatorRe.FindStringSubmatch(signed_locator)
+ if matches == nil {
+ // Could not find a permission signature at all
+ return false
+ }
+ blob_hash := matches[1]
+ sig_hex := matches[2]
+ exp_hex := matches[3]
+ if exp_time, err := ParseHexTimestamp(exp_hex); err != nil || exp_time.Before(time.Now()) {
+ // Signature is expired, or timestamp is unparseable
+ return false
}
- return false
+ return sig_hex == MakePermSignature(blob_hash, api_token, exp_hex)
}
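
The signed-locator format parsed here is a 32-hex-digit hash, optional hints, then +A<40-hex signature>@<8-hex expiry>. A standalone sketch applying the same regexp to the known_* fixture values that appear below (editor's sketch):

package main

import (
	"fmt"
	"regexp"
)

var signedLocatorRe = regexp.MustCompile(`^([[:xdigit:]]{32}).*\+A([[:xdigit:]]{40})@([[:xdigit:]]{8})`)

func main() {
	// A locator shaped like the test fixtures: hash+size+Asignature@expiry.
	loc := "acbd18db4cc2f85cedef654fccc4a4d8+3" +
		"+A257f3f5f5f0a4e4626a18fc74bd42ec34dcb228a@7fffffff"
	m := signedLocatorRe.FindStringSubmatch(loc)
	fmt.Println("hash:", m[1]) // acbd18db4cc2f85cedef654fccc4a4d8
	fmt.Println("sig: ", m[2]) // 257f3f5f5f0a4e4626a18fc74bd42ec34dcb228a
	fmt.Println("exp: ", m[3]) // 7fffffff
}
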
func ParseHexTimestamp(timestamp_hex string) (ts time.Time, err error) {
"time"
)
-var (
+const (
known_hash = "acbd18db4cc2f85cedef654fccc4a4d8"
known_locator = known_hash + "+3"
known_token = "hocfupkn2pjhrpgp2vxv8rsku7tvtx49arbc9s4bvu7p7wxqvk"
"786u5rw2a9gx743dj3fgq2irk"
known_signature = "257f3f5f5f0a4e4626a18fc74bd42ec34dcb228a"
known_timestamp = "7fffffff"
- known_signed_locator = known_locator + "+A" + known_signature + "@" + known_timestamp
+ known_sig_hint = "+A" + known_signature + "@" + known_timestamp
+ known_signed_locator = known_locator + known_sig_hint
)
func TestSignLocator(t *testing.T) {
}
}
+func TestVerifySignatureExtraHints(t *testing.T) {
+ PermissionSecret = []byte(known_key)
+ defer func() { PermissionSecret = nil }()
+
+ if !VerifySignature(known_locator + "+K@xyzzy" + known_sig_hint, known_token) {
+ t.Fatal("Verify cannot handle hint before permission signature")
+ }
+
+ if !VerifySignature(known_locator + known_sig_hint + "+Zfoo", known_token) {
+ t.Fatal("Verify cannot handle hint after permission signature")
+ }
+
+ if !VerifySignature(known_locator + "+K@xyzzy" + known_sig_hint + "+Zfoo", known_token) {
+ t.Fatal("Verify cannot handle hints around permission signature")
+ }
+}
+
// The size hint on the locator string should not affect signature validation.
func TestVerifySignatureWrongSize(t *testing.T) {
PermissionSecret = []byte(known_key)
defer func() { PermissionSecret = nil }()
- signed_locator_wrong_size := known_hash + "+999999+A" + known_signature + "@" + known_timestamp
- if !VerifySignature(signed_locator_wrong_size, known_token) {
- t.Fail()
+ if !VerifySignature(known_hash + "+999999" + known_sig_hint, known_token) {
+ t.Fatal("Verify cannot handle incorrect size hint")
+ }
+
+ if !VerifySignature(known_hash + known_sig_hint, known_token) {
+ t.Fatal("Verify cannot handle missing size hint")
}
}
if err != nil {
return err
}
+ defer f.Close()
if e := lockfile(f); e != nil {
return e
}
if err != nil {
return err
}
+ defer f.Close()
if e := lockfile(f); e != nil {
return e
}
"fmt"
"io/ioutil"
"os"
+ "syscall"
"testing"
"time"
)
if err := v.Put(TEST_HASH, TEST_BLOCK); err != nil {
t.Error(err)
}
- old_mtime, err := v.Mtime(TEST_HASH)
- if err != nil {
- t.Error(err)
- }
- if old_mtime.IsZero() {
- t.Errorf("v.Mtime(%s) returned a zero mtime\n", TEST_HASH)
+
+ // We'll verify { t0 < threshold < t1 }, where t0 is the
+ // existing block's timestamp on disk before Put() and t1 is
+ // its timestamp after Put().
+ threshold := time.Now().Add(-time.Second)
+
+ // Set the stored block's mtime far enough in the past that we
+ // can see the difference between "timestamp didn't change"
+ // and "timestamp granularity is too low".
+ {
+ oldtime := time.Now().Add(-20 * time.Second).Unix()
+ if err := syscall.Utime(v.blockPath(TEST_HASH),
+ &syscall.Utimbuf{oldtime, oldtime}); err != nil {
+ t.Error(err)
+ }
+
+ // Make sure v.Mtime() agrees the above Utime really worked.
+ if t0, err := v.Mtime(TEST_HASH); err != nil || t0.IsZero() || !t0.Before(threshold) {
+ t.Errorf("Setting mtime failed: %v, %v", t0, err)
+ }
}
- // Sleep for 1s, then put the block again. The volume
- // should report a more recent mtime.
- //
- // TODO(twp): this would be better handled with a mock Time object.
- // Alternatively, set the mtime manually to some moment in the past
- // (maybe a v.SetMtime method?)
- //
- time.Sleep(time.Second)
+
+ // Write the same block again.
if err := v.Put(TEST_HASH, TEST_BLOCK); err != nil {
t.Error(err)
}
- new_mtime, err := v.Mtime(TEST_HASH)
+
+ // Verify threshold < t1
+ t1, err := v.Mtime(TEST_HASH)
if err != nil {
t.Error(err)
}
-
- if !new_mtime.After(old_mtime) {
- t.Errorf("v.Put did not update the block mtime:\nold_mtime = %v\nnew_mtime = %v\n",
- old_mtime, new_mtime)
+ if t1.Before(threshold) {
+		t.Errorf("t1 %v must be >= threshold %v after v.Put",
+ t1, threshold)
}
}
--- /dev/null
+*.pyc
+*.egg
+*.egg-info
+build/
+dist/
--- /dev/null
+====================
+Arvados Node Manager
+====================
+
+Overview
+--------
+
+This package provides ``arvados-node-manager``. It dynamically starts
+and stops compute nodes on an Arvados_ cloud installation based on job
+demand.
+
+.. _Arvados: https://arvados.org/
+
+Setup
+-----
+
+1. Install the package.
+
+2. Write a configuration file. ``doc/ec2.example.cfg`` documents all
+ of the options available, with specific tunables for EC2 clouds.
+
+3. Run ``arvados-node-manager --config YOURCONFIGFILE`` using whatever
+ supervisor you like (e.g., runit).
+
+Testing and Development
+-----------------------
+
+To run tests, just run::
+
+ python setup.py test
+
+Our `hacking guide
+<https://arvados.org/projects/arvados/wiki/Hacking_Node_Manager>`_
+provides an architectural overview of the Arvados Node Manager to help
+you find your way around the source. The `Lifecycle of an Arvados
+compute node
+<https://arvados.org/projects/arvados/wiki/Lifecycle_of_an_Arvados_compute_node>`_
+page explains how it works in concert with other Arvados components to
+prepare a node for compute work.
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import _strptime # See <http://bugs.python.org/issue7980#msg221094>.
+import logging
+
+logger = logging.getLogger('arvnodeman')
+logger.addHandler(logging.NullHandler())
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import logging
+import time
+
+import pykka
+
+from .config import actor_class
+
+def _notify_subscribers(response, subscribers):
+ """Send the response to all the subscriber methods.
+
+ If any of the subscriber actors have stopped, remove them from the
+ subscriber set.
+ """
+ dead_subscribers = set()
+ for subscriber in subscribers:
+ try:
+ subscriber(response)
+ except pykka.ActorDeadError:
+ dead_subscribers.add(subscriber)
+ subscribers.difference_update(dead_subscribers)
+
+class RemotePollLoopActor(actor_class):
+ """Abstract actor class to regularly poll a remote service.
+
+ This actor sends regular requests to a remote service, and sends each
+ response to subscribers. It takes care of error handling, and retrying
+ requests with exponential backoff.
+
+ To use this actor, define CLIENT_ERRORS and the _send_request method.
+ If you also define an _item_key method, this class will support
+ subscribing to a specific item by key in responses.
+ """
+ CLIENT_ERRORS = ()
+
+ def __init__(self, client, timer_actor, poll_wait=60, max_poll_wait=180):
+ super(RemotePollLoopActor, self).__init__()
+ self._client = client
+ self._timer = timer_actor
+ self._logger = logging.getLogger(self.LOGGER_NAME)
+ self._later = self.actor_ref.proxy()
+ self._polling_started = False
+ self.log_prefix = "{} (at {})".format(self.__class__.__name__, id(self))
+ self.min_poll_wait = poll_wait
+ self.max_poll_wait = max_poll_wait
+ self.poll_wait = self.min_poll_wait
+ self.all_subscribers = set()
+ self.key_subscribers = {}
+ if hasattr(self, '_item_key'):
+ self.subscribe_to = self._subscribe_to
+
+ def _start_polling(self):
+ if not self._polling_started:
+ self._polling_started = True
+ self._later.poll()
+
+ def subscribe(self, subscriber):
+ self.all_subscribers.add(subscriber)
+ self._logger.debug("%r subscribed to all events", subscriber)
+ self._start_polling()
+
+ # __init__ exposes this method to the proxy if the subclass defines
+ # _item_key.
+ def _subscribe_to(self, key, subscriber):
+ self.key_subscribers.setdefault(key, set()).add(subscriber)
+ self._logger.debug("%r subscribed to events for '%s'", subscriber, key)
+ self._start_polling()
+
+ def _send_request(self):
+ raise NotImplementedError("subclasses must implement request method")
+
+ def _got_response(self, response):
+ self._logger.debug("%s got response with %d items",
+ self.log_prefix, len(response))
+ self.poll_wait = self.min_poll_wait
+ _notify_subscribers(response, self.all_subscribers)
+ if hasattr(self, '_item_key'):
+ items = {self._item_key(x): x for x in response}
+ for key, subscribers in self.key_subscribers.iteritems():
+ _notify_subscribers(items.get(key), subscribers)
+
+ def _got_error(self, error):
+ self.poll_wait = min(self.poll_wait * 2, self.max_poll_wait)
+ return "{} got error: {} - waiting {} seconds".format(
+ self.log_prefix, error, self.poll_wait)
+
+ def poll(self, scheduled_start=None):
+ self._logger.debug("%s sending poll", self.log_prefix)
+ start_time = time.time()
+ if scheduled_start is None:
+ scheduled_start = start_time
+ try:
+ response = self._send_request()
+ except Exception as error:
+ errmsg = self._got_error(error)
+ if isinstance(error, self.CLIENT_ERRORS):
+ self._logger.warning(errmsg)
+ else:
+ self._logger.exception(errmsg)
+ next_poll = start_time + self.poll_wait
+ else:
+ self._got_response(response)
+ next_poll = scheduled_start + self.poll_wait
+ end_time = time.time()
+ if next_poll < end_time: # We've drifted too much; start fresh.
+ next_poll = end_time + self.poll_wait
+ self._timer.schedule(next_poll, self._later.poll, next_poll)
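
The scheduling logic above anchors successful polls to the scheduled start time (to avoid drift) and applies capped exponential backoff on errors. A compact sketch of the same policy, in Go for consistency with the other examples here (the patch itself is Python/pykka; all names are the editor's):

package main

import (
	"errors"
	"fmt"
	"time"
)

// pollLoop polls a remote service on a fixed cadence, resetting the
// wait on success and doubling it (up to maxWait) on error. If the
// next poll time has already passed, it starts fresh from "now".
func pollLoop(send func() error, minWait, maxWait time.Duration, rounds int) {
	wait := minWait
	next := time.Now()
	for i := 0; i < rounds; i++ {
		start := time.Now()
		if err := send(); err != nil {
			// Exponential backoff with a cap.
			wait *= 2
			if wait > maxWait {
				wait = maxWait
			}
			next = start.Add(wait)
			fmt.Println("poll error:", err, "- waiting", wait)
		} else {
			wait = minWait
			next = next.Add(wait)
		}
		if now := time.Now(); next.Before(now) {
			next = now.Add(wait) // drifted too much; start fresh
		}
		time.Sleep(time.Until(next))
	}
}

func main() {
	calls := 0
	pollLoop(func() error {
		calls++
		if calls%3 == 0 {
			return errors.New("transient failure")
		}
		return nil
	}, 10*time.Millisecond, 80*time.Millisecond, 6)
}
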
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import functools
+import itertools
+import logging
+import time
+
+import pykka
+
+from ..clientactor import _notify_subscribers
+from .. import config
+
+def arvados_node_fqdn(arvados_node, default_hostname='dynamic.compute'):
+ hostname = arvados_node.get('hostname') or default_hostname
+ return '{}.{}'.format(hostname, arvados_node['domain'])
+
+def arvados_node_mtime(node):
+ return time.mktime(time.strptime(node['modified_at'] + 'UTC',
+ '%Y-%m-%dT%H:%M:%SZ%Z')) - time.timezone
+
+def timestamp_fresh(timestamp, fresh_time):
+ return (time.time() - timestamp) < fresh_time
+
+class BaseComputeNodeDriver(object):
+ """Abstract base class for compute node drivers.
+
+ libcloud abstracts away many of the differences between cloud providers,
+ but managing compute nodes requires some cloud-specific features (e.g.,
+ on EC2 we use tags to identify compute nodes). Compute node drivers
+ are responsible for translating the node manager's cloud requests to a
+ specific cloud's vocabulary.
+
+ Subclasses must implement arvados_create_kwargs (to update node
+ creation kwargs with information about the specific Arvados node
+ record), sync_node, and node_start_time.
+ """
+ def __init__(self, auth_kwargs, list_kwargs, create_kwargs, driver_class):
+ self.real = driver_class(**auth_kwargs)
+ self.list_kwargs = list_kwargs
+ self.create_kwargs = create_kwargs
+
+ def __getattr__(self, name):
+ # Proxy non-extension methods to the real driver.
+ if (not name.startswith('_') and not name.startswith('ex_')
+ and hasattr(self.real, name)):
+ return getattr(self.real, name)
+ else:
+            # Fail the same way normal attribute lookup would.
+            raise AttributeError(name)
+
+ def search_for(self, term, list_method, key=lambda item: item.id):
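+        # Assumes the concrete driver subclass defines SEARCH_CACHE (a
+        # class-level dict) used to memoize these list_method lookups.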
+ cache_key = (list_method, term)
+ if cache_key not in self.SEARCH_CACHE:
+ results = [item for item in getattr(self.real, list_method)()
+ if key(item) == term]
+ count = len(results)
+ if count != 1:
+ raise ValueError("{} returned {} results for '{}'".format(
+ list_method, count, term))
+ self.SEARCH_CACHE[cache_key] = results[0]
+ return self.SEARCH_CACHE[cache_key]
+
+ def list_nodes(self):
+ return self.real.list_nodes(**self.list_kwargs)
+
+ def arvados_create_kwargs(self, arvados_node):
+ raise NotImplementedError("BaseComputeNodeDriver.arvados_create_kwargs")
+
+ def create_node(self, size, arvados_node):
+ kwargs = self.create_kwargs.copy()
+ kwargs.update(self.arvados_create_kwargs(arvados_node))
+ kwargs['size'] = size
+ return self.real.create_node(**kwargs)
+
+ def sync_node(self, cloud_node, arvados_node):
+ # When a compute node first pings the API server, the API server
+ # will automatically assign some attributes on the corresponding
+ # node record, like hostname. This method should propagate that
+ # information back to the cloud node appropriately.
+ raise NotImplementedError("BaseComputeNodeDriver.sync_node")
+
+ @classmethod
+ def node_start_time(cls, node):
+ raise NotImplementedError("BaseComputeNodeDriver.node_start_time")
+
+
+ComputeNodeDriverClass = BaseComputeNodeDriver
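+
+# A minimal subclass sketch with hypothetical names, showing the
+# three hooks drivers must provide: arvados_create_kwargs returns
+# e.g. {'name': arvados_node_fqdn(arvados_node)}; sync_node pushes
+# that name back to the cloud node; node_start_time returns an
+# epoch timestamp from cloud metadata. The dummy and ec2 drivers
+# below are complete examples.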
+
+class ComputeNodeStateChangeBase(config.actor_class):
+ """Base class for actors that change a compute node's state.
+
+ This base class takes care of retrying changes and notifying
+ subscribers when the change is finished.
+ """
+ def __init__(self, logger_name, timer_actor, retry_wait, max_retry_wait):
+ super(ComputeNodeStateChangeBase, self).__init__()
+ self._later = self.actor_ref.proxy()
+ self._timer = timer_actor
+ self._logger = logging.getLogger(logger_name)
+ self.min_retry_wait = retry_wait
+ self.max_retry_wait = max_retry_wait
+ self.retry_wait = retry_wait
+ self.subscribers = set()
+
+ @staticmethod
+ def _retry(errors):
+ """Retry decorator for an actor method that makes remote requests.
+
+ Use this function to decorate an actor method, passing in a
+ tuple of exceptions to catch. This decorator will schedule
+ retries of that method with exponential backoff if the
+ original method raises any of the given errors.
+ """
+ def decorator(orig_func):
+ @functools.wraps(orig_func)
+ def wrapper(self, *args, **kwargs):
+ try:
+ orig_func(self, *args, **kwargs)
+ except errors as error:
+ self._logger.warning(
+ "Client error: %s - waiting %s seconds",
+ error, self.retry_wait)
+ self._timer.schedule(self.retry_wait,
+ getattr(self._later,
+ orig_func.__name__),
+ *args, **kwargs)
+ self.retry_wait = min(self.retry_wait * 2,
+ self.max_retry_wait)
+ else:
+ self.retry_wait = self.min_retry_wait
+ return wrapper
+ return decorator
+
+ def _finished(self):
+ _notify_subscribers(self._later, self.subscribers)
+ self.subscribers = None
+
+ def subscribe(self, subscriber):
+ if self.subscribers is None:
+ try:
+ subscriber(self._later)
+ except pykka.ActorDeadError:
+ pass
+ else:
+ self.subscribers.add(subscriber)
+
+
+class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
+ """Actor to create and set up a cloud compute node.
+
+ This actor prepares an Arvados node record for a new compute node
+ (either creating one or cleaning one passed in), then boots the
+ actual compute node. It notifies subscribers when the cloud node
+ is successfully created (the last step in the process for Node
+ Manager to handle).
+ """
+ def __init__(self, timer_actor, arvados_client, cloud_client,
+ cloud_size, arvados_node=None,
+ retry_wait=1, max_retry_wait=180):
+ super(ComputeNodeSetupActor, self).__init__(
+ 'arvnodeman.nodeup', timer_actor, retry_wait, max_retry_wait)
+ self._arvados = arvados_client
+ self._cloud = cloud_client
+ self.cloud_size = cloud_size
+ self.arvados_node = None
+ self.cloud_node = None
+ if arvados_node is None:
+ self._later.create_arvados_node()
+ else:
+ self._later.prepare_arvados_node(arvados_node)
+
+ @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
+ def create_arvados_node(self):
+ self.arvados_node = self._arvados.nodes().create(body={}).execute()
+ self._later.create_cloud_node()
+
+ @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
+ def prepare_arvados_node(self, node):
+ self.arvados_node = self._arvados.nodes().update(
+ uuid=node['uuid'],
+ body={'hostname': None,
+ 'ip_address': None,
+ 'slot_number': None,
+ 'first_ping_at': None,
+ 'last_ping_at': None,
+ 'info': {'ec2_instance_id': None,
+ 'last_action': "Prepared by Node Manager"}}
+ ).execute()
+ self._later.create_cloud_node()
+
+ @ComputeNodeStateChangeBase._retry(config.CLOUD_ERRORS)
+ def create_cloud_node(self):
+ self._logger.info("Creating cloud node with size %s.",
+ self.cloud_size.name)
+ self.cloud_node = self._cloud.create_node(self.cloud_size,
+ self.arvados_node)
+ self._logger.info("Cloud node %s created.", self.cloud_node.id)
+ self._finished()
+
+ def stop_if_no_cloud_node(self):
+ if self.cloud_node is None:
+ self.stop()
+
+
+class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
+ """Actor to shut down a compute node.
+
+ This actor simply destroys a cloud node, retrying as needed.
+ """
+ def __init__(self, timer_actor, cloud_client, cloud_node,
+ retry_wait=1, max_retry_wait=180):
+ super(ComputeNodeShutdownActor, self).__init__(
+ 'arvnodeman.nodedown', timer_actor, retry_wait, max_retry_wait)
+ self._cloud = cloud_client
+ self.cloud_node = cloud_node
+ self._later.shutdown_node()
+
+ @ComputeNodeStateChangeBase._retry(config.CLOUD_ERRORS)
+ def shutdown_node(self):
+ self._cloud.destroy_node(self.cloud_node)
+ self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
+ self._finished()
+
+
+class ComputeNodeUpdateActor(config.actor_class):
+ """Actor to dispatch one-off cloud management requests.
+
+ This actor receives requests for small cloud updates, and
+ dispatches them to a real driver. ComputeNodeMonitorActors use
+ this to perform maintenance tasks on themselves. Having a
+ dedicated actor for this gives us the opportunity to control the
+ flow of requests; e.g., by backing off when errors occur.
+
+ This actor is most like a "traditional" Pykka actor: there's no
+ subscribing, but instead methods return real driver results. If
+ you're interested in those results, you should get them from the
+ Future that the proxy method returns. Be prepared to handle exceptions
+ from the cloud driver when you do.
+ """
+ def __init__(self, cloud_factory, max_retry_wait=180):
+ super(ComputeNodeUpdateActor, self).__init__()
+ self._cloud = cloud_factory()
+ self.max_retry_wait = max_retry_wait
+ self.error_streak = 0
+ self.next_request_time = time.time()
+
+ def _throttle_errors(orig_func):
+ @functools.wraps(orig_func)
+ def wrapper(self, *args, **kwargs):
+ throttle_time = self.next_request_time - time.time()
+ if throttle_time > 0:
+ time.sleep(throttle_time)
+ self.next_request_time = time.time()
+ try:
+ result = orig_func(self, *args, **kwargs)
+ except config.CLOUD_ERRORS:
+ self.error_streak += 1
+ self.next_request_time += min(2 ** self.error_streak,
+ self.max_retry_wait)
+ raise
+ else:
+ self.error_streak = 0
+ return result
+ return wrapper
+
+ @_throttle_errors
+ def sync_node(self, cloud_node, arvados_node):
+ return self._cloud.sync_node(cloud_node, arvados_node)
+
+
+class ShutdownTimer(object):
+ """Keep track of a cloud node's shutdown windows.
+
+ Instantiate this class with a timestamp of when a cloud node started,
+ and a list of durations (in minutes) of when the node must not and may
+ be shut down, alternating. The class will tell you when a shutdown
+ window is open, and when the next open window will start.
+ """
+ def __init__(self, start_time, shutdown_windows):
+ # The implementation is easiest if we have an even number of windows,
+ # because then windows always alternate between open and closed.
+ # Rig that up: calculate the first shutdown window based on what's
+ # passed in. Then, if we were given an odd number of windows, merge
+ # that first window into the last one, since they both represent
+ # "closed" state.
+ first_window = shutdown_windows[0]
+ shutdown_windows = list(shutdown_windows[1:])
+ self._next_opening = start_time + (60 * first_window)
+ if len(shutdown_windows) % 2:
+ shutdown_windows.append(first_window)
+ else:
+ shutdown_windows[-1] += first_window
+ self.shutdown_windows = itertools.cycle([60 * n
+ for n in shutdown_windows])
+ self._open_start = self._next_opening
+ self._open_for = next(self.shutdown_windows)
+
+ def _advance_opening(self):
+ while self._next_opening < time.time():
+ self._open_start = self._next_opening
+ self._next_opening += self._open_for + next(self.shutdown_windows)
+ self._open_for = next(self.shutdown_windows)
+
+ def next_opening(self):
+ self._advance_opening()
+ return self._next_opening
+
+ def window_open(self):
+ self._advance_opening()
+ return 0 < (time.time() - self._open_start) < self._open_for
+
+
+class ComputeNodeMonitorActor(config.actor_class):
+ """Actor to manage a running compute node.
+
+ This actor gets updates about a compute node's cloud and Arvados records.
+ It uses this information to notify subscribers when the node is eligible
+ for shutdown.
+ """
+ def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
+ timer_actor, update_actor, arvados_node=None,
+ poll_stale_after=600, node_stale_after=3600):
+ super(ComputeNodeMonitorActor, self).__init__()
+ self._later = self.actor_ref.proxy()
+ self._logger = logging.getLogger('arvnodeman.computenode')
+ self._last_log = None
+ self._shutdowns = shutdown_timer
+ self._timer = timer_actor
+ self._update = update_actor
+ self.cloud_node = cloud_node
+ self.cloud_node_start_time = cloud_node_start_time
+ self.poll_stale_after = poll_stale_after
+ self.node_stale_after = node_stale_after
+ self.subscribers = set()
+ self.arvados_node = None
+ self._later.update_arvados_node(arvados_node)
+ self.last_shutdown_opening = None
+ self._later.consider_shutdown()
+
+ def subscribe(self, subscriber):
+ self.subscribers.add(subscriber)
+
+ def _debug(self, msg, *args):
+ if msg == self._last_log:
+ return
+ self._last_log = msg
+ self._logger.debug(msg, *args)
+
+ def _shutdown_eligible(self):
+ if self.arvados_node is None:
+ return timestamp_fresh(self.cloud_node_start_time,
+ self.node_stale_after)
+ else:
+ return (timestamp_fresh(arvados_node_mtime(self.arvados_node),
+ self.poll_stale_after) and
+ (self.arvados_node['info'].get('slurm_state') == 'idle'))
+
+ def consider_shutdown(self):
+ next_opening = self._shutdowns.next_opening()
+ if self._shutdowns.window_open():
+ if self._shutdown_eligible():
+ self._debug("Node %s suggesting shutdown.", self.cloud_node.id)
+ _notify_subscribers(self._later, self.subscribers)
+ else:
+ self._debug("Node %s shutdown window open but node busy.",
+ self.cloud_node.id)
+ else:
+ self._debug("Node %s shutdown window closed. Next at %s.",
+ self.cloud_node.id, time.ctime(next_opening))
+ if self.last_shutdown_opening != next_opening:
+ self._timer.schedule(next_opening, self._later.consider_shutdown)
+ self.last_shutdown_opening = next_opening
+
+ def offer_arvados_pair(self, arvados_node):
+ if self.arvados_node is not None:
+ return None
+ elif arvados_node['ip_address'] in self.cloud_node.private_ips:
+ self._later.update_arvados_node(arvados_node)
+ return self.cloud_node.id
+ else:
+ return None
+
+ def update_cloud_node(self, cloud_node):
+ if cloud_node is not None:
+ self.cloud_node = cloud_node
+ self._later.consider_shutdown()
+
+ def update_arvados_node(self, arvados_node):
+ if arvados_node is not None:
+ self.arvados_node = arvados_node
+ new_hostname = arvados_node_fqdn(self.arvados_node)
+ if new_hostname != self.cloud_node.name:
+ self._update.sync_node(self.cloud_node, self.arvados_node)
+ self._later.consider_shutdown()
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+
+import libcloud.compute.providers as cloud_provider
+import libcloud.compute.types as cloud_types
+
+from . import BaseComputeNodeDriver, arvados_node_fqdn
+
+class ComputeNodeDriver(BaseComputeNodeDriver):
+ """Compute node driver wrapper for libcloud's dummy driver.
+
+ This class provides the glue necessary to run the node manager with a
+ dummy cloud. It's useful for testing.
+ """
+ DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.DUMMY)
+ DEFAULT_REAL = DEFAULT_DRIVER('ComputeNodeDriver')
+ DUMMY_START_TIME = time.time()
+
+ def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
+ driver_class=DEFAULT_DRIVER):
+ super(ComputeNodeDriver, self).__init__(
+ auth_kwargs, list_kwargs, create_kwargs, driver_class)
+ if driver_class is self.DEFAULT_DRIVER:
+ self.real = self.DEFAULT_REAL
+
+ def _ensure_private_ip(self, node):
+ if not node.private_ips:
+ node.private_ips = ['10.10.0.{}'.format(node.id)]
+
+ def arvados_create_kwargs(self, arvados_node):
+ return {}
+
+ def list_nodes(self):
+ nodelist = super(ComputeNodeDriver, self).list_nodes()
+ for node in nodelist:
+ self._ensure_private_ip(node)
+ return nodelist
+
+ def create_node(self, size, arvados_node):
+ node = super(ComputeNodeDriver, self).create_node(size, arvados_node)
+ self._ensure_private_ip(node)
+ return node
+
+ def sync_node(self, cloud_node, arvados_node):
+ cloud_node.name = arvados_node_fqdn(arvados_node)
+
+ @classmethod
+ def node_start_time(cls, node):
+ return cls.DUMMY_START_TIME
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+
+import libcloud.compute.base as cloud_base
+import libcloud.compute.providers as cloud_provider
+import libcloud.compute.types as cloud_types
+from libcloud.compute.drivers import ec2 as cloud_ec2
+
+from . import BaseComputeNodeDriver, arvados_node_fqdn
+
+### Monkeypatch libcloud to support AWS' new SecurityGroup API.
+# These classes can be removed when libcloud supports specifying
+# security groups with the SecurityGroupId parameter.
+class ANMEC2Connection(cloud_ec2.EC2Connection):
+ def request(self, *args, **kwargs):
+ params = kwargs.get('params')
+ if (params is not None) and (params.get('Action') == 'RunInstances'):
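+ # Rewrite sketch with hypothetical values: a params entry
+ # 'SecurityGroup.1' -> <group object> is sent instead as
+ # 'SecurityGroupId.1' -> 'sg-12345' (the group's id), the form
+ # the VPC RunInstances API expects.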
+ for key in params.keys():
+ if key.startswith('SecurityGroup.'):
+ new_key = key.replace('Group.', 'GroupId.', 1)
+ params[new_key] = params.pop(key).id
+ kwargs['params'] = params
+ return super(ANMEC2Connection, self).request(*args, **kwargs)
+
+
+class ANMEC2NodeDriver(cloud_ec2.EC2NodeDriver):
+ connectionCls = ANMEC2Connection
+### End monkeypatch
+
+
+class ComputeNodeDriver(BaseComputeNodeDriver):
+ """Compute node driver wrapper for EC2.
+
+ This translates cloud driver requests to EC2's specific parameters.
+ """
+ DEFAULT_DRIVER = ANMEC2NodeDriver
+ SEARCH_CACHE = {}
+
+ def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
+ driver_class=DEFAULT_DRIVER):
+ # We need full lists of keys up front because these loops modify
+ # dictionaries in-place.
+ for key in list_kwargs.keys():
+ list_kwargs[key.replace('_', ':')] = list_kwargs.pop(key)
+ self.tags = {key[4:]: value
+ for key, value in list_kwargs.iteritems()
+ if key.startswith('tag:')}
+ super(ComputeNodeDriver, self).__init__(
+ auth_kwargs, {'ex_filters': list_kwargs}, create_kwargs,
+ driver_class)
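+ # Dispatch each 'Cloud Create' option to a matching _init_<name>
+ # hook when one exists. E.g., with a hypothetical id, image_id =
+ # ami-012345 is popped and stored back as create_kwargs['image'] =
+ # <the matching NodeImage>; hooks that return None (like
+ # _init_ping_host) only record state on self.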
+ for key in self.create_kwargs.keys():
+ init_method = getattr(self, '_init_' + key, None)
+ if init_method is not None:
+ new_pair = init_method(self.create_kwargs.pop(key))
+ if new_pair is not None:
+ self.create_kwargs[new_pair[0]] = new_pair[1]
+
+ def _init_image_id(self, image_id):
+ return 'image', self.search_for(image_id, 'list_images')
+
+ def _init_ping_host(self, ping_host):
+ self.ping_host = ping_host
+
+ def _init_security_groups(self, group_names):
+ return 'ex_security_groups', [
+ self.search_for(gname.strip(), 'ex_get_security_groups')
+ for gname in group_names.split(',')]
+
+ def _init_subnet_id(self, subnet_id):
+ return 'ex_subnet', self.search_for(subnet_id, 'ex_list_subnets')
+
+ def _init_ssh_key(self, filename):
+ with open(filename) as ssh_file:
+ key = cloud_base.NodeAuthSSHKey(ssh_file.read())
+ return 'auth', key
+
+ def arvados_create_kwargs(self, arvados_node):
+ result = {'ex_metadata': self.tags.copy(),
+ 'name': arvados_node_fqdn(arvados_node)}
+ ping_secret = arvados_node['info'].get('ping_secret')
+ if ping_secret is not None:
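+ # Userdata sketch with placeholder values:
+ # https://<ping_host>/arvados/v1/nodes/<uuid>/ping?ping_secret=<secret>
+ # The booting node requests this URL to send its first ping.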
+ ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
+ format(self.ping_host, arvados_node['uuid'],
+ ping_secret))
+ result['ex_userdata'] = ping_url
+ return result
+
+ def sync_node(self, cloud_node, arvados_node):
+ metadata = self.arvados_create_kwargs(arvados_node)
+ tags = metadata['ex_metadata']
+ tags['Name'] = metadata['name']
+ self.real.ex_create_tags(cloud_node, tags)
+
+ @classmethod
+ def node_start_time(cls, node):
+ time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
+ return time.mktime(time.strptime(
+ time_str, '%Y-%m-%dT%H:%M:%S%Z')) - time.timezone
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import ConfigParser
+import importlib
+import logging
+import ssl
+
+import arvados
+import httplib2
+import libcloud.common.types as cloud_types
+import pykka
+from apiclient import errors as apierror
+
+# IOError is the base class for socket.error and friends.
+# It seems like it hits the sweet spot for operations we want to retry:
+# it's low-level, but unlikely to catch code bugs.
+NETWORK_ERRORS = (IOError, ssl.SSLError)
+ARVADOS_ERRORS = NETWORK_ERRORS + (apierror.Error,)
+CLOUD_ERRORS = NETWORK_ERRORS + (cloud_types.LibcloudError,)
+
+actor_class = pykka.ThreadingActor
+
+class NodeManagerConfig(ConfigParser.SafeConfigParser):
+ """Node Manager Configuration class.
+
+ This a standard Python ConfigParser, with additional helper methods to
+ create objects instantiated with configuration information.
+ """
+
+ LOGGING_NONLEVELS = frozenset(['file'])
+
+ def __init__(self, *args, **kwargs):
+ # Can't use super() because SafeConfigParser is an old-style class.
+ ConfigParser.SafeConfigParser.__init__(self, *args, **kwargs)
+ for sec_name, settings in {
+ 'Arvados': {'insecure': 'no',
+ 'timeout': '15'},
+ 'Daemon': {'max_nodes': '1',
+ 'poll_time': '60',
+ 'max_poll_time': '300',
+ 'poll_stale_after': '600',
+ 'node_stale_after': str(60 * 60 * 2)},
+ 'Logging': {'file': '/dev/stderr',
+ 'level': 'WARNING'},
+ }.iteritems():
+ if not self.has_section(sec_name):
+ self.add_section(sec_name)
+ for opt_name, value in settings.iteritems():
+ if not self.has_option(sec_name, opt_name):
+ self.set(sec_name, opt_name, value)
+
+ def get_section(self, section, transformer=None):
+ result = self._dict()
+ for key, value in self.items(section):
+ if transformer is not None:
+ try:
+ value = transformer(value)
+ except (TypeError, ValueError):
+ pass
+ result[key] = value
+ return result
+
+ def log_levels(self):
+ return {key: getattr(logging, self.get('Logging', key).upper())
+ for key in self.options('Logging')
+ if key not in self.LOGGING_NONLEVELS}
+
+ def new_arvados_client(self):
+ if self.has_option('Daemon', 'certs_file'):
+ certs_file = self.get('Daemon', 'certs_file')
+ else:
+ certs_file = None
+ insecure = self.getboolean('Arvados', 'insecure')
+ http = httplib2.Http(timeout=self.getint('Arvados', 'timeout'),
+ ca_certs=certs_file,
+ disable_ssl_certificate_validation=insecure)
+ return arvados.api('v1',
+ cache=False, # Don't reuse an existing client.
+ host=self.get('Arvados', 'host'),
+ token=self.get('Arvados', 'token'),
+ insecure=insecure,
+ http=http)
+
+ def new_cloud_client(self):
+ module = importlib.import_module('arvnodeman.computenode.' +
+ self.get('Cloud', 'provider'))
+ auth_kwargs = self.get_section('Cloud Credentials')
+ if 'timeout' in auth_kwargs:
+ auth_kwargs['timeout'] = int(auth_kwargs['timeout'])
+ return module.ComputeNodeDriver(auth_kwargs,
+ self.get_section('Cloud List'),
+ self.get_section('Cloud Create'))
+
+ def node_sizes(self, all_sizes):
+ size_kwargs = {}
+ for sec_name in self.sections():
+ sec_words = sec_name.split(None, 2)
+ if sec_words[0] != 'Size':
+ continue
+ size_kwargs[sec_words[1]] = self.get_section(sec_name, int)
+ return [(size, size_kwargs[size.id]) for size in all_sizes
+ if size.id in size_kwargs]
+
+ def shutdown_windows(self):
+ return [int(n)
+ for n in self.get('Cloud', 'shutdown_windows').split(',')]
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import functools
+import logging
+import time
+
+import pykka
+
+from . import computenode as cnode
+from .config import actor_class
+
+class _ComputeNodeRecord(object):
+ def __init__(self, actor=None, cloud_node=None, arvados_node=None,
+ assignment_time=float('-inf')):
+ self.actor = actor
+ self.cloud_node = cloud_node
+ self.arvados_node = arvados_node
+ self.assignment_time = assignment_time
+
+
+class _BaseNodeTracker(object):
+ def __init__(self):
+ self.nodes = {}
+ self.orphans = {}
+
+ def __getitem__(self, key):
+ return self.nodes[key]
+
+ def __len__(self):
+ return len(self.nodes)
+
+ def get(self, key, default=None):
+ return self.nodes.get(key, default)
+
+ def record_key(self, record):
+ return self.item_key(getattr(record, self.RECORD_ATTR))
+
+ def add(self, record):
+ self.nodes[self.record_key(record)] = record
+
+ def update_record(self, key, item):
+ setattr(self.nodes[key], self.RECORD_ATTR, item)
+
+ def update_from(self, response):
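+ # Sketch with hypothetical keys: if the tracker holds {1, 2} and
+ # the response items are keyed {2, 3}, record 2 is updated in
+ # place, (3, item) is yielded to the caller as new, and record 1
+ # moves into self.orphans.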
+ unseen = set(self.nodes.iterkeys())
+ for item in response:
+ key = self.item_key(item)
+ if key in unseen:
+ unseen.remove(key)
+ self.update_record(key, item)
+ else:
+ yield key, item
+ self.orphans = {key: self.nodes.pop(key) for key in unseen}
+
+ def unpaired(self):
+ return (record for record in self.nodes.itervalues()
+ if getattr(record, self.PAIR_ATTR) is None)
+
+
+class _CloudNodeTracker(_BaseNodeTracker):
+ RECORD_ATTR = 'cloud_node'
+ PAIR_ATTR = 'arvados_node'
+ item_key = staticmethod(lambda cloud_node: cloud_node.id)
+
+
+class _ArvadosNodeTracker(_BaseNodeTracker):
+ RECORD_ATTR = 'arvados_node'
+ PAIR_ATTR = 'cloud_node'
+ item_key = staticmethod(lambda arvados_node: arvados_node['uuid'])
+
+ def find_stale_node(self, stale_time):
+ for record in self.nodes.itervalues():
+ node = record.arvados_node
+ if (not cnode.timestamp_fresh(cnode.arvados_node_mtime(node),
+ stale_time) and
+ not cnode.timestamp_fresh(record.assignment_time,
+ stale_time)):
+ return node
+ return None
+
+
+class NodeManagerDaemonActor(actor_class):
+ """Node Manager daemon.
+
+ This actor subscribes to all information polls about cloud nodes,
+ Arvados nodes, and the job queue. It creates a ComputeNodeMonitorActor
+ for every cloud node, subscribing them to poll updates
+ appropriately. It creates and destroys cloud nodes based on job queue
+ demand, and stops the corresponding ComputeNode actors when their work
+ is done.
+ """
+ def __init__(self, server_wishlist_actor, arvados_nodes_actor,
+ cloud_nodes_actor, cloud_update_actor, timer_actor,
+ arvados_factory, cloud_factory,
+ shutdown_windows, max_nodes,
+ poll_stale_after=600, node_stale_after=7200,
+ node_setup_class=cnode.ComputeNodeSetupActor,
+ node_shutdown_class=cnode.ComputeNodeShutdownActor,
+ node_actor_class=cnode.ComputeNodeMonitorActor):
+ super(NodeManagerDaemonActor, self).__init__()
+ self._node_setup = node_setup_class
+ self._node_shutdown = node_shutdown_class
+ self._node_actor = node_actor_class
+ self._cloud_updater = cloud_update_actor
+ self._timer = timer_actor
+ self._new_arvados = arvados_factory
+ self._new_cloud = cloud_factory
+ self._cloud_driver = self._new_cloud()
+ self._logger = logging.getLogger('arvnodeman.daemon')
+ self._later = self.actor_ref.proxy()
+ self.shutdown_windows = shutdown_windows
+ self.max_nodes = max_nodes
+ self.poll_stale_after = poll_stale_after
+ self.node_stale_after = node_stale_after
+ self.last_polls = {}
+ for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
+ poll_actor = locals()[poll_name + '_actor']
+ poll_actor.subscribe(getattr(self._later, 'update_' + poll_name))
+ setattr(self, '_{}_actor'.format(poll_name), poll_actor)
+ self.last_polls[poll_name] = -self.poll_stale_after
+ self.cloud_nodes = _CloudNodeTracker()
+ self.arvados_nodes = _ArvadosNodeTracker()
+ self.booting = {} # Actor IDs to ComputeNodeSetupActors
+ self.booted = {} # Cloud node IDs to _ComputeNodeRecords
+ self.shutdowns = {} # Cloud node IDs to ComputeNodeShutdownActors
+ self._logger.debug("Daemon initialized")
+
+ def _update_poll_time(self, poll_key):
+ self.last_polls[poll_key] = time.time()
+
+ def _pair_nodes(self, node_record, arvados_node):
+ self._logger.info("Cloud node %s has associated with Arvados node %s",
+ node_record.cloud_node.id, arvados_node['uuid'])
+ self._arvados_nodes_actor.subscribe_to(
+ arvados_node['uuid'], node_record.actor.update_arvados_node)
+ node_record.arvados_node = arvados_node
+ self.arvados_nodes.add(node_record)
+
+ def _new_node(self, cloud_node):
+ start_time = self._cloud_driver.node_start_time(cloud_node)
+ shutdown_timer = cnode.ShutdownTimer(start_time,
+ self.shutdown_windows)
+ actor = self._node_actor.start(
+ cloud_node=cloud_node,
+ cloud_node_start_time=start_time,
+ shutdown_timer=shutdown_timer,
+ update_actor=self._cloud_updater,
+ timer_actor=self._timer,
+ arvados_node=None,
+ poll_stale_after=self.poll_stale_after,
+ node_stale_after=self.node_stale_after).proxy()
+ actor.subscribe(self._later.node_can_shutdown)
+ self._cloud_nodes_actor.subscribe_to(cloud_node.id,
+ actor.update_cloud_node)
+ record = _ComputeNodeRecord(actor, cloud_node)
+ return record
+
+ def update_cloud_nodes(self, nodelist):
+ self._update_poll_time('cloud_nodes')
+ for key, node in self.cloud_nodes.update_from(nodelist):
+ self._logger.info("Registering new cloud node %s", key)
+ if key in self.booted:
+ record = self.booted.pop(key)
+ else:
+ record = self._new_node(node)
+ self.cloud_nodes.add(record)
+ for arv_rec in self.arvados_nodes.unpaired():
+ if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
+ self._pair_nodes(record, arv_rec.arvados_node)
+ break
+ for key, record in self.cloud_nodes.orphans.iteritems():
+ record.actor.stop()
+ self.shutdowns.pop(key, None)
+
+ def update_arvados_nodes(self, nodelist):
+ self._update_poll_time('arvados_nodes')
+ for key, node in self.arvados_nodes.update_from(nodelist):
+ self._logger.info("Registering new Arvados node %s", key)
+ record = _ComputeNodeRecord(arvados_node=node)
+ self.arvados_nodes.add(record)
+ for arv_rec in self.arvados_nodes.unpaired():
+ arv_node = arv_rec.arvados_node
+ for cloud_rec in self.cloud_nodes.unpaired():
+ if cloud_rec.actor.offer_arvados_pair(arv_node).get():
+ self._pair_nodes(cloud_rec, arv_node)
+ break
+
+ def _node_count(self):
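+ # Booting, booted, and running nodes all count as up; nodes with a
+ # pending shutdown are subtracted. E.g., with hypothetical counts,
+ # 3 running + 1 booted + 1 booting - 2 shutting down = 3 up.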
+ up = sum(len(nodelist) for nodelist in
+ [self.cloud_nodes, self.booted, self.booting])
+ return up - len(self.shutdowns)
+
+ def _nodes_wanted(self):
+ return len(self.last_wishlist) - self._node_count()
+
+ def _nodes_excess(self):
+ return -self._nodes_wanted()
+
+ def update_server_wishlist(self, wishlist):
+ self._update_poll_time('server_wishlist')
+ self.last_wishlist = wishlist[:self.max_nodes]
+ nodes_wanted = self._nodes_wanted()
+ if nodes_wanted > 0:
+ self._later.start_node()
+ elif (nodes_wanted < 0) and self.booting:
+ self._later.stop_booting_node()
+
+ def _check_poll_freshness(orig_func):
+ """Decorator to inhibit a method when poll information is stale.
+
+ This decorator checks the timestamps of all the poll information the
+ daemon has received. The decorated method is only called if none
+ of the timestamps are considered stale.
+ """
+ @functools.wraps(orig_func)
+ def wrapper(self, *args, **kwargs):
+ now = time.time()
+ if all(now - t < self.poll_stale_after
+ for t in self.last_polls.itervalues()):
+ return orig_func(self, *args, **kwargs)
+ else:
+ return None
+ return wrapper
+
+ @_check_poll_freshness
+ def start_node(self):
+ nodes_wanted = self._nodes_wanted()
+ if nodes_wanted < 1:
+ return None
+ arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
+ cloud_size = self.last_wishlist[nodes_wanted - 1]
+ self._logger.info("Want %s more nodes. Booting a %s node.",
+ nodes_wanted, cloud_size.name)
+ new_setup = self._node_setup.start(
+ timer_actor=self._timer,
+ arvados_client=self._new_arvados(),
+ arvados_node=arvados_node,
+ cloud_client=self._new_cloud(),
+ cloud_size=cloud_size).proxy()
+ self.booting[new_setup.actor_ref.actor_urn] = new_setup
+ if arvados_node is not None:
+ self.arvados_nodes[arvados_node['uuid']].assignment_time = (
+ time.time())
+ new_setup.subscribe(self._later.node_up)
+ if nodes_wanted > 1:
+ self._later.start_node()
+
+ def _actor_nodes(self, node_actor):
+ return pykka.get_all([node_actor.cloud_node, node_actor.arvados_node])
+
+ def node_up(self, setup_proxy):
+ cloud_node, arvados_node = self._actor_nodes(setup_proxy)
+ del self.booting[setup_proxy.actor_ref.actor_urn]
+ setup_proxy.stop()
+ record = self.cloud_nodes.get(cloud_node.id)
+ if record is None:
+ record = self._new_node(cloud_node)
+ self.booted[cloud_node.id] = record
+ self._pair_nodes(record, arvados_node)
+
+ @_check_poll_freshness
+ def stop_booting_node(self):
+ nodes_excess = self._nodes_excess()
+ if (nodes_excess < 1) or not self.booting:
+ return None
+ for key, node in self.booting.iteritems():
+ node.stop_if_no_cloud_node().get()
+ if not node.actor_ref.is_alive():
+ del self.booting[key]
+ if nodes_excess > 1:
+ self._later.stop_booting_node()
+ break
+
+ @_check_poll_freshness
+ def node_can_shutdown(self, node_actor):
+ if self._nodes_excess() < 1:
+ return None
+ cloud_node, arvados_node = self._actor_nodes(node_actor)
+ if cloud_node.id in self.shutdowns:
+ return None
+ shutdown = self._node_shutdown.start(timer_actor=self._timer,
+ cloud_client=self._new_cloud(),
+ cloud_node=cloud_node).proxy()
+ self.shutdowns[cloud_node.id] = shutdown
+ shutdown.subscribe(self._later.node_finished_shutdown)
+
+ def node_finished_shutdown(self, shutdown_actor):
+ cloud_node_id = shutdown_actor.cloud_node.get().id
+ shutdown_actor.stop()
+ if cloud_node_id in self.booted:
+ self.booted.pop(cloud_node_id).actor.stop()
+ del self.shutdowns[cloud_node_id]
+
+ def shutdown(self):
+ self._logger.info("Shutting down after signal.")
+ self.poll_stale_after = -1 # Inhibit starting/stopping nodes
+ for bootnode in self.booting.itervalues():
+ bootnode.stop_if_no_cloud_node()
+ self._later.await_shutdown()
+
+ def await_shutdown(self):
+ if any(node.actor_ref.is_alive() for node in self.booting.itervalues()):
+ self._timer.schedule(time.time() + 1, self._later.await_shutdown)
+ else:
+ self.stop()
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import logging
+
+from . import clientactor
+from .config import ARVADOS_ERRORS
+
+class ServerCalculator(object):
+ """Generate cloud server wishlists from an Arvados job queue.
+
+ Instantiate this class with a list of cloud node sizes you're willing to
+ use, plus keyword overrides from the configuration. Then you can pass
+ job queues to servers_for_queue. It will return a list of node sizes
+ that would best satisfy the jobs, choosing the cheapest size that
+ satisfies each job, and ignoring jobs that can't be satisfied.
+ """
+
+ class CloudSizeWrapper(object):
+ def __init__(self, real_size, **kwargs):
+ self.real = real_size
+ for name in ['id', 'name', 'ram', 'disk', 'bandwidth', 'price',
+ 'extra']:
+ setattr(self, name, getattr(self.real, name))
+ self.cores = kwargs.pop('cores')
+ self.scratch = self.disk
+ for name, override in kwargs.iteritems():
+ if not hasattr(self, name):
+ raise ValueError("unrecognized size field '%s'" % (name,))
+ setattr(self, name, override)
+
+ def meets_constraints(self, **kwargs):
+ for name, want_value in kwargs.iteritems():
+ have_value = getattr(self, name)
+ if (have_value != 0) and (have_value < want_value):
+ return False
+ return True
+
+
+ def __init__(self, server_list, max_nodes=None):
+ self.cloud_sizes = [self.CloudSizeWrapper(s, **kws)
+ for s, kws in server_list]
+ self.cloud_sizes.sort(key=lambda s: s.price)
+ self.max_nodes = max_nodes or float('inf')
+ self.logger = logging.getLogger('arvnodeman.jobqueue')
+ self.logged_jobs = set()
+
+ @staticmethod
+ def coerce_int(x, fallback):
+ try:
+ return int(x)
+ except (TypeError, ValueError):
+ return fallback
+
+ def cloud_size_for_constraints(self, constraints):
+ want_value = lambda key: self.coerce_int(constraints.get(key), 0)
+ wants = {'cores': want_value('min_cores_per_node'),
+ 'ram': want_value('min_ram_mb_per_node'),
+ 'scratch': want_value('min_scratch_mb_per_node')}
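+ # E.g., hypothetical constraints {'min_cores_per_node': 2,
+ # 'min_ram_mb_per_node': 4096} yield wants = {'cores': 2,
+ # 'ram': 4096, 'scratch': 0}; the first (and therefore cheapest)
+ # size covering those wants is returned. meets_constraints treats
+ # a size field of 0 as unknown/unlimited, so it never disqualifies
+ # a size.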
+ for size in self.cloud_sizes:
+ if size.meets_constraints(**wants):
+ return size
+ return None
+
+ def servers_for_queue(self, queue):
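+ # E.g., for a hypothetical queue: a job with min_nodes = 3 that
+ # fits the cheapest satisfying size contributes three copies of
+ # that size to the wishlist; an unsatisfiable job contributes
+ # nothing and is logged at debug level only once for as long as
+ # it stays in the queue.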
+ servers = []
+ seen_jobs = set()
+ for job in queue:
+ seen_jobs.add(job['uuid'])
+ constraints = job['runtime_constraints']
+ want_count = self.coerce_int(constraints.get('min_nodes'), 1)
+ cloud_size = self.cloud_size_for_constraints(constraints)
+ if cloud_size is None:
+ if job['uuid'] not in self.logged_jobs:
+ self.logged_jobs.add(job['uuid'])
+ self.logger.debug("job %s not satisfiable", job['uuid'])
+ elif (want_count <= self.max_nodes):
+ servers.extend([cloud_size.real] * max(1, want_count))
+ self.logged_jobs.intersection_update(seen_jobs)
+ return servers
+
+
+class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
+ """Actor to generate server wishlists from the job queue.
+
+ This actor regularly polls Arvados' job queue, and uses the provided
+ ServerCalculator to turn that into a list of requested node sizes. That
+ list is sent to subscribers on every poll.
+ """
+
+ CLIENT_ERRORS = ARVADOS_ERRORS
+ LOGGER_NAME = 'arvnodeman.jobqueue'
+
+ def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
+ super(JobQueueMonitorActor, self).__init__(
+ client, timer_actor, *args, **kwargs)
+ self._calculator = server_calc
+
+ def _send_request(self):
+ return self._client.jobs().queue().execute()['items']
+
+ def _got_response(self, queue):
+ server_list = self._calculator.servers_for_queue(queue)
+ self._logger.debug("Sending server wishlist: %s",
+ ', '.join(s.name for s in server_list) or "(empty)")
+ return super(JobQueueMonitorActor, self)._got_response(server_list)
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import argparse
+import logging
+import signal
+import sys
+import time
+
+import daemon
+import pykka
+
+from . import config as nmconfig
+from .computenode import \
+ ComputeNodeSetupActor, ComputeNodeShutdownActor, ComputeNodeUpdateActor, \
+ ShutdownTimer
+from .daemon import NodeManagerDaemonActor
+from .jobqueue import JobQueueMonitorActor, ServerCalculator
+from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
+from .timedcallback import TimedCallBackActor
+
+node_daemon = None
+
+def abort(msg, code=1):
+ print("arvados-node-manager: " + msg)
+ sys.exit(code)
+
+def parse_cli(args):
+ parser = argparse.ArgumentParser(
+ prog='arvados-node-manager',
+ description="Dynamically allocate Arvados cloud compute nodes")
+ parser.add_argument(
+ '--foreground', action='store_true', default=False,
+ help="Run in the foreground. Don't daemonize.")
+ parser.add_argument(
+ '--config', help="Path to configuration file")
+ return parser.parse_args(args)
+
+def load_config(path):
+ if not path:
+ abort("No --config file specified", 2)
+ config = nmconfig.NodeManagerConfig()
+ try:
+ with open(path) as config_file:
+ config.readfp(config_file)
+ except (IOError, OSError) as error:
+ abort("Error reading configuration file {}: {}".format(path, error))
+ return config
+
+def setup_logging(path, level, **sublevels):
+ handler = logging.FileHandler(path)
+ handler.setFormatter(logging.Formatter(
+ '%(asctime)s %(name)s[%(process)d] %(levelname)s: %(message)s',
+ '%Y-%m-%d %H:%M:%S'))
+ root_logger = logging.getLogger()
+ root_logger.addHandler(handler)
+ root_logger.setLevel(level)
+ for logger_name, sublevel in sublevels.iteritems():
+ sublogger = logging.getLogger(logger_name)
+ sublogger.setLevel(sublevel)
+
+def launch_pollers(config):
+ cloud_client = config.new_cloud_client()
+ arvados_client = config.new_arvados_client()
+ cloud_size_list = config.node_sizes(cloud_client.list_sizes())
+ if not cloud_size_list:
+ abort("No valid node sizes configured")
+
+ server_calculator = ServerCalculator(
+ cloud_size_list, config.getint('Daemon', 'max_nodes'))
+ poll_time = config.getint('Daemon', 'poll_time')
+ max_poll_time = config.getint('Daemon', 'max_poll_time')
+
+ timer = TimedCallBackActor.start(poll_time / 10.0).proxy()
+ cloud_node_poller = CloudNodeListMonitorActor.start(
+ cloud_client, timer, poll_time, max_poll_time).proxy()
+ arvados_node_poller = ArvadosNodeListMonitorActor.start(
+ arvados_client, timer, poll_time, max_poll_time).proxy()
+ job_queue_poller = JobQueueMonitorActor.start(
+ config.new_arvados_client(), timer, server_calculator,
+ poll_time, max_poll_time).proxy()
+ return timer, cloud_node_poller, arvados_node_poller, job_queue_poller
+
+_caught_signals = {}
+def shutdown_signal(signal_code, frame):
+ current_count = _caught_signals.get(signal_code, 0)
+ _caught_signals[signal_code] = current_count + 1
+ if node_daemon is None:
+ pykka.ActorRegistry.stop_all()
+ sys.exit(-signal_code)
+ elif current_count == 0:
+ node_daemon.shutdown()
+ elif current_count == 1:
+ pykka.ActorRegistry.stop_all()
+ else:
+ sys.exit(-signal_code)
+
+def main(args=None):
+ global node_daemon
+ args = parse_cli(args)
+ config = load_config(args.config)
+
+ if not args.foreground:
+ daemon.DaemonContext().open()
+ for sigcode in [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]:
+ signal.signal(sigcode, shutdown_signal)
+
+ setup_logging(config.get('Logging', 'file'), **config.log_levels())
+ timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
+ launch_pollers(config)
+ cloud_node_updater = ComputeNodeUpdateActor.start(
+ config.new_cloud_client).proxy()
+ node_daemon = NodeManagerDaemonActor.start(
+ job_queue_poller, arvados_node_poller, cloud_node_poller,
+ cloud_node_updater, timer,
+ config.new_arvados_client, config.new_cloud_client,
+ config.shutdown_windows(), config.getint('Daemon', 'max_nodes'),
+ config.getint('Daemon', 'poll_stale_after'),
+ config.getint('Daemon', 'node_stale_after')).proxy()
+
+ signal.pause()
+ daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
+ while not daemon_stopped():
+ time.sleep(1)
+ pykka.ActorRegistry.stop_all()
+
+
+if __name__ == '__main__':
+ main()
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+from . import clientactor
+from . import config
+
+class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
+ """Actor to poll the Arvados node list.
+
+ This actor regularly polls the list of Arvados node records, and
+ sends it to subscribers.
+ """
+
+ CLIENT_ERRORS = config.ARVADOS_ERRORS
+ LOGGER_NAME = 'arvnodeman.arvados_nodes'
+
+ def _item_key(self, node):
+ return node['uuid']
+
+ def _send_request(self):
+ return self._client.nodes().list(limit=10000).execute()['items']
+
+
+class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
+ """Actor to poll the cloud node list.
+
+ This actor regularly polls the cloud to get a list of running compute
+ nodes, and sends it to subscribers.
+ """
+
+ CLIENT_ERRORS = config.CLOUD_ERRORS
+ LOGGER_NAME = 'arvnodeman.cloud_nodes'
+
+ def _item_key(self, node):
+ return node.id
+
+ def _send_request(self):
+ return self._client.list_nodes()
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import heapq
+import time
+
+import pykka
+
+from .config import actor_class
+
+class TimedCallBackActor(actor_class):
+ """Send messages to other actors on a schedule.
+
+ Other actors can call the schedule() method to schedule delivery of a
+ message at a later time. This actor runs the necessary event loop for
+ delivery.
+ """
+ def __init__(self, max_sleep=1):
+ super(TimedCallBackActor, self).__init__()
+ self._proxy = self.actor_ref.proxy()
+ self.messages = []
+ self.max_sleep = max_sleep
+
+ def schedule(self, delivery_time, receiver, *args, **kwargs):
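+ # Messages wait in a min-heap ordered by delivery time, so e.g.
+ # (a hypothetical call) schedule(time.time() + 60, proxy.poll)
+ # queues a poll for a minute from now. The first push onto an
+ # empty heap also kicks the deliver() loop below into motion.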
+ if not self.messages:
+ self._proxy.deliver()
+ heapq.heappush(self.messages, (delivery_time, receiver, args, kwargs))
+
+ def deliver(self):
+ if not self.messages:
+ return None
+ til_next = self.messages[0][0] - time.time()
+ if til_next < 0:
+ t, receiver, args, kwargs = heapq.heappop(self.messages)
+ try:
+ receiver(*args, **kwargs)
+ except pykka.ActorDeadError:
+ pass
+ else:
+ time.sleep(min(til_next, self.max_sleep))
+ self._proxy.deliver()
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+from arvnodeman.launcher import main
+main()
--- /dev/null
+# EC2 configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Poll EC2 nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, leave it alone rather than shutting
+# it down.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = ec2
+
+# It's usually most cost-effective to shut down compute nodes during narrow
+# windows of time. For example, EC2 bills each node by the hour, so the best
+# time to shut down a node is right before a new hour of uptime starts.
+# Shutdown windows define these periods of time. These are windows in
+# full minutes, separated by commas. Counting from the time the node is
+# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
+# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
+# For example, "54, 5, 1" means the node may shut down from the 54th to the
+# 59th minute of each hour of uptime.
+# Specify at least two windows. You can add as many as you need beyond that.
+shutdown_windows = 54, 5, 1
+
+[Cloud Credentials]
+key = KEY
+secret = SECRET_KEY
+region = us-east-1
+timeout = 60
+
+[Cloud List]
+# This section defines filters that find compute nodes.
+# Tags that you specify here will automatically be added to nodes you create.
+# Replace colons in Amazon filters with underscores
+# (e.g., write "tag:mytag" as "tag_mytag").
+instance-state-name = running
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+[Cloud Create]
+# New compute nodes will send pings to Arvados at this host.
+# You may specify a port, and use brackets to disambiguate IPv6 addresses.
+ping_host = hostname:port
+
+# Give the name of an SSH key on AWS...
+ex_keyname = string
+
+# ... or a file path for an SSH key that can log in to the compute node.
+# (One or the other, not both.)
+# ssh_key = path
+
+# The EC2 IDs of the image and subnet compute nodes should use.
+image_id = idstring
+subnet_id = idstring
+
+# Comma-separated EC2 IDs for the security group(s) assigned to each
+# compute node.
+security_groups = idstring1, idstring2
+
+[Size t2.medium]
+# You can define any number of Size sections to list EC2 sizes you're
+# willing to use. The Node Manager should boot the cheapest size(s) that
+# can run jobs in the queue (N.B.: defining more than one size has not been
+# tested yet).
+# Each size section MUST define the number of cores it has. You may also
+# want to define the number of mebibytes of scratch space for Crunch jobs.
+# You can also override Amazon's provided data fields by setting the same
+# names here.
+cores = 2
+scratch = 100
\ No newline at end of file
--- /dev/null
+# You can use this configuration to run a development Node Manager for
+# testing. It uses libcloud's dummy driver and your own development API server.
+# When new cloud nodes are created, you'll need to simulate the ping that
+# they send to the Arvados API server. The easiest way I've found to do that
+# is through the API server Rails console: load the Node object, set its
+# IP address to 10.10.0.N (where N is the cloud node's ID), and save.
+
+[Daemon]
+max_nodes = 8
+poll_time = 15
+max_poll_time = 60
+poll_stale_after = 600
+node_stale_after = 300
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+level = DEBUG
+pykka = DEBUG
+apiclient = WARNING
+
+[Arvados]
+host = localhost:3030
+# This is the token for the test fixture's admin user.
+token = 4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h
+insecure = yes
+timeout = 15
+
+[Cloud]
+provider = dummy
+shutdown_windows = 1, 1
+timeout = 15
+
+[Cloud Credentials]
+creds = dummycreds
+
+[Cloud List]
+[Cloud Create]
+
+[Size 2]
+cores = 4
+scratch = 1234
--- /dev/null
+#!/usr/bin/env python
+
+import os
+import subprocess
+import time
+
+from setuptools import setup, find_packages
+
+SETUP_DIR = os.path.dirname(__file__) or "."
+cmd_opts = {'egg_info': {}}
+try:
+ git_tags = subprocess.check_output(
+ ['git', 'log', '--first-parent', '--max-count=1',
+ '--format=format:%ct %h', SETUP_DIR]).split()
+ assert len(git_tags) == 2
+except (AssertionError, OSError, subprocess.CalledProcessError):
+ pass
+else:
+ git_tags[0] = time.strftime('%Y%m%d%H%M%S', time.gmtime(int(git_tags[0])))
+ cmd_opts['egg_info']['tag_build'] = '.{}.{}'.format(*git_tags)
+
+setup(name='arvados-node-manager',
+ version='0.1',
+ description='Arvados compute node manager',
+ long_description=open(os.path.join(SETUP_DIR, 'README.rst')).read(),
+ author='Arvados',
+ author_email='info@arvados.org',
+ url="https://arvados.org",
+ license='GNU Affero General Public License, version 3.0',
+ packages=find_packages(),
+ install_requires=[
+ 'apache-libcloud',
+ 'arvados-python-client',
+ 'pykka',
+ 'python-daemon',
+ ],
+ scripts=['bin/arvados-node-manager'],
+ test_suite='tests',
+ tests_require=['mock>=1.0'],
+ zip_safe=False,
+ options=cmd_opts,
+ )
--- /dev/null
+#!/usr/bin/env python
+
+import logging
+import os
+
+# Set the ANMTEST_LOGLEVEL environment variable to enable logging at that level.
+loglevel = os.environ.get('ANMTEST_LOGLEVEL', 'CRITICAL')
+logging.basicConfig(level=getattr(logging, loglevel.upper()))
+
+# Set the ANM_TIMEOUT environment variable to the maximum amount of time to
+# wait for tested actors to respond to important messages. The default value
+# is very conservative, because a small value may produce false negatives on
+# slower systems. If you're debugging a known timeout issue, however, you may
+# want to set this lower to speed up tests.
+pykka_timeout = int(os.environ.get('ANMTEST_TIMEOUT', '10'))
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import unittest
+
+import mock
+import pykka
+
+import arvnodeman.clientactor as clientactor
+from . import testutil
+
+class RemotePollLoopActorTestCase(testutil.RemotePollLoopActorTestMixin,
+ unittest.TestCase):
+ class MockClientError(Exception):
+ pass
+
+ class TestActor(clientactor.RemotePollLoopActor):
+ LOGGER_NAME = 'arvnodeman.testpoll'
+
+ def _send_request(self):
+ return self._client()
+ TestActor.CLIENT_ERRORS = (MockClientError,)
+ TEST_CLASS = TestActor
+
+
+ def build_monitor(self, side_effect, *args, **kwargs):
+ super(RemotePollLoopActorTestCase, self).build_monitor(*args, **kwargs)
+ self.client.side_effect = side_effect
+
+ def test_poll_loop_starts_after_subscription(self):
+ self.build_monitor(['test1'])
+ self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
+ self.stop_proxy(self.monitor)
+ self.subscriber.assert_called_with('test1')
+ self.assertTrue(self.timer.schedule.called)
+
+ def test_poll_loop_continues_after_failure(self):
+ self.build_monitor(self.MockClientError)
+ self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
+ self.assertTrue(self.stop_proxy(self.monitor),
+ "poll loop died after error")
+ self.assertTrue(self.timer.schedule.called,
+ "poll loop did not reschedule after error")
+ self.assertFalse(self.subscriber.called,
+ "poll loop notified subscribers after error")
+
+ def test_late_subscribers_get_responses(self):
+ self.build_monitor(['pre_late_test', 'late_test'])
+ self.monitor.subscribe(lambda response: None).get(self.TIMEOUT)
+ self.monitor.subscribe(self.subscriber)
+ self.monitor.poll().get(self.TIMEOUT)
+ self.stop_proxy(self.monitor)
+ self.subscriber.assert_called_with('late_test')
+
+ def test_survive_dead_subscriptions(self):
+ self.build_monitor(['survive1', 'survive2'])
+ dead_subscriber = mock.Mock(name='dead_subscriber')
+ dead_subscriber.side_effect = pykka.ActorDeadError
+ self.monitor.subscribe(dead_subscriber)
+ self.monitor.subscribe(self.subscriber)
+ self.monitor.poll().get(self.TIMEOUT)
+ self.assertTrue(self.stop_proxy(self.monitor),
+ "poll loop died from dead subscriber")
+ self.subscriber.assert_called_with('survive2')
+
+ def check_poll_timers(self, *test_times):
+ schedule_mock = self.timer.schedule
+ last_expect = None
+ with mock.patch('time.time') as time_mock:
+ for fake_time, expect_next in test_times:
+ time_mock.return_value = fake_time
+ self.monitor.poll(last_expect).get(self.TIMEOUT)
+ self.assertTrue(schedule_mock.called)
+ self.assertEqual(expect_next, schedule_mock.call_args[0][0])
+ schedule_mock.reset_mock()
+ last_expect = expect_next
+
+ def test_poll_timing_on_consecutive_successes_with_drift(self):
+ self.build_monitor(['1', '2'], poll_wait=3, max_poll_wait=14)
+ self.check_poll_timers((0, 3), (4, 6))
+
+ def test_poll_backoff_on_failures(self):
+ self.build_monitor(self.MockClientError, poll_wait=3, max_poll_wait=14)
+ self.check_poll_timers((0, 6), (6, 18), (18, 32))
+
+ def test_poll_timing_after_error_recovery(self):
+ self.build_monitor(['a', self.MockClientError(), 'b'],
+ poll_wait=3, max_poll_wait=14)
+ self.check_poll_timers((0, 3), (4, 10), (10, 13))
+
+ def test_no_subscriptions_by_key_without_support(self):
+ self.build_monitor([])
+ with self.assertRaises(AttributeError):
+ self.monitor.subscribe_to('key')
+
+
+class RemotePollLoopActorWithKeysTestCase(testutil.RemotePollLoopActorTestMixin,
+ unittest.TestCase):
+ class TestActor(RemotePollLoopActorTestCase.TestActor):
+ def _item_key(self, item):
+ return item['key']
+ TEST_CLASS = TestActor
+
+
+ def build_monitor(self, side_effect, *args, **kwargs):
+ super(RemotePollLoopActorWithKeysTestCase, self).build_monitor(
+ *args, **kwargs)
+ self.client.side_effect = side_effect
+
+ def test_key_subscription(self):
+ self.build_monitor([[{'key': 1}, {'key': 2}]])
+ self.monitor.subscribe_to(2, self.subscriber).get(self.TIMEOUT)
+ self.stop_proxy(self.monitor)
+ self.subscriber.assert_called_with({'key': 2})
+
+ def test_survive_dead_key_subscriptions(self):
+ item = {'key': 3}
+ self.build_monitor([[item], [item]])
+ dead_subscriber = mock.Mock(name='dead_subscriber')
+ dead_subscriber.side_effect = pykka.ActorDeadError
+ self.monitor.subscribe_to(3, dead_subscriber)
+ self.monitor.subscribe_to(3, self.subscriber)
+ self.monitor.poll().get(self.TIMEOUT)
+ self.assertTrue(self.stop_proxy(self.monitor),
+ "poll loop died from dead key subscriber")
+ self.subscriber.assert_called_with(item)
+
+ def test_mixed_subscriptions(self):
+ item = {'key': 4}
+ self.build_monitor([[item], [item]])
+ key_subscriber = mock.Mock(name='key_subscriber')
+ self.monitor.subscribe(self.subscriber)
+ self.monitor.subscribe_to(4, key_subscriber)
+ self.monitor.poll().get(self.TIMEOUT)
+ self.stop_proxy(self.monitor)
+ self.subscriber.assert_called_with([item])
+ key_subscriber.assert_called_with(item)
+
+ def test_subscription_to_missing_key(self):
+ self.build_monitor([[]])
+ self.monitor.subscribe_to('nonesuch', self.subscriber).get(self.TIMEOUT)
+ self.stop_proxy(self.monitor)
+ self.subscriber.assert_called_with(None)
+
+
+if __name__ == '__main__':
+ unittest.main()
+
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+import unittest
+
+import arvados.errors as arverror
+import httplib2
+import mock
+import pykka
+
+import arvnodeman.computenode as cnode
+from . import testutil
+
+class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
+ def make_mocks(self, arvados_effect=None, cloud_effect=None):
+ if arvados_effect is None:
+ arvados_effect = [testutil.arvados_node_mock()]
+ self.arvados_effect = arvados_effect
+ self.timer = testutil.MockTimer()
+ self.api_client = mock.MagicMock(name='api_client')
+ self.api_client.nodes().create().execute.side_effect = arvados_effect
+ self.api_client.nodes().update().execute.side_effect = arvados_effect
+ self.cloud_client = mock.MagicMock(name='cloud_client')
+ self.cloud_client.create_node.return_value = testutil.cloud_node_mock(1)
+
+ def make_actor(self, arv_node=None):
+ if not hasattr(self, 'timer'):
+ self.make_mocks(arvados_effect=[arv_node])
+ self.setup_actor = cnode.ComputeNodeSetupActor.start(
+ self.timer, self.api_client, self.cloud_client,
+ testutil.MockSize(1), arv_node).proxy()
+
+ def test_creation_without_arvados_node(self):
+ self.make_actor()
+ self.assertEqual(self.arvados_effect[-1],
+ self.setup_actor.arvados_node.get(self.TIMEOUT))
+ self.assertTrue(self.api_client.nodes().create().execute.called)
+ self.assertEqual(self.cloud_client.create_node(),
+ self.setup_actor.cloud_node.get(self.TIMEOUT))
+
+ def test_creation_with_arvados_node(self):
+ self.make_actor(testutil.arvados_node_mock())
+ self.assertEqual(self.arvados_effect[-1],
+ self.setup_actor.arvados_node.get(self.TIMEOUT))
+ self.assertTrue(self.api_client.nodes().update().execute.called)
+ self.assertEqual(self.cloud_client.create_node(),
+ self.setup_actor.cloud_node.get(self.TIMEOUT))
+
+ def test_failed_calls_retried(self):
+ self.make_mocks([
+ arverror.ApiError(httplib2.Response({'status': '500'}), ""),
+ testutil.arvados_node_mock(),
+ ])
+ self.make_actor()
+ self.wait_for_assignment(self.setup_actor, 'cloud_node')
+
+ def test_stop_when_no_cloud_node(self):
+ self.make_mocks(
+ arverror.ApiError(httplib2.Response({'status': '500'}), ""))
+ self.make_actor()
+ self.setup_actor.stop_if_no_cloud_node()
+ self.assertTrue(
+ self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))
+
+ def test_no_stop_when_cloud_node(self):
+ self.make_actor()
+ self.wait_for_assignment(self.setup_actor, 'cloud_node')
+ self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT)
+ self.assertTrue(self.stop_proxy(self.setup_actor),
+ "actor was stopped by stop_if_no_cloud_node")
+
+ def test_subscribe(self):
+ self.make_mocks(
+ arverror.ApiError(httplib2.Response({'status': '500'}), ""))
+ self.make_actor()
+ subscriber = mock.Mock(name='subscriber_mock')
+ self.setup_actor.subscribe(subscriber)
+ self.api_client.nodes().create().execute.side_effect = [
+ testutil.arvados_node_mock()]
+ self.wait_for_assignment(self.setup_actor, 'cloud_node')
+ self.assertEqual(self.setup_actor.actor_ref.actor_urn,
+ subscriber.call_args[0][0].actor_ref.actor_urn)
+
+ def test_late_subscribe(self):
+ self.make_actor()
+ subscriber = mock.Mock(name='subscriber_mock')
+ self.wait_for_assignment(self.setup_actor, 'cloud_node')
+ self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
+ self.stop_proxy(self.setup_actor)
+ self.assertEqual(self.setup_actor.actor_ref.actor_urn,
+ subscriber.call_args[0][0].actor_ref.actor_urn)
+
+
+class ComputeNodeShutdownActorTestCase(testutil.ActorTestMixin,
+ unittest.TestCase):
+ def make_mocks(self, cloud_node=None):
+ self.timer = testutil.MockTimer()
+ self.cloud_client = mock.MagicMock(name='cloud_client')
+ if cloud_node is None:
+ cloud_node = testutil.cloud_node_mock()
+ self.cloud_node = cloud_node
+
+ def make_actor(self, arv_node=None):
+ if not hasattr(self, 'timer'):
+ self.make_mocks()
+ self.shutdown_actor = cnode.ComputeNodeShutdownActor.start(
+ self.timer, self.cloud_client, self.cloud_node).proxy()
+
+ def test_easy_shutdown(self):
+ self.make_actor()
+ self.shutdown_actor.cloud_node.get(self.TIMEOUT)
+ self.stop_proxy(self.shutdown_actor)
+ self.assertTrue(self.cloud_client.destroy_node.called)
+
+ def test_late_subscribe(self):
+ self.make_actor()
+ subscriber = mock.Mock(name='subscriber_mock')
+ self.shutdown_actor.subscribe(subscriber).get(self.TIMEOUT)
+ self.stop_proxy(self.shutdown_actor)
+ self.assertEqual(self.shutdown_actor.actor_ref.actor_urn,
+ subscriber.call_args[0][0].actor_ref.actor_urn)
+
+
+class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
+ unittest.TestCase):
+ def make_actor(self):
+ self.driver = mock.MagicMock(name='driver_mock')
+ self.updater = cnode.ComputeNodeUpdateActor.start(self.driver).proxy()
+
+ def test_node_sync(self):
+ self.make_actor()
+ cloud_node = testutil.cloud_node_mock()
+ arv_node = testutil.arvados_node_mock()
+ self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+ self.driver().sync_node.assert_called_with(cloud_node, arv_node)
+
+
+@mock.patch('time.time', return_value=1)
+class ShutdownTimerTestCase(unittest.TestCase):
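+ # The window list gives alternating closed/open durations in
+ # minutes.  With time.time() pinned at 1 and windows [8, 2], the
+ # first opening comes at 1 + 8*60 = 481, the window stays open for
+ # 2*60 = 120 seconds, and the cycle repeats every (8+2)*60 = 600
+ # seconds.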
+ def test_two_length_window(self, time_mock):
+ timer = cnode.ShutdownTimer(time_mock.return_value, [8, 2])
+ self.assertEqual(481, timer.next_opening())
+ self.assertFalse(timer.window_open())
+ time_mock.return_value += 500
+ self.assertEqual(1081, timer.next_opening())
+ self.assertTrue(timer.window_open())
+ time_mock.return_value += 200
+ self.assertEqual(1081, timer.next_opening())
+ self.assertFalse(timer.window_open())
+
+ def test_three_length_window(self, time_mock):
+ timer = cnode.ShutdownTimer(time_mock.return_value, [6, 3, 1])
+ self.assertEqual(361, timer.next_opening())
+ self.assertFalse(timer.window_open())
+ time_mock.return_value += 400
+ self.assertEqual(961, timer.next_opening())
+ self.assertTrue(timer.window_open())
+ time_mock.return_value += 200
+ self.assertEqual(961, timer.next_opening())
+ self.assertFalse(timer.window_open())
+
+
+class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
+ unittest.TestCase):
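+ # Stand-in for the shutdown timer: _set_state pins window_open()
+ # and next_opening() to fixed values so tests don't depend on
+ # clock arithmetic.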
+ class MockShutdownTimer(object):
+ def _set_state(self, is_open, next_opening):
+ self.window_open = lambda: is_open
+ self.next_opening = lambda: next_opening
+
+
+ def make_mocks(self, node_num):
+ self.shutdowns = self.MockShutdownTimer()
+ self.shutdowns._set_state(False, 300)
+ self.timer = mock.MagicMock(name='timer_mock')
+ self.updates = mock.MagicMock(name='update_mock')
+ self.cloud_mock = testutil.cloud_node_mock(node_num)
+ self.subscriber = mock.Mock(name='subscriber_mock')
+
+ def make_actor(self, node_num=1, arv_node=None, start_time=None):
+ if not hasattr(self, 'cloud_mock'):
+ self.make_mocks(node_num)
+ if start_time is None:
+ start_time = time.time()
+ self.node_actor = cnode.ComputeNodeMonitorActor.start(
+ self.cloud_mock, start_time, self.shutdowns, self.timer,
+ self.updates, arv_node).proxy()
+ self.subscription = self.node_actor.subscribe(self.subscriber)
+
+ def test_init_shutdown_scheduling(self):
+ self.make_actor()
+ self.subscription.get(self.TIMEOUT)
+ self.assertTrue(self.timer.schedule.called)
+ self.assertEqual(300, self.timer.schedule.call_args[0][0])
+
+ def test_shutdown_subscription(self):
+ self.make_actor()
+ self.shutdowns._set_state(True, 600)
+ self.node_actor.consider_shutdown().get(self.TIMEOUT)
+ self.assertTrue(self.subscriber.called)
+ self.assertEqual(self.node_actor.actor_ref.actor_urn,
+ self.subscriber.call_args[0][0].actor_ref.actor_urn)
+
+ def test_shutdown_without_arvados_node(self):
+ self.make_actor()
+ self.shutdowns._set_state(True, 600)
+ self.node_actor.consider_shutdown().get(self.TIMEOUT)
+ self.assertTrue(self.subscriber.called)
+
+ def test_no_shutdown_without_arvados_node_and_old_cloud_node(self):
+ self.make_actor(start_time=0)
+ self.shutdowns._set_state(True, 600)
+ self.node_actor.consider_shutdown().get(self.TIMEOUT)
+ self.assertFalse(self.subscriber.called)
+
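+ # Assert that consider_shutdown() declines to shut the node down:
+ # it must reschedule the next check (at schedule_time, when given)
+ # and must not notify subscribers.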
+ def check_shutdown_rescheduled(self, window_open, next_window,
+ schedule_time=None):
+ self.shutdowns._set_state(window_open, next_window)
+ self.timer.schedule.reset_mock()
+ self.node_actor.consider_shutdown().get(self.TIMEOUT)
+ self.stop_proxy(self.node_actor)
+ self.assertTrue(self.timer.schedule.called)
+ if schedule_time is not None:
+ self.assertEqual(schedule_time, self.timer.schedule.call_args[0][0])
+ self.assertFalse(self.subscriber.called)
+
+ def test_shutdown_window_close_scheduling(self):
+ self.make_actor()
+ self.check_shutdown_rescheduled(False, 600, 600)
+
+ def test_no_shutdown_when_node_running_job(self):
+ self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
+ self.check_shutdown_rescheduled(True, 600)
+
+ def test_no_shutdown_when_node_state_unknown(self):
+ self.make_actor(5, testutil.arvados_node_mock(5, info={}))
+ self.check_shutdown_rescheduled(True, 600)
+
+ def test_no_shutdown_when_node_state_stale(self):
+ self.make_actor(6, testutil.arvados_node_mock(6, age=900))
+ self.check_shutdown_rescheduled(True, 600)
+
+ def test_arvados_node_match(self):
+ self.make_actor(2)
+ arv_node = testutil.arvados_node_mock(
+ 2, hostname='compute-two.zzzzz.arvadosapi.com')
+ pair_id = self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)
+ self.assertEqual(self.cloud_mock.id, pair_id)
+ self.stop_proxy(self.node_actor)
+ self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)
+
+ def test_arvados_node_mismatch(self):
+ self.make_actor(3)
+ arv_node = testutil.arvados_node_mock(1)
+ self.assertIsNone(
+ self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
+
+ def test_update_cloud_node(self):
+ self.make_actor(1)
+ self.make_mocks(2)
+ self.cloud_mock.id = '1'
+ self.node_actor.update_cloud_node(self.cloud_mock)
+ current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
+ self.assertEqual([testutil.ip_address_mock(2)],
+ current_cloud.private_ips)
+
+ def test_missing_cloud_node_update(self):
+ self.make_actor(1)
+ self.node_actor.update_cloud_node(None)
+ current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
+ self.assertEqual([testutil.ip_address_mock(1)],
+ current_cloud.private_ips)
+
+ def test_update_arvados_node(self):
+ self.make_actor(3)
+ job_uuid = 'zzzzz-jjjjj-updatejobnode00'
+ new_arvados = testutil.arvados_node_mock(3, job_uuid)
+ self.node_actor.update_arvados_node(new_arvados)
+ current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
+ self.assertEqual(job_uuid, current_arvados['job_uuid'])
+
+ def test_missing_arvados_node_update(self):
+ self.make_actor(4, testutil.arvados_node_mock(4))
+ self.node_actor.update_arvados_node(None)
+ current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
+ self.assertEqual(testutil.ip_address_mock(4),
+ current_arvados['ip_address'])
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+import unittest
+
+import mock
+
+import arvnodeman.computenode.ec2 as ec2
+from . import testutil
+
+class EC2ComputeNodeDriverTestCase(unittest.TestCase):
+ def setUp(self):
+ self.driver_mock = mock.MagicMock(name='driver_mock')
+
+ def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
+ # Copy create_kwargs so the shared mutable default isn't mutated
+ # across tests.  '100::' is the IPv6 discard prefix (RFC 6666),
+ # so the ping host can never be a real machine.
+ create_kwargs = dict(create_kwargs)
+ create_kwargs.setdefault('ping_host', '100::')
+ return ec2.ComputeNodeDriver(
+ auth_kwargs, list_kwargs, create_kwargs,
+ driver_class=self.driver_mock)
+
+ def test_driver_instantiation(self):
+ kwargs = {'key': 'testkey'}
+ driver = self.new_driver(auth_kwargs=kwargs)
+ self.assertTrue(self.driver_mock.called)
+ self.assertEqual(kwargs, self.driver_mock.call_args[1])
+
+ def test_list_kwargs_become_filters(self):
+ # We're also testing tag name translation.
+ driver = self.new_driver(list_kwargs={'tag_test': 'true'})
+ driver.list_nodes()
+ list_method = self.driver_mock().list_nodes
+ self.assertTrue(list_method.called)
+ self.assertEqual({'tag:test': 'true'},
+ list_method.call_args[1].get('ex_filters'))
+
+ def test_create_location_loaded_at_initialization(self):
+ kwargs = {'location': 'testregion'}
+ driver = self.new_driver(create_kwargs=kwargs)
+ self.assertTrue(self.driver_mock().list_locations.called)
+
+ def test_create_image_loaded_at_initialization(self):
+ kwargs = {'image': 'testimage'}
+ driver = self.new_driver(create_kwargs=kwargs)
+ self.assertTrue(self.driver_mock().list_images.called)
+
+ def test_create_includes_ping_secret(self):
+ arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
+ driver = self.new_driver()
+ driver.create_node(testutil.MockSize(1), arv_node)
+ create_method = self.driver_mock().create_node
+ self.assertTrue(create_method.called)
+ self.assertIn('ping_secret=ssshh',
+ create_method.call_args[1].get('ex_userdata',
+ 'arg missing'))
+
+ def test_tags_created_from_arvados_node(self):
+ arv_node = testutil.arvados_node_mock(8)
+ driver = self.new_driver(list_kwargs={'tag:list': 'test'})
+ self.assertEqual({'ex_metadata': {'list': 'test'},
+ 'name': 'compute8.zzzzz.arvadosapi.com'},
+ driver.arvados_create_kwargs(arv_node))
+
+ def test_tags_set_default_hostname_from_new_arvados_node(self):
+ arv_node = testutil.arvados_node_mock(hostname=None)
+ driver = self.new_driver()
+ actual = driver.arvados_create_kwargs(arv_node)
+ self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
+ actual['name'])
+
+ def test_sync_node(self):
+ arv_node = testutil.arvados_node_mock(1)
+ cloud_node = testutil.cloud_node_mock(2)
+ driver = self.new_driver()
+ driver.sync_node(cloud_node, arv_node)
+ tag_mock = self.driver_mock().ex_create_tags
+ self.assertTrue(tag_mock.called)
+ self.assertEqual('compute1.zzzzz.arvadosapi.com',
+ tag_mock.call_args[0][1].get('Name', 'no name'))
+
+ def test_node_create_time(self):
+ refsecs = int(time.time())
+ reftuple = time.gmtime(refsecs)
+ node = testutil.cloud_node_mock()
+ node.extra = {'launch_time': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
+ reftuple)}
+ self.assertEqual(refsecs, ec2.ComputeNodeDriver.node_start_time(node))
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import io
+import logging
+import unittest
+
+import arvnodeman.config as nmconfig
+
+class NodeManagerConfigTestCase(unittest.TestCase):
+ TEST_CONFIG = u"""
+[Cloud]
+provider = dummy
+shutdown_windows = 52, 6, 2
+
+[Cloud Credentials]
+creds = dummy_creds
+
+[Cloud List]
+[Cloud Create]
+
+[Size 1]
+cores = 1
+
+[Logging]
+file = /dev/null
+level = DEBUG
+testlogger = INFO
+"""
+
+ def load_config(self, config=None, config_str=None):
+ if config is None:
+ config = nmconfig.NodeManagerConfig()
+ if config_str is None:
+ config_str = self.TEST_CONFIG
+ with io.StringIO(config_str) as config_fp:
+ config.readfp(config_fp)
+ return config
+
+ def test_seeded_defaults(self):
+ config = nmconfig.NodeManagerConfig()
+ sec_names = set(config.sections())
+ self.assertIn('Arvados', sec_names)
+ self.assertIn('Daemon', sec_names)
+ self.assertFalse(any(name.startswith('Size ') for name in sec_names))
+
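+ # 'provider = dummy' selects libcloud's dummy driver; its first
+ # size has id 1 and name 'Small', which the [Size 1] section above
+ # annotates with cores = 1.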
+ def test_list_sizes(self):
+ config = self.load_config()
+ client = config.new_cloud_client()
+ sizes = config.node_sizes(client.list_sizes())
+ self.assertEqual(1, len(sizes))
+ size, kwargs = sizes[0]
+ self.assertEqual('Small', size.name)
+ self.assertEqual(1, kwargs['cores'])
+
+ def test_shutdown_windows(self):
+ config = self.load_config()
+ self.assertEqual([52, 6, 2], config.shutdown_windows())
+
+ def test_log_levels(self):
+ config = self.load_config()
+ self.assertEqual({'level': logging.DEBUG,
+ 'testlogger': logging.INFO},
+ config.log_levels())
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+import unittest
+
+import mock
+
+import arvnodeman.daemon as nmdaemon
+from . import testutil
+
+class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
+ unittest.TestCase):
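+ # Start a NodeManagerDaemonActor wired entirely to mocks: pollers,
+ # actor factories for setup/shutdown/monitoring, and a synchronous
+ # MockTimer.  Pass None for cloud_nodes, arvados_nodes, or
+ # want_sizes to skip priming the daemon with that initial update.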
+ def make_daemon(self, cloud_nodes=[], arvados_nodes=[], want_sizes=[]):
+ for name in ['cloud_nodes', 'arvados_nodes', 'server_wishlist']:
+ setattr(self, name + '_poller', mock.MagicMock(name=name + '_mock'))
+ self.arv_factory = mock.MagicMock(name='arvados_mock')
+ self.cloud_factory = mock.MagicMock(name='cloud_mock')
+ self.cloud_factory().node_start_time.return_value = time.time()
+ self.cloud_updates = mock.MagicMock(name='updates_mock')
+ self.timer = testutil.MockTimer()
+ self.node_factory = mock.MagicMock(name='factory_mock')
+ self.node_setup = mock.MagicMock(name='setup_mock')
+ self.node_shutdown = mock.MagicMock(name='shutdown_mock')
+ self.daemon = nmdaemon.NodeManagerDaemonActor.start(
+ self.server_wishlist_poller, self.arvados_nodes_poller,
+ self.cloud_nodes_poller, self.cloud_updates, self.timer,
+ self.arv_factory, self.cloud_factory,
+ [54, 5, 1], 8, 600, 3600,
+ self.node_setup, self.node_shutdown, self.node_factory).proxy()
+ if cloud_nodes is not None:
+ self.daemon.update_cloud_nodes(cloud_nodes).get(self.TIMEOUT)
+ if arvados_nodes is not None:
+ self.daemon.update_arvados_nodes(arvados_nodes).get(self.TIMEOUT)
+ if want_sizes is not None:
+ self.daemon.update_server_wishlist(want_sizes).get(self.TIMEOUT)
+
+ def test_easy_node_creation(self):
+ size = testutil.MockSize(1)
+ self.make_daemon(want_sizes=[size])
+ self.stop_proxy(self.daemon)
+ self.assertTrue(self.node_setup.start.called)
+
+ def test_node_pairing(self):
+ cloud_node = testutil.cloud_node_mock(1)
+ arv_node = testutil.arvados_node_mock(1)
+ self.make_daemon([cloud_node], [arv_node])
+ self.stop_proxy(self.daemon)
+ self.node_factory.start().proxy().offer_arvados_pair.assert_called_with(
+ arv_node)
+
+ def test_node_pairing_after_arvados_update(self):
+ cloud_node = testutil.cloud_node_mock(2)
+ arv_node = testutil.arvados_node_mock(2, ip_address=None)
+ self.make_daemon([cloud_node], None)
+ pair_func = self.node_factory.start().proxy().offer_arvados_pair
+ pair_func().get.return_value = None
+ self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
+ pair_func.assert_called_with(arv_node)
+
+ pair_func().get.return_value = cloud_node.id
+ pair_func.reset_mock()
+ arv_node = testutil.arvados_node_mock(2)
+ self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
+ pair_func.assert_called_with(arv_node)
+
+ def test_old_arvados_node_not_double_assigned(self):
+ arv_node = testutil.arvados_node_mock(3, age=9000)
+ size = testutil.MockSize(3)
+ self.make_daemon(arvados_nodes=[arv_node])
+ setup_ref = self.node_setup.start().proxy().actor_ref
+ setup_ref.actor_urn = 0
+ self.node_setup.start.reset_mock()
+ self.daemon.update_server_wishlist([size]).get(self.TIMEOUT)
+ self.daemon.max_nodes.get(self.TIMEOUT)
+ setup_ref.actor_urn += 1
+ self.daemon.update_server_wishlist([size, size]).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ used_nodes = [call[1].get('arvados_node')
+ for call in self.node_setup.start.call_args_list]
+ self.assertEqual(2, len(used_nodes))
+ self.assertIn(arv_node, used_nodes)
+ self.assertIn(None, used_nodes)
+
+ def test_node_count_satisfied(self):
+ self.make_daemon([testutil.cloud_node_mock()],
+ want_sizes=[testutil.MockSize(1)])
+ self.stop_proxy(self.daemon)
+ self.assertFalse(self.node_setup.start.called)
+
+ def test_booting_nodes_counted(self):
+ cloud_node = testutil.cloud_node_mock(1)
+ arv_node = testutil.arvados_node_mock(1)
+ server_wishlist = [testutil.MockSize(1)] * 2
+ self.make_daemon([cloud_node], [arv_node], server_wishlist)
+ self.daemon.max_nodes.get(self.TIMEOUT)
+ self.assertTrue(self.node_setup.start.called)
+ self.daemon.update_server_wishlist(server_wishlist).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertEqual(1, self.node_setup.start.call_count)
+
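+ # Fake the proxy of a finished ComputeNodeSetupActor: its
+ # cloud_node/arvados_node .get() calls return the given mocks, and
+ # node_setup is reset so later call counts only reflect new boots.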
+ def mock_setup_actor(self, cloud_node, arv_node):
+ setup = mock.MagicMock(name='setup_node_mock')
+ setup.actor_ref = self.node_setup.start().proxy().actor_ref
+ self.node_setup.reset_mock()
+ setup.actor_urn = cloud_node.id
+ setup.cloud_node.get.return_value = cloud_node
+ setup.arvados_node.get.return_value = arv_node
+ return setup
+
+ def start_node_boot(self, cloud_node=None, arv_node=None, id_num=1):
+ if cloud_node is None:
+ cloud_node = testutil.cloud_node_mock(id_num)
+ if arv_node is None:
+ arv_node = testutil.arvados_node_mock(id_num)
+ self.make_daemon(want_sizes=[testutil.MockSize(id_num)])
+ self.daemon.max_nodes.get(self.TIMEOUT)
+ self.assertEqual(1, self.node_setup.start.call_count)
+ return self.mock_setup_actor(cloud_node, arv_node)
+
+ def test_no_duplication_when_booting_node_listed_fast(self):
+ # Test that we don't start two ComputeNodeMonitorActors when
+ # we learn about a booting node through a listing before we
+ # get the "node up" message from CloudNodeSetupActor.
+ cloud_node = testutil.cloud_node_mock(1)
+ setup = self.start_node_boot(cloud_node)
+ self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
+ self.assertTrue(self.node_factory.start.called)
+ self.daemon.node_up(setup).get(self.TIMEOUT)
+ self.assertEqual(1, self.node_factory.start.call_count)
+
+ def test_no_duplication_when_booted_node_listed(self):
+ cloud_node = testutil.cloud_node_mock(2)
+ setup = self.start_node_boot(cloud_node, id_num=2)
+ self.daemon.node_up(setup)
+ self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
+ self.assertEqual(1, self.node_factory.start.call_count)
+
+ def test_node_counted_after_boot_with_slow_listing(self):
+ # Test that, after we boot a compute node, we assume it exists
+ # even if it doesn't appear in the listing (e.g., because of
+ # delays propagating tags).
+ setup = self.start_node_boot()
+ self.daemon.node_up(setup).get(self.TIMEOUT)
+ self.assertTrue(self.node_factory.start.called,
+ "daemon not monitoring booted node")
+ self.daemon.update_cloud_nodes([])
+ self.stop_proxy(self.daemon)
+ self.assertEqual(1, self.node_factory.start.call_count,
+ "daemon has duplicate monitors for booted node")
+ self.assertFalse(self.node_factory.start().proxy().stop.called,
+ "daemon prematurely stopped monitoring a new node")
+
+ def test_booted_unlisted_node_counted(self):
+ setup = self.start_node_boot(id_num=1)
+ self.daemon.node_up(setup)
+ self.daemon.update_server_wishlist(
+ [testutil.MockSize(1)]).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertFalse(self.node_setup.start.called,
+ "daemon did not count booted node toward wishlist")
+
+ def test_booted_node_can_shutdown(self):
+ setup = self.start_node_boot()
+ self.daemon.node_up(setup)
+ self.daemon.update_server_wishlist([])
+ self.daemon.node_can_shutdown(
+ self.node_factory.start().proxy()).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertTrue(self.node_shutdown.start.called,
+ "daemon did not shut down booted node on offer")
+
+ def test_booted_node_lifecycle(self):
+ cloud_node = testutil.cloud_node_mock(6)
+ setup = self.start_node_boot(cloud_node, id_num=6)
+ monitor = self.node_factory.start().proxy()
+ monitor.cloud_node.get.return_value = cloud_node
+ self.daemon.node_up(setup)
+ self.daemon.update_server_wishlist([])
+ self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
+ self.assertTrue(self.node_shutdown.start.called,
+ "daemon did not shut down booted node on offer")
+ shutdown = self.node_shutdown.start().proxy()
+ shutdown.cloud_node.get.return_value = cloud_node
+ self.daemon.node_finished_shutdown(shutdown).get(self.TIMEOUT)
+ self.assertTrue(shutdown.stop.called,
+ "shutdown actor not stopped after finishing")
+ self.assertTrue(monitor.stop.called,
+ "monitor for booted node not stopped after shutdown")
+ self.daemon.update_server_wishlist(
+ [testutil.MockSize(2)]).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertTrue(self.node_setup.start.called,
+ "second node not started after booted node stopped")
+
+ def test_booting_nodes_shut_down(self):
+ self.make_daemon(want_sizes=[testutil.MockSize(1)])
+ self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertTrue(
+ self.node_setup.start().proxy().stop_if_no_cloud_node.called)
+
+ def test_shutdown_declined_at_wishlist_capacity(self):
+ cloud_node = testutil.cloud_node_mock(1)
+ size = testutil.MockSize(1)
+ self.make_daemon(cloud_nodes=[cloud_node], want_sizes=[size])
+ self.daemon.node_can_shutdown(
+ self.node_factory.start().proxy()).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertFalse(self.node_shutdown.start.called)
+
+ def test_shutdown_accepted_below_capacity(self):
+ self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
+ node_actor = self.node_factory.start().proxy()
+ self.daemon.node_can_shutdown(node_actor).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertTrue(self.node_shutdown.start.called)
+
+ def test_clean_shutdown_waits_for_node_setup_finish(self):
+ self.make_daemon(want_sizes=[testutil.MockSize(1)])
+ self.daemon.max_nodes.get(self.TIMEOUT)
+ self.assertTrue(self.node_setup.start.called)
+ new_node = self.node_setup.start().proxy()
+ self.daemon.shutdown().get(self.TIMEOUT)
+ self.assertTrue(new_node.stop_if_no_cloud_node.called)
+ self.daemon.node_up(new_node).get(self.TIMEOUT)
+ self.assertTrue(new_node.stop.called)
+ self.assertTrue(
+ self.daemon.actor_ref.actor_stopped.wait(self.TIMEOUT))
+
+ def test_wishlist_ignored_after_shutdown(self):
+ size = testutil.MockSize(2)
+ self.make_daemon(want_sizes=[size])
+ self.daemon.shutdown().get(self.TIMEOUT)
+ self.daemon.update_server_wishlist([size] * 2).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertEqual(1, self.node_setup.start.call_count)
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import unittest
+
+import arvnodeman.jobqueue as jobqueue
+from . import testutil
+
+class ServerCalculatorTestCase(unittest.TestCase):
+ def make_calculator(self, factors, **kwargs):
+ return jobqueue.ServerCalculator(
+ [(testutil.MockSize(n), {'cores': n}) for n in factors], **kwargs)
+
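+ # Build one fake queued job per constraint dict and return the
+ # server list the calculator wants for that queue.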
+ def calculate(self, servcalc, *constraints):
+ return servcalc.servers_for_queue(
+ [{'uuid': 'zzzzz-jjjjj-{:015x}'.format(index),
+ 'runtime_constraints': cdict}
+ for index, cdict in enumerate(constraints)])
+
+ def test_empty_queue_needs_no_servers(self):
+ servcalc = self.make_calculator([1])
+ self.assertEqual([], servcalc.servers_for_queue([]))
+
+ def test_easy_server_count(self):
+ servcalc = self.make_calculator([1])
+ servlist = self.calculate(servcalc, {'min_nodes': 3})
+ self.assertEqual(3, len(servlist))
+
+ def test_implicit_server_count(self):
+ servcalc = self.make_calculator([1])
+ servlist = self.calculate(servcalc, {}, {'min_nodes': 3})
+ self.assertEqual(4, len(servlist))
+
+ def test_bad_min_nodes_override(self):
+ servcalc = self.make_calculator([1])
+ servlist = self.calculate(servcalc,
+ {'min_nodes': -2}, {'min_nodes': 'foo'})
+ self.assertEqual(2, len(servlist))
+
+ def test_ignore_unsatisfiable_jobs(self):
+ servcalc = self.make_calculator([1], max_nodes=9)
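+ # MockSize(1) has 1 core, 128 MB RAM, and disk 100, and
+ # min_nodes=12 exceeds max_nodes=9, so only the min_nodes=6 job
+ # below is satisfiable.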
+ servlist = self.calculate(servcalc,
+ {'min_cores_per_node': 2},
+ {'min_ram_mb_per_node': 256},
+ {'min_nodes': 6},
+ {'min_nodes': 12},
+ {'min_scratch_mb_per_node': 200})
+ self.assertEqual(6, len(servlist))
+
+
+class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
+ unittest.TestCase):
+ TEST_CLASS = jobqueue.JobQueueMonitorActor
+
+ class MockCalculator(object):
+ @staticmethod
+ def servers_for_queue(queue):
+ return [testutil.MockSize(n) for n in queue]
+
+
+ def build_monitor(self, side_effect, *args, **kwargs):
+ super(JobQueueMonitorActorTestCase, self).build_monitor(*args, **kwargs)
+ self.client.jobs().queue().execute.side_effect = side_effect
+
+ def test_subscribers_get_server_lists(self):
+ self.build_monitor([{'items': [1, 2]}], self.MockCalculator())
+ self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
+ self.stop_proxy(self.monitor)
+ self.subscriber.assert_called_with([testutil.MockSize(1),
+ testutil.MockSize(2)])
+
+
+if __name__ == '__main__':
+ unittest.main()
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import unittest
+
+import arvnodeman.nodelist as nodelist
+from . import testutil
+
+class ArvadosNodeListMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
+ unittest.TestCase):
+ TEST_CLASS = nodelist.ArvadosNodeListMonitorActor
+
+ def build_monitor(self, side_effect, *args, **kwargs):
+ super(ArvadosNodeListMonitorActorTestCase, self).build_monitor(
+ *args, **kwargs)
+ self.client.nodes().list().execute.side_effect = side_effect
+
+ def test_uuid_is_subscription_key(self):
+ node = testutil.arvados_node_mock()
+ self.build_monitor([{'items': [node]}])
+ self.monitor.subscribe_to(node['uuid'],
+ self.subscriber).get(self.TIMEOUT)
+ self.stop_proxy(self.monitor)
+ self.subscriber.assert_called_with(node)
+
+
+class CloudNodeListMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
+ unittest.TestCase):
+ TEST_CLASS = nodelist.CloudNodeListMonitorActor
+
+ class MockNode(object):
+ def __init__(self, count):
+ self.id = str(count)
+ self.name = 'test{}.example.com'.format(count)
+ self.private_ips = ['10.0.0.{}'.format(count)]
+ self.public_ips = []
+ self.size = None
+ self.state = 0
+
+
+ def build_monitor(self, side_effect, *args, **kwargs):
+ super(CloudNodeListMonitorActorTestCase, self).build_monitor(
+ *args, **kwargs)
+ self.client.list_nodes.side_effect = side_effect
+
+ def test_id_is_subscription_key(self):
+ node = self.MockNode(1)
+ self.build_monitor([[node]])
+ self.monitor.subscribe_to('1', self.subscriber).get(self.TIMEOUT)
+ self.stop_proxy(self.monitor)
+ self.subscriber.assert_called_with(node)
+
+
+if __name__ == '__main__':
+ unittest.main()
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+import unittest
+
+import mock
+import pykka
+
+import arvnodeman.timedcallback as timedcallback
+from . import testutil
+
+@testutil.no_sleep
+class TimedCallBackActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
+ def test_immediate_turnaround(self):
+ receiver = mock.Mock()
+ deliverer = timedcallback.TimedCallBackActor.start().proxy()
+ deliverer.schedule(time.time() - 1, receiver,
+ 'immediate').get(self.TIMEOUT)
+ self.stop_proxy(deliverer)
+ receiver.assert_called_with('immediate')
+
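+ # The far-future 'failure' deliveries below double as barriers:
+ # .get() returns once the actor has processed the schedule request
+ # (and everything queued before it), while 'failure' itself should
+ # never fire during the test.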
+ def test_delayed_turnaround(self):
+ receiver = mock.Mock()
+ with mock.patch('time.time', return_value=0) as mock_now:
+ deliverer = timedcallback.TimedCallBackActor.start().proxy()
+ deliverer.schedule(1, receiver, 'delayed')
+ deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
+ self.assertFalse(receiver.called)
+ mock_now.return_value = 2
+ deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
+ self.stop_proxy(deliverer)
+ receiver.assert_called_with('delayed')
+
+ def test_out_of_order_scheduling(self):
+ receiver = mock.Mock()
+ with mock.patch('time.time', return_value=1.5) as mock_now:
+ deliverer = timedcallback.TimedCallBackActor.start().proxy()
+ deliverer.schedule(2, receiver, 'second')
+ deliverer.schedule(1, receiver, 'first')
+ deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
+ receiver.assert_called_with('first')
+ mock_now.return_value = 2.5
+ deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
+ self.stop_proxy(deliverer)
+ receiver.assert_called_with('second')
+
+ def test_dead_actors_ignored(self):
+ receiver = mock.Mock(name='dead_actor', spec=pykka.ActorRef)
+ receiver.tell.side_effect = pykka.ActorDeadError
+ deliverer = timedcallback.TimedCallBackActor.start().proxy()
+ deliverer.schedule(time.time() - 1, receiver.tell,
+ 'error').get(self.TIMEOUT)
+ self.assertTrue(self.stop_proxy(deliverer), "deliverer died")
+ receiver.tell.assert_called_with('error')
+
+
+if __name__ == '__main__':
+ unittest.main()
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+
+import mock
+import pykka
+
+from . import pykka_timeout
+
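+# Decorator (applied to whole test classes) that replaces time.sleep
+# with a no-op so polling code doesn't actually wait.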
+no_sleep = mock.patch('time.sleep', lambda n: None)
+
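+# Build a dict shaped like an Arvados node record.  job_uuid=True
+# substitutes a canned job UUID, and age backdates modified_at by that
+# many seconds.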
+def arvados_node_mock(node_num=99, job_uuid=None, age=0, **kwargs):
+ if job_uuid is True:
+ job_uuid = 'zzzzz-jjjjj-jobjobjobjobjob'
+ slurm_state = 'idle' if (job_uuid is None) else 'alloc'
+ node = {'uuid': 'zzzzz-yyyyy-{:015x}'.format(node_num),
+ 'created_at': '2014-01-01T01:02:03Z',
+ 'modified_at': time.strftime('%Y-%m-%dT%H:%M:%SZ',
+ time.gmtime(time.time() - age)),
+ 'hostname': 'compute{}'.format(node_num),
+ 'domain': 'zzzzz.arvadosapi.com',
+ 'ip_address': ip_address_mock(node_num),
+ 'job_uuid': job_uuid,
+ 'info': {'slurm_state': slurm_state}}
+ node.update(kwargs)
+ return node
+
+def cloud_node_mock(node_num=99):
+ node = mock.NonCallableMagicMock(
+ ['id', 'name', 'state', 'public_ips', 'private_ips', 'driver', 'size',
+ 'image', 'extra'],
+ name='cloud_node')
+ node.id = str(node_num)
+ node.name = node.id
+ node.public_ips = []
+ node.private_ips = [ip_address_mock(node_num)]
+ return node
+
+def ip_address_mock(last_octet):
+ return '10.20.30.{}'.format(last_octet)
+
+class MockSize(object):
+ def __init__(self, factor):
+ self.id = 'z{}.test'.format(factor)
+ self.name = self.id
+ self.ram = 128 * factor
+ self.disk = 100 * factor
+ self.bandwidth = 16 * factor
+ self.price = float(factor)
+ self.extra = {}
+
+ def __eq__(self, other):
+ return self.id == other.id
+
+ def __ne__(self, other):
+ return not self == other
+
+
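+# Synchronous stand-in for the timed-callback actor: schedule()
+# ignores want_time and invokes the callback immediately, so retry
+# loops finish without real delays.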
+class MockTimer(object):
+ def schedule(self, want_time, callback, *args, **kwargs):
+ return callback(*args, **kwargs)
+
+
+class ActorTestMixin(object):
+ FUTURE_CLASS = pykka.ThreadingFuture
+ TIMEOUT = pykka_timeout
+
+ def tearDown(self):
+ pykka.ActorRegistry.stop_all()
+
+ def stop_proxy(self, proxy):
+ return proxy.actor_ref.stop(timeout=self.TIMEOUT)
+
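+ # Poll a proxy attribute until it is no longer `unassigned`,
+ # failing the test if the deadline passes first.  The default
+ # timeout binds the class attribute at method-definition time.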
+ def wait_for_assignment(self, proxy, attr_name, unassigned=None,
+ timeout=TIMEOUT):
+ deadline = time.time() + timeout
+ while True:
+ loop_timeout = deadline - time.time()
+ if loop_timeout <= 0:
+ self.fail("actor did not assign {} in time".format(attr_name))
+ result = getattr(proxy, attr_name).get(loop_timeout)
+ if result is not unassigned:
+ return result
+
+
+class RemotePollLoopActorTestMixin(ActorTestMixin):
+ def build_monitor(self, *args, **kwargs):
+ self.timer = mock.MagicMock(name='timer_mock')
+ self.client = mock.MagicMock(name='client_mock')
+ self.subscriber = mock.Mock(name='subscriber_mock')
+ self.monitor = self.TEST_CLASS.start(
+ self.client, self.timer, *args, **kwargs).proxy()