10111: Refactored the graph creation code to minimize the number of API calls needed.
[arvados.git] / apps / workbench / app / helpers / provenance_helper.rb
# Builds Graphviz "dot" descriptions of provenance data (jobs, container
# requests, collections and provenance links) and renders them to SVG.
module ProvenanceHelper

  # Accumulates dot source for the records in +pdata+.
  #
  # +pdata+ is a Hash of records indexed by UUID (or by "component_..." keys
  # for pipeline components).  +opts+ is a Hash of rendering options; the
  # keys read by this class are :request, :pips, :combine_jobs, :direction,
  # :all_script_parameters, :no_docker and :script_version_nodes.
  class GenerateGraph
    # Fill colors (hex, without the leading '#') indexed by the value passed
    # to determine_fillcolor.
    FILL_COLORS = %w(666666 669966 666699 666666 996666).freeze
    DEFAULT_FILL_COLOR = '666666'.freeze

    def initialize(pdata, opts)
      @pdata = pdata
      @opts = opts
      @visited = {}   # UUIDs already expanded by generate_provenance_edges
      @jobs = {}      # graph node id => list of job records merged into it
      @node_extra = {}
    end

    # Returns the locator string for +uuid+ with hints stripped, or nil when
    # Keep::Locator cannot parse it.
    def self.collection_uuid(uuid)
      Keep::Locator.parse(uuid).andand.strip_hints.andand.to_s
    end

    # Builds a URL for the routing parameters +u+, taking host, port and
    # protocol from the request stored in @opts[:request] (+u+ may override
    # them).
    def url_for(u)
      request = @opts[:request]
      params = { :host => request.host,
                 :port => request.port,
                 :protocol => request.protocol }.merge(u)
      Rails.application.routes.url_helpers.url_for(params)
    end

    # Returns the dot style attributes for a node.  +n+ indexes FILL_COLORS;
    # nil or an out-of-range index falls back to the default gray.
    def determine_fillcolor(n)
      fillcolor = FILL_COLORS[n || 0] || DEFAULT_FILL_COLOR
      "style=\"filled\",color=\"#ffffff\",fillcolor=\"##{fillcolor}\",fontcolor=\"#ffffff\""
    end

    # Returns the dot statement declaring the node for +uuid+.
    #
    # Collections (recognized by locator syntax or resource class) always
    # link to their "show" page.  Other nodes honor these +describe_opts+:
    # :label, :shape, :pip and :href ({controller:, id:}).
    def describe_node(uuid, describe_opts={})
      bgcolor = determine_fillcolor(describe_opts[:pip] || @opts[:pips].andand[uuid])

      rsc = ArvadosBase::resource_class_for_uuid uuid
      # Escape the node id the same way edge() does so that declarations and
      # edges always refer to the same node.
      node_id = encode_quotes(uuid)

      if GenerateGraph::collection_uuid(uuid) || rsc == Collection
        if Collection.is_empty_blob_locator? uuid.to_s
          # special case
          return "\"#{node_id}\" [label=\"(empty collection)\"];\n"
        end

        href = url_for({:controller => Collection.to_s.tableize,
                        :action => :show,
                        :id => uuid.to_s })
        label = describe_opts[:label] || (@pdata[uuid] && @pdata[uuid][:name]) || uuid

        "\"#{node_id}\" [label=\"#{encode_quotes(label)}\",shape=box,href=\"#{href}\",#{bgcolor}];\n"
      else
        href = ""
        if describe_opts[:href]
          target = url_for({:controller => describe_opts[:href][:controller],
                            :action => :show,
                            :id => describe_opts[:href][:id] })
          href = ",href=\"#{target}\""
        end
        label = describe_opts[:label] || uuid
        shape = describe_opts[:shape] || 'box'

        "\"#{node_id}\" [label=\"#{encode_quotes(label)}\",#{bgcolor},shape=#{shape}#{href}];\n"
      end
    end

    # Returns the graph node id for +job+ and records the job under that id
    # in @jobs.  With @opts[:combine_jobs] set to :script_only or
    # :script_and_version, jobs sharing the same script (and version) and
    # script parameters map to the same id; otherwise the job UUID is used.
    def job_uuid(job)
      d = Digest::MD5.hexdigest(job[:script_parameters].to_json)
      uuid = case @opts[:combine_jobs]
             when :script_only
               "#{job[:script]}_#{d}"
             when :script_and_version
               "#{job[:script]}_#{job[:script_version]}_#{d}"
             else
               "#{job[:uuid]}"
             end

      members = (@jobs[uuid] ||= [])
      members << job unless members.include? job

      uuid
    end

    # Returns the dot statement for an edge from +tail+ to +head+ (reversed
    # when @opts[:direction] is :bottom_up).  +extra+ holds edge attributes.
    def edge(tail, head, extra)
      from, to = tail, head
      from, to = head, tail if @opts[:direction] == :bottom_up

      gr = "\"#{encode_quotes from}\" -> \"#{encode_quotes to}\""

      unless extra.empty?
        # Each attribute keeps its trailing comma; dot accepts that form.
        attrs = extra.map { |k, v| "#{k}=\"#{encode_quotes v}\"," }.join
        gr += " [#{attrs}]"
      end

      gr + ";\n"
    end

    # When @opts[:all_script_parameters] is set, returns one box node per
    # script parameter value in +sp+ plus an edge from it to +uuid+, labelled
    # with the parameter name.  Returns "" otherwise.
    def script_param_edges(uuid, sp)
      return "" unless @opts[:all_script_parameters]

      gr = ""
      sp.each do |k, v|
        if v.is_a?(Array) || v.is_a?(Hash)
          # "\l" is dot's left-justified line break.
          encv = JSON.pretty_generate(v).gsub("\n", "\\l") + "\\l"
        else
          encv = v.to_json
        end
        gr += "\"#{encode_quotes encv}\" [shape=box];\n"
        gr += edge(encv, uuid, {:label => k})
      end
      gr
    end

    # Returns the portable data hashes found in the mounts of container
    # request +cr+.  The content of the "/var/lib/cwl/cwl.input.json" mount
    # is searched when present, otherwise the whole mounts structure.
    def cr_input_pdhs(cr)
      pdhs = []
      input_obj = cr[:mounts].andand[:"/var/lib/cwl/cwl.input.json"].andand[:content] || cr[:mounts]
      if input_obj
        ProvenanceHelper::find_collections input_obj do |col_hash, _col_uuid, _key|
          pdhs << col_hash if col_hash
        end
      end
      pdhs
    end

    # Returns dot source connecting +job+ to its inputs (collections found in
    # its script parameters), docker image, script version, output and log.
    # +edge_opts+ accepts :no_output and :no_log to skip those two.
    def job_edges(job, edge_opts={})
      uuid = job_uuid(job)
      gr = ""

      ProvenanceHelper::find_collections job[:script_parameters] do |collection_hash, collection_uuid, key|
        input = collection_uuid || collection_hash
        gr += describe_node(input)
        gr += edge(input, uuid, {:label => key})
      end

      if job[:docker_image_locator] && !@opts[:no_docker]
        image_label = job[:runtime_constraints].andand[:docker_image] || job[:docker_image_locator]
        gr += describe_node(job[:docker_image_locator], {label: image_label})
        gr += edge(job[:docker_image_locator], uuid, {label: "docker_image"})
      end

      if @opts[:script_version_nodes]
        gr += describe_node(job[:script_version], {:label => "git:#{job[:script_version]}"})
        gr += edge(job[:script_version], uuid, {:label => "script_version"})
      end

      if job[:output] && !edge_opts[:no_output]
        gr += describe_node(job[:output])
        gr += edge(uuid, job[:output], {label: "output" })
      end

      if job[:log] && !edge_opts[:no_log]
        gr += describe_node(job[:log])
        gr += edge(uuid, job[:log], {label: "log"})
      end

      gr
    end

    # Returns dot source for the record stored in @pdata under +uuid+ and,
    # recursively, for everything reachable through "provenance" links whose
    # head is +uuid+.  Returns "" for blank, unknown or already visited
    # UUIDs.
    def generate_provenance_edges(uuid)
      m = GenerateGraph::collection_uuid(uuid)
      uuid = m if m

      return "" if uuid.nil? || uuid.empty? || @visited[uuid]
      return "" if @pdata[uuid].nil?
      @visited[uuid] = true

      gr = if uuid.start_with? "component_"
             component_edges(uuid)
           else
             resource_edges(uuid)
           end

      gr + provenance_link_edges(uuid)
    end

    # Returns one dot node per entry recorded by job_uuid, linking to the
    # jobs index filtered by the UUIDs of the jobs merged into that node.
    def describe_jobs
      gr = ""
      @jobs.each do |k, v|
        href = url_for({:controller => Job.to_s.tableize,
                        :action => :index })

        # OR together the pip values of all jobs merged into this node.
        n = 0
        v.each do |u|
          n |= @opts[:pips][u[:uuid]] if @opts[:pips] && @opts[:pips][u[:uuid]]
        end
        query = v.map { |u| "uuid%5b%5d=#{u[:uuid]}" }.join(";")

        label = "#{v[0][:script]}"
        if label == "run-command" && v[0][:script_parameters][:command].is_a?(Array)
          label = v[0][:script_parameters][:command].join(' ')
        end
        label += "\\n#{v[0][:finished_at]}" unless @opts[:combine_jobs]

        gr += "\"#{encode_quotes k}\" [href=\"#{href}?#{query}\",label=\"#{encode_quotes label}\",#{determine_fillcolor n}];\n"
      end
      gr
    end

    # Escapes double quotes and newlines so +value+ can be placed inside a
    # double-quoted dot string.
    def encode_quotes(value)
      value.to_s.gsub("\"", "\\\"").gsub("\n", "\\n")
    end

    private

    # Dot source for the pipeline component stored under +uuid+: its job
    # (with inputs) and its output collection.
    def component_edges(uuid)
      gr = ""

      # Pipeline component inputs
      job = @pdata[@pdata[uuid][:job].andand[:uuid]]
      if job
        gr += describe_node(job_uuid(job), {label: uuid[38..-1],
                                            pip: @opts[:pips].andand[job[:uuid]],
                                            shape: "oval",
                                            href: {controller: 'jobs', id: job[:uuid]}})
        gr += job_edges job, {no_output: true, no_log: true}
      end

      # Pipeline component output
      outuuid = @pdata[uuid][:output_uuid]
      if outuuid
        outcollection = @pdata[outuuid]
        if outcollection
          # Without a job record there is no node to draw the edge from.
          if job
            gr += edge(job_uuid(job), outcollection[:portable_data_hash], {label: "output"})
          end
          gr += describe_node(outcollection[:portable_data_hash], {label: outcollection[:name]})
        end
      elsif job && job[:output]
        gr += describe_node(job[:output])
        gr += edge(job_uuid(job), job[:output], {label: "output" })
      end

      gr
    end

    # Dot source for a non-component record, dispatched on its resource
    # class.  Only jobs and container requests produce output here.
    def resource_edges(uuid)
      rsc = ArvadosBase::resource_class_for_uuid uuid
      record = @pdata[uuid]

      if rsc == Job
        record ? job_edges(record) : ""
      elsif rsc == ContainerRequest
        record ? container_request_edges(record) : ""
      else
        ""
      end
    end

    # Dot source for container request +cr+, its child requests, and their
    # input and output collections.
    def container_request_edges(cr)
      gr = ""
      child_cr_nodes = []
      col_uuids = []
      col_pdhs = []
      col_uuids << cr[:output_uuid] if cr[:output_uuid]
      col_pdhs += cr_input_pdhs(cr)

      # Search for child CRs.  A request without a container has none, so
      # the list simply stays empty.
      if cr[:container_uuid]
        child_crs = ContainerRequest.where(requesting_container_uuid: cr[:container_uuid])
        child_crs.each do |child|
          col_uuids << child[:output_uuid] if child[:output_uuid]
          col_pdhs += cr_input_pdhs(child)
        end
        child_cr_nodes = child_crs.results
      end

      output_cols = {} # Indexed by UUID
      input_cols = {} # Indexed by PDH

      # Batch requests to get all related collections, skipping the request
      # entirely when there is nothing to look up.
      uuid_filter = col_uuids.uniq
      unless uuid_filter.empty?
        Collection.filter([['uuid', 'in', uuid_filter]]).each do |c|
          output_cols[c[:uuid]] = c
        end
      end
      output_pdhs = output_cols.values.map { |c| c[:portable_data_hash] }.uniq
      pdh_filter = (col_pdhs - output_pdhs).uniq
      unless pdh_filter.empty?
        Collection.filter([['portable_data_hash', 'in', pdh_filter]]).each do |c|
          (input_cols[c[:portable_data_hash]] ||= []) << c
        end
      end

      # Make the graph
      visited_pdhs = []
      all_cr_nodes = [cr] + child_cr_nodes

      # First pass: add the CR nodes with their outputs, because they're
      # referenced by UUID.
      all_cr_nodes.each do |cr_node|
        # CR node
        gr += describe_node(cr_node[:uuid], {href: {controller: 'container_requests',
                                                    id: cr_node[:uuid]},
                                             label: cr_node[:name],
                                             shape: 'oval'})
        # Connect child CRs with the main one
        if cr_node != cr
          gr += edge(cr_node[:uuid], cr[:uuid], {label: 'child'})
        end
        # Output collection node
        c = cr_node[:output_uuid] && output_cols[cr_node[:output_uuid]]
        if c
          visited_pdhs << c[:portable_data_hash]
          gr += describe_node(c[:portable_data_hash], {label: c[:name]})
          gr += edge(cr_node[:uuid], c[:portable_data_hash], {label: 'output'})
        end
      end

      # Second pass: add the input collection nodes.
      all_cr_nodes.each do |cr_node|
        cr_input_pdhs(cr_node).each do |pdh|
          unless visited_pdhs.include?(pdh)
            visited_pdhs << pdh
            gr += describe_node(pdh, {label: input_collection_label(pdh, input_cols[pdh], cr_node)})
          end
          gr += edge(pdh, cr_node[:uuid], {label: 'input'})
        end
      end

      gr
    end

    # Label for an input collection node: the name of a collection with this
    # PDH, preferring collections owned by the same owner as +cr_node+, with
    # a "+ N others" suffix when several names match.  Falls back to the PDH
    # when +candidates+ is nil (no collection found).
    def input_collection_label(pdh, candidates, cr_node)
      return pdh unless candidates

      # First search for collections within the CR project
      cols = candidates.select { |x| x[:owner_uuid] == cr_node[:owner_uuid] }
      # Otherwise accept any collection with this PDH
      cols = candidates if cols.empty?

      names = cols.map { |x| x[:name] }.uniq
      input_name = names.first
      if names.length > 1
        input_name += " + #{names.length - 1} others"
      end
      input_name
    end

    # Dot source for every "provenance" link in @pdata whose head is +uuid+,
    # followed recursively by the graph of each link's tail.
    def provenance_link_edges(uuid)
      gr = ""
      @pdata.each do |_k, link|
        next unless link[:head_uuid] == uuid.to_s && link[:link_class] == "provenance"

        href = url_for({:controller => Link.to_s.tableize,
                        :action => :show,
                        :id => link[:uuid] })

        gr += describe_node(link[:tail_uuid])
        gr += edge(link[:head_uuid], link[:tail_uuid], {:label => link[:name], :href => href})
        gr += generate_provenance_edges(link[:tail_uuid])
      end
      gr
    end
  end

  # Renders the provenance graph for +pdata+ as an SVG string.
  #
  # +pdata+ is a Hash of records indexed by UUID, or an Array /
  # ArvadosResourceList of records (entries without a :uuid are dropped).
  # +svgId+ becomes the id attribute of the <svg> element.  +opts+ is passed
  # to GenerateGraph; :direction and :only_components are also read here.
  #
  # Raises RuntimeError when +pdata+ is of any other type.  Errors raised
  # while building the graph are logged and re-raised.
  def self.create_provenance_graph(pdata, svgId, opts={})
    if pdata.is_a?(Array) || pdata.is_a?(ArvadosResourceList)
      indexed = {}
      pdata.each do |k|
        indexed[k[:uuid]] = k if k[:uuid]
      end
      pdata = indexed
    end

    unless pdata.is_a? Hash
      raise "create_provenance_graph accepts Array or Hash for pdata only, pdata is #{pdata.class}"
    end

    gr = "strict digraph {\n" \
         "node [fontsize=10,fontname=\"Helvetica,Arial,sans-serif\"];\n" \
         "edge [fontsize=10,fontname=\"Helvetica,Arial,sans-serif\"];\n"

    if opts[:direction] == :bottom_up
      gr += "edge [dir=back];"
    end

    begin
      pdata = pdata.stringify_keys

      g = GenerateGraph.new(pdata, opts)

      pdata.each do |k, _v|
        if !opts[:only_components] || k.start_with?("component_")
          gr += g.generate_provenance_edges(k)
        end
      end

      gr += g.describe_jobs unless opts[:only_components]
    rescue => e
      Rails.logger.warn "#{e.inspect}"
      Rails.logger.warn "#{e.backtrace.join("\n\t")}"
      raise
    end

    gr += "}"

    require 'open3'

    # capture2 feeds stdin and drains stdout concurrently, so a large graph
    # cannot block on a full pipe.
    svg, status = Open3.capture2("dot", "-Tsvg", stdin_data: gr)
    unless status.success?
      Rails.logger.warn "dot exited with status #{status.exitstatus} while rendering provenance graph"
    end

    # Strip the XML prolog and doctype so the SVG can be embedded inline,
    # then tag the root element with the requested id.
    svg = svg.sub(/<\?xml.*?\?>/m, "")
    svg = svg.sub(/<!DOCTYPE.*?>/m, "")
    svg.sub(/<svg /, "<svg id=\"#{svgId}\" ")
  end

  # Recursively searches +sp+ (ArvadosBase record, Hash, Array or String)
  # for collection references and yields (hash, uuid, key) for each string
  # that contains one.
  #
  # Position indicates whether it is a content hash or arvados uuid: one of
  # the first two values holds the match, the other is always nil.  +key+ is
  # the outermost Hash key under which the string was found (nil when there
  # is none).  Only the first match within a given string is reported, and a
  # content hash takes precedence over a uuid.
  def self.find_collections(sp, key=nil, &b)
    case sp
    when ArvadosBase
      sp.class.columns.each do |c|
        find_collections(sp[c.name.to_sym], nil, &b)
      end
    when Hash
      sp.each do |k, v|
        find_collections(v, key || k, &b)
      end
    when Array
      sp.each do |v|
        find_collections(v, key, &b)
      end
    when String
      if (m = /[a-f0-9]{32}\+\d+/.match(sp))
        yield m[0], nil, key
      elsif (m = /[0-9a-z]{5}-4zz18-[0-9a-z]{15}/.match(sp))
        yield nil, m[0], key
      end
    end
  end
end