21121: Handle comparing numbers better in table sort.
[arvados.git] / tools / cluster-activity / arvados_cluster_activity / report.py
1 #!/usr/bin/env python3
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: AGPL-3.0
5
6 import logging
7 import ciso8601
8 import arvados.util
9 import re
10 import csv
11 import math
12 import collections
13 import json
14 from datetime import date, datetime, timedelta
15 import pkg_resources
16
17 from dataclasses import dataclass
18
19 import crunchstat_summary.dygraphs
20 from crunchstat_summary.summarizer import Task
21
22 from arvados_cluster_activity.prometheus import get_metric_usage, get_data_usage
23
24 sortablejs = """
25 /**
26  * sortable v3.2.3
27  *
28  * https://www.npmjs.com/package/sortable-tablesort
29  * https://github.com/tofsjonas/sortable
30  *
31  * Makes html tables sortable, No longer ie9+ ðŸ˜¢
32  *
33  * Styling is done in css.
34  *
35  * Copyleft 2017 Jonas Earendel
36  *
37  * This is free and unencumbered software released into the public domain.
38  *
39  * Anyone is free to copy, modify, publish, use, compile, sell, or
40  * distribute this software, either in source code form or as a compiled
41  * binary, for any purpose, commercial or non-commercial, and by any
42  * means.
43  *
44  * In jurisdictions that recognize copyright laws, the author or authors
45  * of this software dedicate any and all copyright interest in the
46  * software to the public domain. We make this dedication for the benefit
47  * of the public at large and to the detriment of our heirs and
48  * successors. We intend this dedication to be an overt act of
49  * relinquishment in perpetuity of all present and future rights to this
50  * software under copyright law.
51  *
52  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
53  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
54  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
55  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
56  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
57  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
58  * OTHER DEALINGS IN THE SOFTWARE.
59  *
60  * For more information, please refer to <http://unlicense.org>
61  *
62  */
63 document.addEventListener('click', function (e) {
64     try {
65         // allows for elements inside TH
66         function findElementRecursive(element, tag) {
67             return element.nodeName === tag ? element : findElementRecursive(element.parentNode, tag);
68         }
69         var ascending_table_sort_class = 'asc';
70         var no_sort_class = 'no-sort';
71         var null_last_class = 'n-last';
72         var table_class_name = 'sortable';
73         var alt_sort_1 = e.shiftKey || e.altKey;
74         var element = findElementRecursive(e.target, 'TH');
75         var tr = element.parentNode;
76         var thead = tr.parentNode;
77         var table = thead.parentNode;
78         function getValue(element) {
79             var _a;
80             var value = alt_sort_1 ? element.dataset.sortAlt : (_a = element.dataset.sort) !== null && _a !== void 0 ? _a : element.textContent;
81             return value;
82         }
83         if (thead.nodeName === 'THEAD' && // sortable only triggered in `thead`
84             table.classList.contains(table_class_name) &&
85             !element.classList.contains(no_sort_class) // .no-sort is now core functionality, no longer handled in CSS
86         ) {
87             var column_index_1;
88             var nodes = tr.cells;
89             var tiebreaker_1 = +element.dataset.sortTbr;
90             // Reset thead cells and get column index
91             for (var i = 0; i < nodes.length; i++) {
92                 if (nodes[i] === element) {
93                     column_index_1 = +element.dataset.sortCol || i;
94                 }
95                 else {
96                     nodes[i].setAttribute('aria-sort', 'none');
97                 }
98             }
99             var direction = 'descending';
100             if (element.getAttribute('aria-sort') === 'descending' ||
101                 (table.classList.contains(ascending_table_sort_class) && element.getAttribute('aria-sort') !== 'ascending')) {
102                 direction = 'ascending';
103             }
104             // Update the `th` class accordingly
105             element.setAttribute('aria-sort', direction);
106             var reverse_1 = direction === 'ascending';
107             var sort_null_last_1 = table.classList.contains(null_last_class);
108             var compare_1 = function (a, b, index) {
109                 var x = getValue(b.cells[index]);
110                 var y = getValue(a.cells[index]);
111                 if (sort_null_last_1) {
112                     if (x === '' && y !== '') {
113                         return -1;
114                     }
115                     if (y === '' && x !== '') {
116                         return 1;
117                     }
118                 }
119                 // Before comparing, clean up formatted numbers that may have a leading dollar sign and/or commas.
120                 x = x.replace("$", "").replace(",", "");
121                 y = y.replace("$", "").replace(",", "");
122                 var temp = +x - +y;
123                 var bool = isNaN(temp) ? x.localeCompare(y) : temp;
124                 return reverse_1 ? -bool : bool;
125             };
126             // loop through all tbodies and sort them
127             for (var i = 0; i < table.tBodies.length; i++) {
128                 var org_tbody = table.tBodies[i];
129                 // Put the array rows in an array, so we can sort them...
130                 var rows = [].slice.call(org_tbody.rows, 0);
131                 // Sort them using Array.prototype.sort()
132                 rows.sort(function (a, b) {
133                     var bool = compare_1(a, b, column_index_1);
134                     return bool === 0 && !isNaN(tiebreaker_1) ? compare_1(a, b, tiebreaker_1) : bool;
135                 });
136                 // Make an empty clone
137                 var clone_tbody = org_tbody.cloneNode();
138                 // Put the sorted rows inside the clone
139                 clone_tbody.append.apply(clone_tbody, rows);
140                 // And finally replace the unsorted tbody with the sorted one
141                 table.replaceChild(clone_tbody, org_tbody);
142             }
143         }
144         // eslint-disable-next-line no-unused-vars
145     }
146     catch (error) {
147         // console.log(error)
148     }
149 });
150 """
151
152 sortablecss = """
153 @charset "UTF-8";
154 .sortable thead th:not(.no-sort) {
155   cursor: pointer;
156 }
157 .sortable thead th:not(.no-sort)::after, .sortable thead th:not(.no-sort)::before {
158   transition: color 0.1s ease-in-out;
159   font-size: 1.2em;
160   color: transparent;
161 }
162 .sortable thead th:not(.no-sort)::after {
163   margin-left: 3px;
164   content: "â–¸";
165 }
166 .sortable thead th:not(.no-sort):hover::after {
167   color: inherit;
168 }
169 .sortable thead th:not(.no-sort)[aria-sort=descending]::after {
170   color: inherit;
171   content: "â–¾";
172 }
173 .sortable thead th:not(.no-sort)[aria-sort=ascending]::after {
174   color: inherit;
175   content: "â–´";
176 }
177 .sortable thead th:not(.no-sort).indicator-left::after {
178   content: "";
179 }
180 .sortable thead th:not(.no-sort).indicator-left::before {
181   margin-right: 3px;
182   content: "â–¸";
183 }
184 .sortable thead th:not(.no-sort).indicator-left:hover::before {
185   color: inherit;
186 }
187 .sortable thead th:not(.no-sort).indicator-left[aria-sort=descending]::before {
188   color: inherit;
189   content: "â–¾";
190 }
191 .sortable thead th:not(.no-sort).indicator-left[aria-sort=ascending]::before {
192   color: inherit;
193   content: "â–´";
194 }
195
196 /*# sourceMappingURL=sortable.css.map */
197 """
198
199 @dataclass
200 class WorkflowRunSummary:
201     name: str
202     uuid: str
203     count: int = 0
204     cost: float = 0
205     hours: float = 0
206
207 @dataclass
208 class ProjectSummary:
209     users: set
210     uuid: str
211     runs: dict[str, WorkflowRunSummary]
212     earliest: datetime = datetime(year=9999, month=1, day=1)
213     latest: datetime = datetime(year=1900, month=1, day=1)
214     name: str = ""
215     cost: float = 0
216     count: int = 0
217     hours: float = 0
218     activityspan: str = ""
219     tablerow: str = ""
220
221 @dataclass
222 class Summarizer:
223     label: str
224     tasks: collections.defaultdict[str, Task]
225
226     def long_label(self):
227         return self.label
228
229
230 def date_export(item):
231     if isinstance(item, datetime):
232         return """@new Date("{}")@""".format(item.strftime("%Y-%m-%dT%H:%M:%SZ"))
233
234 def aws_monthly_cost(value):
235     value_gb = value / (1024*1024*1024)
236     first_50tb = min(1024*50, value_gb)
237     next_450tb = max(min(1024*450, value_gb-1024*50), 0)
238     over_500tb = max(value_gb-1024*500, 0)
239
240     monthly_cost = (first_50tb * 0.023) + (next_450tb * 0.022) + (over_500tb * 0.021)
241     return monthly_cost
242
243
244 def format_with_suffix_base2(summary_value):
245     for scale in ["KiB", "MiB", "GiB", "TiB", "PiB", "EiB"]:
246         summary_value = summary_value / 1024
247         if summary_value < 1024:
248             return "%.3f %s" % (summary_value, scale)
249
250 def format_with_suffix_base10(summary_value):
251     for scale in ["KB", "MB", "GB", "TB", "PB", "EB"]:
252         summary_value = summary_value / 1000
253         if summary_value < 1000:
254             return "%.3f %s" % (summary_value, scale)
255
256 containers_category = 'Concurrent running containers'
257 storage_category = 'Storage Usage'
258
259 class ReportChart(crunchstat_summary.dygraphs.DygraphsChart):
260     def sections(self):
261         return [
262             {
263                 'label': s.long_label(),
264                 'charts': [
265                     self.chartdata(s.label, s.tasks, stat)
266                     for stat in ((containers_category, ['containers']),
267                                  (storage_category, ['storage used']),
268                                  )
269                     ],
270             }
271             for s in self.summarizers]
272
273     def js(self):
274         return 'var chartdata = {};\n{}'.format(
275             json.dumps(self.sections(), default=date_export).replace('"@', '').replace('@"', '').replace('\\"', '"'),
276             '\n'.join([pkg_resources.resource_string('crunchstat_summary', jsa).decode('utf-8') for jsa in self.JSASSETS]))
277
278     def _collate_data(self, tasks, stats):
279         data = []
280         nulls = []
281         # uuid is category for crunch2
282         for uuid, task in tasks.items():
283             # All stats in a category are assumed to have the same time base and same number of samples
284             category = stats[0]
285             series_names = stats[1]
286             sn0 = series_names[0]
287             series = task.series[(category,sn0)]
288             for i in range(len(series)):
289                 pt = series[i]
290                 vals = [task.series[(category,stat)][i][1] for stat in series_names[1:]]
291                 data.append([pt[0]] + nulls + [pt[1]] + vals)
292             nulls.append(None)
293         return sorted(data)
294
295
296 WEBCHART_CLASS = ReportChart
297
298
299 def runtime_str(container_request, containers):
300     length = ciso8601.parse_datetime(containers[container_request["container_uuid"]]["finished_at"]) - ciso8601.parse_datetime(containers[container_request["container_uuid"]]["started_at"])
301
302     hours = length.days * 24 + (length.seconds // 3600)
303     minutes = (length.seconds // 60) % 60
304     seconds = length.seconds % 60
305
306     return "%i:%02i:%02i" % (hours, minutes, seconds)
307
308 def runtime_in_hours(runtime):
309     sp = runtime.split(":")
310     hours = float(sp[0])
311     hours += float(sp[1]) / 60
312     hours += float(sp[2]) / 3600
313     return hours
314
315 def hours_to_runtime_str(frac_hours):
316     hours = math.floor(frac_hours)
317     minutes = (frac_hours - math.floor(frac_hours)) * 60.0
318     seconds = (minutes - math.floor(minutes)) * 60.0
319
320     return "%i:%02i:%02i" % (hours, minutes, seconds)
321
322
323 def csv_dateformat(datestr):
324     dt = ciso8601.parse_datetime(datestr)
325     return dt.strftime("%Y-%m-%d %H:%M:%S")
326
327
328 class ClusterActivityReport(object):
329     def __init__(self, prom_client, label=None, threads=1, **kwargs):
330         self.threadcount = threads
331         self.arv_client = arvados.api()
332         self.prom_client = prom_client
333         self.cluster = self.arv_client.config()["ClusterID"]
334
335         self.active_users = set()
336         self.project_summary = {}
337         self.total_hours = 0
338         self.total_cost = 0
339         self.total_workflows = 0
340         self.summarizers = []
341         self.storage_cost = 0
342
343     def run(self):
344         pass
345
346     def collect_graph(self, s1, since, to, taskname, legend, metric, resampleTo, extra=None):
347         if not self.prom_client:
348             return
349
350         task = s1.tasks[taskname]
351
352         for series in get_metric_usage(self.prom_client, since, to, metric % self.cluster, resampleTo=resampleTo):
353             for t in series.itertuples():
354                 task.series[taskname, legend].append(t)
355                 if extra:
356                     extra(t)
357
358     def collect_storage_cost(self, t):
359         self.storage_cost += aws_monthly_cost(t.y) / (30*24)
360
361     def html_report(self, since, to, exclude):
362
363         self.label = "Cluster report for %s from %s to %s" % (self.cluster, since.date(), to.date())
364
365         for row in self.report_from_api(since, to, False, exclude):
366             pass
367
368         logging.info("Getting container hours time series")
369         s1 = Summarizer(label="", tasks=collections.defaultdict(Task))
370         self.collect_graph(s1, since, to, containers_category, "containers",
371                            "arvados_dispatchcloud_containers_running{cluster='%s'}", resampleTo="5min")
372
373         logging.info("Getting data usage time series")
374         s2 = Summarizer(label="", tasks=collections.defaultdict(Task))
375         self.collect_graph(s2, since, to, storage_category, "managed",
376                            "arvados_keep_collection_bytes{cluster='%s'}", resampleTo="60min")
377
378         self.collect_graph(s2, since, to, storage_category, "storage used",
379                            "arvados_keep_total_bytes{cluster='%s'}", resampleTo="60min", extra=self.collect_storage_cost)
380
381         managed_data_now = s2.tasks[storage_category].series[storage_category,"managed"][-1]
382         storage_used_now = s2.tasks[storage_category].series[storage_category,"storage used"][-1]
383
384         storage_cost = aws_monthly_cost(storage_used_now.y)
385         dedup_ratio = managed_data_now.y/storage_used_now.y
386         dedup_savings = aws_monthly_cost(managed_data_now.y) - storage_cost
387
388         self.summarizers = [s1, s2]
389
390         tophtml = ""
391         bottomhtml = ""
392         label = self.label
393
394         tophtml = []
395
396         workbench = self.arv_client.config()["Services"]["Workbench2"]["ExternalURL"]
397         if workbench.endswith("/"):
398             workbench = workbench[:-1]
399
400         if to.date() == date.today():
401             tophtml.append("""<h2>Cluster status as of {now}</h2>
402             <table class='aggtable'><tbody>
403             <tr><td><a href="{workbench}/users">Total users</a></td><td>{total_users}</td></tr>
404             <tr><td>Total projects</td><td>{total_projects}</td></tr>
405             <tr><td>Total data under management</td><td>{managed_data_now}</td></tr>
406             <tr><td>Total storage usage</td><td>{storage_used_now}</td></tr>
407             <tr><td>Deduplication ratio</td><td>{dedup_ratio:.1f}</td></tr>
408             <tr><td>Approximate monthly storage cost</td><td>${storage_cost:,.2f}</td></tr>
409             <tr><td>Monthly savings from storage deduplication</td><td>${dedup_savings:,.2f}</td></tr>
410             </tbody></table>
411             """.format(now=date.today(),
412                        total_users=self.total_users,
413                        total_projects=self.total_projects,
414                        managed_data_now=format_with_suffix_base10(managed_data_now.y),
415                        storage_used_now=format_with_suffix_base10(storage_used_now.y),
416                        dedup_savings=dedup_savings,
417                        storage_cost=storage_cost,
418                        dedup_ratio=dedup_ratio,
419                            workbench=workbench))
420
421         tophtml.append("""<h2>Activity and cost over the {reporting_days} day period {since} to {to}</h2>
422         <table class='aggtable'><tbody>
423         <tr><td>Active users</td><td>{active_users}</td></tr>
424         <tr><td><a href="#Active_Projects">Active projects</a></td><td>{active_projects}</td></tr>
425         <tr><td>Workflow runs</td><td>{total_workflows:,}</td></tr>
426         <tr><td>Compute used</td><td>{total_hours:,.1f} hours</td></tr>
427         <tr><td>Compute cost</td><td>${total_cost:,.2f}</td></tr>
428         <tr><td>Storage cost</td><td>${storage_cost:,.2f}</td></tr>
429         </tbody></table>
430         """.format(active_users=len(self.active_users),
431                    total_users=self.total_users,
432                    total_hours=self.total_hours,
433                    total_cost=self.total_cost,
434                    total_workflows=self.total_workflows,
435                    active_projects=len(self.project_summary),
436                    since=since.date(), to=to.date(),
437                    reporting_days=(to - since).days,
438                    storage_cost=self.storage_cost))
439
440         bottomhtml = []
441
442         projectlist = sorted(self.project_summary.items(), key=lambda x: x[1].cost, reverse=True)
443
444         for k, prj in projectlist:
445             if prj.earliest.date() == prj.latest.date():
446                 prj.activityspan = "{}".format(prj.earliest.date())
447             else:
448                 prj.activityspan = "{} to {}".format(prj.earliest.date(), prj.latest.date())
449
450             prj.tablerow = """<td>{users}</td> <td>{active}</td> <td>{hours:,.1f}</td> <td>${cost:,.2f}</td>""".format(
451                 name=prj.name,
452                 active=prj.activityspan,
453                 cost=prj.cost,
454                 hours=prj.hours,
455                 users=", ".join(prj.users),
456             )
457
458         bottomhtml.append(
459             """
460             <style>{style}</style>
461             <script>{script}</script>
462             <a id="Active_Projects"><h2>Active Projects</h2></a>
463             <table class='sortable'>
464             <thead><tr><th>Project</th> <th>Users</th> <th>Active</th> <th>Compute usage (hours)</th> <th>Compute cost</th> </tr></thead>
465             <tbody><tr>{projects}</tr></tbody>
466             </table>
467             """.format(projects="</tr>\n<tr>".join("""<td><a href="#{name}">{name}</a></td>{rest}""".format(name=prj.name, rest=prj.tablerow) for k, prj in projectlist),
468                        script=sortablejs,
469                        style=sortablecss))
470
471         for k, prj in projectlist:
472             wfsum = []
473             for k2, r in sorted(prj.runs.items(), key=lambda x: x[1].count, reverse=True):
474                 wfsum.append("""
475                 <tr><td>{count}</td> <td>{workflowlink}</td>  <td>{runtime}</td> <td>${cost:,.2f}</td> <td>${totalcost:,.2f}</td></tr>
476                 """.format(
477                     count=r.count,
478                     runtime=hours_to_runtime_str(r.hours/r.count),
479                     cost=r.cost/r.count,
480                     totalcost=r.cost,
481                     workflowlink="""<a href="{workbench}/workflows/{uuid}">{name}</a>""".format(workbench=workbench,uuid=r.uuid,name=r.name)
482                     if r.uuid != "none" else r.name))
483
484                 # <table>
485                 # <thead><tr><th>Users</th> <th>Active</th> <th>Compute usage</th> <th>Compute cost</th> </tr></thead>
486                 # <tbody><tr>{projectrow}</tr></tbody>
487                 # </table>
488
489
490             bottomhtml.append(
491                 """<a id="{name}"></a><a href="{workbench}/projects/{uuid}"><h2>{name}</h2></a>
492
493                 <table>
494                 <tbody><tr>{projectrow}</tr></tbody>
495                 </table>
496
497                 <table class='sortable'>
498                 <thead><tr><th>Workflow run count</th> <th>Workflow name</th> <th>Mean runtime</th> <th>Mean cost per run</th> <th>Sum cost over runs</th></tr></thead>
499                 <tbody>
500                 {wfsum}
501                 </tbody></table>
502                 """.format(name=prj.name,
503                            users=", ".join(prj.users),
504                            cost=prj.cost,
505                            hours=prj.hours,
506                            wfsum=" ".join(wfsum),
507                            earliest=prj.earliest.date(),
508                            latest=prj.latest.date(),
509                            activity=prj.activityspan,
510                            userplural='s' if len(prj.users) > 1 else '',
511                            projectrow=prj.tablerow,
512                            workbench=workbench,
513                            uuid=prj.uuid)
514             )
515
516         return WEBCHART_CLASS(label, self.summarizers).html(tophtml, bottomhtml)
517
518     def flush_containers(self, pending, include_steps, exclude):
519         containers = {}
520
521         for container in arvados.util.keyset_list_all(
522             self.arv_client.containers().list,
523             filters=[
524                 ["uuid", "in", [c["container_uuid"] for c in pending if c["container_uuid"]]],
525             ],
526             select=["uuid", "started_at", "finished_at", "cost"]):
527
528             containers[container["uuid"]] = container
529
530         workflows = {}
531         workflows["none"] = "workflow run from command line"
532
533         for wf in arvados.util.keyset_list_all(
534                 self.arv_client.workflows().list,
535                 filters=[
536                     ["uuid", "in", list(set(c["properties"]["template_uuid"]
537                                             for c in pending
538                                             if "template_uuid" in c["properties"] and c["properties"]["template_uuid"].startswith(self.arv_client.config()["ClusterID"])))],
539                 ],
540                 select=["uuid", "name"]):
541             workflows[wf["uuid"]] = wf["name"]
542
543         projects = {}
544
545         for pr in arvados.util.keyset_list_all(
546                 self.arv_client.groups().list,
547                 filters=[
548                     ["uuid", "in", list(set(c["owner_uuid"] for c in pending if c["owner_uuid"][6:11] == 'j7d0g'))],
549                 ],
550                 select=["uuid", "name"]):
551             projects[pr["uuid"]] = pr["name"]
552
553         for pr in arvados.util.keyset_list_all(
554                 self.arv_client.users().list,
555                 filters=[
556                     ["uuid", "in", list(set(c["owner_uuid"] for c in pending if c["owner_uuid"][6:11] == 'tpzed')|set(c["modified_by_user_uuid"] for c in pending))],
557                 ],
558                 select=["uuid", "full_name", "first_name", "last_name"]):
559             projects[pr["uuid"]] = pr["full_name"]
560
561         if include_steps:
562             name_regex = re.compile(r"(.+)_[0-9]+")
563             child_crs = {}
564             child_cr_containers = set()
565             stepcount = 0
566
567             logging.info("Getting workflow steps")
568             for cr in arvados.util.keyset_list_all(
569                 self.arv_client.container_requests().list,
570                 filters=[
571                     ["requesting_container_uuid", "in", list(containers.keys())],
572                 ],
573                 select=["uuid", "name", "cumulative_cost", "requesting_container_uuid", "container_uuid"]):
574
575                 if cr["cumulative_cost"] == 0:
576                     continue
577
578                 g = name_regex.fullmatch(cr["name"])
579                 if g:
580                     cr["name"] = g[1]
581
582                 child_crs.setdefault(cr["requesting_container_uuid"], []).append(cr)
583                 child_cr_containers.add(cr["container_uuid"])
584                 if len(child_cr_containers) == 1000:
585                     stepcount += len(child_cr_containers)
586                     for container in arvados.util.keyset_list_all(
587                             self.arv_client.containers().list,
588                             filters=[
589                                 ["uuid", "in", list(child_cr_containers)],
590                             ],
591                             select=["uuid", "started_at", "finished_at", "cost"]):
592
593                         containers[container["uuid"]] = container
594
595                     logging.info("Got workflow steps %s - %s", stepcount-len(child_cr_containers), stepcount)
596                     child_cr_containers.clear()
597
598             if child_cr_containers:
599                 stepcount += len(child_cr_containers)
600                 for container in arvados.util.keyset_list_all(
601                         self.arv_client.containers().list,
602                         filters=[
603                             ["uuid", "in", list(child_cr_containers)],
604                         ],
605                         select=["uuid", "started_at", "finished_at", "cost"]):
606
607                     containers[container["uuid"]] = container
608                 logging.info("Got workflow steps %s - %s", stepcount-len(child_cr_containers), stepcount)
609
610         for container_request in pending:
611             if not container_request["container_uuid"] or not containers[container_request["container_uuid"]]["started_at"] or not containers[container_request["container_uuid"]]["finished_at"]:
612                 continue
613
614             template_uuid = container_request["properties"].get("template_uuid", "none")
615             workflowname = container_request["name"] if template_uuid == "none" else workflows.get(template_uuid, "workflow missing")
616
617             if exclude and re.search(exclude, workflowname, flags=re.IGNORECASE):
618                 continue
619
620             yield {
621                 "Project": projects.get(container_request["owner_uuid"], "unknown owner"),
622                 "ProjectUUID": container_request["owner_uuid"],
623                 "Workflow": workflowname,
624                 "WorkflowUUID": container_request["properties"].get("template_uuid", "none"),
625                 "Step": "workflow runner",
626                 "StepUUID": container_request["uuid"],
627                 "Sample": container_request["name"],
628                 "SampleUUID": container_request["uuid"],
629                 "User": projects.get(container_request["modified_by_user_uuid"], "unknown user"),
630                 "UserUUID": container_request["modified_by_user_uuid"],
631                 "Submitted": csv_dateformat(container_request["created_at"]),
632                 "Started": csv_dateformat(containers[container_request["container_uuid"]]["started_at"]),
633                 "Finished": csv_dateformat(containers[container_request["container_uuid"]]["finished_at"]),
634                 "Runtime": runtime_str(container_request, containers),
635                 "Cost": round(containers[container_request["container_uuid"]]["cost"] if include_steps else container_request["cumulative_cost"], 3),
636                 "CumulativeCost": round(container_request["cumulative_cost"], 3)
637                 }
638
639             if include_steps:
640                 for child_cr in child_crs.get(container_request["container_uuid"], []):
641                     if not child_cr["container_uuid"] or not containers[child_cr["container_uuid"]]["started_at"] or not containers[child_cr["container_uuid"]]["finished_at"]:
642                         continue
643                     yield {
644                         "Project": projects.get(container_request["owner_uuid"], "unknown owner"),
645                         "ProjectUUID": container_request["owner_uuid"],
646                         "Workflow": workflows.get(container_request["properties"].get("template_uuid", "none"), "workflow missing"),
647                         "WorkflowUUID": container_request["properties"].get("template_uuid", "none"),
648                         "Step": child_cr["name"],
649                         "StepUUID": child_cr["uuid"],
650                         "Sample": container_request["name"],
651                         "SampleUUID": container_request["name"],
652                         "User": projects.get(container_request["modified_by_user_uuid"], "unknown user"),
653                         "UserUUID": container_request["modified_by_user_uuid"],
654                         "Submitted": csv_dateformat(child_cr["created_at"]),
655                         "Started": csv_dateformat(containers[child_cr["container_uuid"]]["started_at"]),
656                         "Finished": csv_dateformat(containers[child_cr["container_uuid"]]["finished_at"]),
657                         "Runtime": runtime_str(child_cr, containers),
658                         "Cost": round(containers[child_cr["container_uuid"]]["cost"], 3),
659                         "CumulativeCost": round(containers[child_cr["container_uuid"]]["cost"], 3),
660                         }
661
662
663     def collect_summary_stats(self, row):
664         self.active_users.add(row["User"])
665         self.project_summary.setdefault(row["ProjectUUID"],
666                                         ProjectSummary(users=set(),
667                                                        runs={},
668                                                        uuid=row["ProjectUUID"],
669                                                        name=row["Project"]))
670         prj = self.project_summary[row["ProjectUUID"]]
671         cost = row["Cost"]
672         prj.cost += cost
673         prj.count += 1
674         prj.users.add(row["User"])
675         hrs = runtime_in_hours(row["Runtime"])
676         prj.hours += hrs
677
678         started = datetime.strptime(row["Started"], "%Y-%m-%d %H:%M:%S")
679         finished = datetime.strptime(row["Finished"], "%Y-%m-%d %H:%M:%S")
680
681         if started < prj.earliest:
682             prj.earliest = started
683
684         if finished > prj.latest:
685             prj.latest = finished
686
687         if row["Step"] == "workflow runner":
688             prj.runs.setdefault(row["Workflow"], WorkflowRunSummary(name=row["Workflow"],
689                                                                     uuid=row["WorkflowUUID"]))
690             wfuuid = row["Workflow"]
691             prj.runs[wfuuid].count += 1
692             prj.runs[wfuuid].cost += row["CumulativeCost"]
693             prj.runs[wfuuid].hours += hrs
694             self.total_workflows += 1
695
696         self.total_hours += hrs
697         self.total_cost += cost
698
699     def report_from_api(self, since, to, include_steps, exclude):
700         pending = []
701
702         count = 0
703         for container_request in arvados.util.keyset_list_all(
704                 self.arv_client.container_requests().list,
705                 filters=[
706                     ["command", "like", "[\"arvados-cwl-runner%"],
707                     ["created_at", ">=", since.strftime("%Y%m%dT%H%M%SZ")],
708                 ],
709                 select=["uuid", "owner_uuid", "container_uuid", "name", "cumulative_cost", "properties", "modified_by_user_uuid", "created_at"]):
710
711             if container_request["cumulative_cost"] == 0:
712                 continue
713
714             if len(pending) < 1000:
715                 pending.append(container_request)
716             else:
717                 count += len(pending)
718                 logging.info("Exporting workflow runs %s - %s", count-len(pending), count)
719                 for row in self.flush_containers(pending, include_steps, exclude):
720                     self.collect_summary_stats(row)
721                     yield row
722                 pending.clear()
723
724         count += len(pending)
725         logging.info("Exporting workflow runs %s - %s", count-len(pending), count)
726         for row in self.flush_containers(pending, include_steps, exclude):
727             self.collect_summary_stats(row)
728             yield row
729
730         userinfo = self.arv_client.users().list(filters=[["is_active", "=", True]], limit=0).execute()
731         self.total_users = userinfo["items_available"]
732
733         groupinfo = self.arv_client.groups().list(filters=[["group_class", "=", "project"]], limit=0).execute()
734         self.total_projects = groupinfo["items_available"]
735
736     def csv_report(self, since, to, out, include_steps, columns, exclude):
737         if columns:
738             columns = columns.split(",")
739         else:
740             if include_steps:
741                 columns = ("Project", "Workflow", "Step", "Sample", "User", "Submitted", "Runtime", "Cost")
742             else:
743                 columns = ("Project", "Workflow", "Sample", "User", "Submitted", "Runtime", "Cost")
744
745         csvwriter = csv.DictWriter(out, fieldnames=columns, extrasaction="ignore")
746         csvwriter.writeheader()
747
748         for row in self.report_from_api(since, to, include_steps, exclude):
749             csvwriter.writerow(row)