tools/cluster-activity/arvados_cluster_activity/report.py

   1 #!/usr/bin/env python3
   2 # Copyright (C) The Arvados Authors. All rights reserved.
   3 #
   4 # SPDX-License-Identifier: AGPL-3.0
   5
   6 import logging
   7 import ciso8601
   8 import arvados.util
   9 import re
  10 import csv
  11 import math
  12 import collections
  13 import json
  14 from datetime import date, datetime, timedelta
  15 import pkg_resources
  16
  17 from dataclasses import dataclass
  18
  19 import crunchstat_summary.dygraphs
  20 from crunchstat_summary.summarizer import Task
  21
  22 from arvados_cluster_activity.prometheus import get_metric_usage, get_data_usage
  23
  24 sortablejs = """
  25 /**
  26  * sortable v3.2.3
  27  *
  28  * https://www.npmjs.com/package/sortable-tablesort
  29  * https://github.com/tofsjonas/sortable
  30  *
  31  * Makes html tables sortable, No longer ie9+ 😢
  32  *
  33  * Styling is done in css.
  34  *
  35  * Copyleft 2017 Jonas Earendel
  36  *
  37  * This is free and unencumbered software released into the public domain.
  38  *
  39  * Anyone is free to copy, modify, publish, use, compile, sell, or
  40  * distribute this software, either in source code form or as a compiled
  41  * binary, for any purpose, commercial or non-commercial, and by any
  42  * means.
  43  *
  44  * In jurisdictions that recognize copyright laws, the author or authors
  45  * of this software dedicate any and all copyright interest in the
  46  * software to the public domain. We make this dedication for the benefit
  47  * of the public at large and to the detriment of our heirs and
  48  * successors. We intend this dedication to be an overt act of
  49  * relinquishment in perpetuity of all present and future rights to this
  50  * software under copyright law.
  51  *
  52  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  53  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  54  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  55  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  56  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  57  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  58  * OTHER DEALINGS IN THE SOFTWARE.
  59  *
  60  * For more information, please refer to <http://unlicense.org>
  61  *
  62  */
  63 document.addEventListener('click', function (e) {
  64     try {
  65         // allows for elements inside TH
  66         function findElementRecursive(element, tag) {
  67             return element.nodeName === tag ? element : findElementRecursive(element.parentNode, tag);
  68         }
  69         var ascending_table_sort_class = 'asc';
  70         var no_sort_class = 'no-sort';
  71         var null_last_class = 'n-last';
  72         var table_class_name = 'sortable';
  73         var alt_sort_1 = e.shiftKey || e.altKey;
  74         var element = findElementRecursive(e.target, 'TH');
  75         var tr = element.parentNode;
  76         var thead = tr.parentNode;
  77         var table = thead.parentNode;
  78         function getValue(element) {
  79             var _a;
  80             var value = alt_sort_1 ? element.dataset.sortAlt : (_a = element.dataset.sort) !== null && _a !== void 0 ? _a : element.textContent;
  81             return value;
  82         }
  83         if (thead.nodeName === 'THEAD' && // sortable only triggered in `thead`
  84             table.classList.contains(table_class_name) &&
  85             !element.classList.contains(no_sort_class) // .no-sort is now core functionality, no longer handled in CSS
  86         ) {
  87             var column_index_1;
  88             var nodes = tr.cells;
  89             var tiebreaker_1 = +element.dataset.sortTbr;
  90             // Reset thead cells and get column index
  91             for (var i = 0; i < nodes.length; i++) {
  92                 if (nodes[i] === element) {
  93                     column_index_1 = +element.dataset.sortCol || i;
  94                 }
  95                 else {
  96                     nodes[i].setAttribute('aria-sort', 'none');
  97                 }
  98             }
  99             var direction = 'descending';
 100             if (element.getAttribute('aria-sort') === 'descending' ||
 101                 (table.classList.contains(ascending_table_sort_class) && element.getAttribute('aria-sort') !== 'ascending')) {
 102                 direction = 'ascending';
 103             }
 104             // Update the `th` class accordingly
 105             element.setAttribute('aria-sort', direction);
 106             var reverse_1 = direction === 'ascending';
 107             var sort_null_last_1 = table.classList.contains(null_last_class);
 108             var compare_1 = function (a, b, index) {
 109                 var x = getValue(b.cells[index]);
 110                 var y = getValue(a.cells[index]);
 111                 if (sort_null_last_1) {
 112                     if (x === '' && y !== '') {
 113                         return -1;
 114                     }
 115                     if (y === '' && x !== '') {
 116                         return 1;
 117                     }
 118                 }
 119                 // Before comparing, clean up formatted numbers that may have a leading dollar sign and/or commas.
 120                 x = x.replace("$", "").replace(",", "");
 121                 y = y.replace("$", "").replace(",", "");
 122                 var temp = +x - +y;
 123                 var bool = isNaN(temp) ? x.localeCompare(y) : temp;
 124                 return reverse_1 ? -bool : bool;
 125             };
 126             // loop through all tbodies and sort them
 127             for (var i = 0; i < table.tBodies.length; i++) {
 128                 var org_tbody = table.tBodies[i];
 129                 // Put the array rows in an array, so we can sort them...
 130                 var rows = [].slice.call(org_tbody.rows, 0);
 131                 // Sort them using Array.prototype.sort()
 132                 rows.sort(function (a, b) {
 133                     var bool = compare_1(a, b, column_index_1);
 134                     return bool === 0 && !isNaN(tiebreaker_1) ? compare_1(a, b, tiebreaker_1) : bool;
 135                 });
 136                 // Make an empty clone
 137                 var clone_tbody = org_tbody.cloneNode();
 138                 // Put the sorted rows inside the clone
 139                 clone_tbody.append.apply(clone_tbody, rows);
 140                 // And finally replace the unsorted tbody with the sorted one
 141                 table.replaceChild(clone_tbody, org_tbody);
 142             }
 143         }
 144         // eslint-disable-next-line no-unused-vars
 145     }
 146     catch (error) {
 147         // console.log(error)
 148     }
 149 });
 150 """
 151
 152 sortablecss = """
 153 @charset "UTF-8";
 154 .sortable thead th:not(.no-sort) {
 155   cursor: pointer;
 156 }
 157 .sortable thead th:not(.no-sort)::after, .sortable thead th:not(.no-sort)::before {
 158   transition: color 0.1s ease-in-out;
 159   font-size: 1.2em;
 160   color: transparent;
 161 }
 162 .sortable thead th:not(.no-sort)::after {
 163   margin-left: 3px;
 164   content: "▸";
 165 }
 166 .sortable thead th:not(.no-sort):hover::after {
 167   color: inherit;
 168 }
 169 .sortable thead th:not(.no-sort)[aria-sort=descending]::after {
 170   color: inherit;
 171   content: "▾";
 172 }
 173 .sortable thead th:not(.no-sort)[aria-sort=ascending]::after {
 174   color: inherit;
 175   content: "▴";
 176 }
 177 .sortable thead th:not(.no-sort).indicator-left::after {
 178   content: "";
 179 }
 180 .sortable thead th:not(.no-sort).indicator-left::before {
 181   margin-right: 3px;
 182   content: "▸";
 183 }
 184 .sortable thead th:not(.no-sort).indicator-left:hover::before {
 185   color: inherit;
 186 }
 187 .sortable thead th:not(.no-sort).indicator-left[aria-sort=descending]::before {
 188   color: inherit;
 189   content: "▾";
 190 }
 191 .sortable thead th:not(.no-sort).indicator-left[aria-sort=ascending]::before {
 192   color: inherit;
 193   content: "▴";
 194 }
 195
 196 /*# sourceMappingURL=sortable.css.map */
 197 """
 198
 199 @dataclass
 200 class WorkflowRunSummary:
 201     name: str
 202     uuid: str
 203     count: int = 0
 204     cost: float = 0
 205     hours: float = 0
 206
 207 @dataclass
 208 class ProjectSummary:
 209     users: set
 210     uuid: str
 211     runs: dict[str, WorkflowRunSummary]
 212     earliest: datetime = datetime(year=9999, month=1, day=1)
 213     latest: datetime = datetime(year=1900, month=1, day=1)
 214     name: str = ""
 215     cost: float = 0
 216     count: int = 0
 217     hours: float = 0
 218     activityspan: str = ""
 219     tablerow: str = ""
 220
 221 @dataclass
 222 class Summarizer:
 223     label: str
 224     tasks: collections.defaultdict[str, Task]
 225
 226     def long_label(self):
 227         return self.label
 228
 229
 230 def date_export(item):
 231     if isinstance(item, datetime):
 232         return """@new Date("{}")@""".format(item.strftime("%Y-%m-%dT%H:%M:%SZ"))
 233
 234 def aws_monthly_cost(value):
 235     value_gb = value / (1024*1024*1024)
 236     first_50tb = min(1024*50, value_gb)
 237     next_450tb = max(min(1024*450, value_gb-1024*50), 0)
 238     over_500tb = max(value_gb-1024*500, 0)
 239
 240     monthly_cost = (first_50tb * 0.023) + (next_450tb * 0.022) + (over_500tb * 0.021)
 241     return monthly_cost
 242
 243
 244 def format_with_suffix_base2(summary_value):
 245     for scale in ["KiB", "MiB", "GiB", "TiB", "PiB", "EiB"]:
 246         summary_value = summary_value / 1024
 247         if summary_value < 1024:
 248             return "%.3f %s" % (summary_value, scale)
 249
 250 def format_with_suffix_base10(summary_value):
 251     for scale in ["KB", "MB", "GB", "TB", "PB", "EB"]:
 252         summary_value = summary_value / 1000
 253         if summary_value < 1000:
 254             return "%.3f %s" % (summary_value, scale)
 255
 256 containers_category = 'Concurrent running containers'
 257 storage_category = 'Storage Usage'
 258
 259 class ReportChart(crunchstat_summary.dygraphs.DygraphsChart):
 260     def sections(self):
 261         return [
 262             {
 263                 'label': s.long_label(),
 264                 'charts': [
 265                     self.chartdata(s.label, s.tasks, stat)
 266                     for stat in ((containers_category, ['containers']),
 267                                  (storage_category, ['storage used']),
 268                                  )
 269                     ],
 270             }
 271             for s in self.summarizers]
 272
 273     def js(self):
 274         return 'var chartdata = {};\n{}'.format(
 275             json.dumps(self.sections(), default=date_export).replace('"@', '').replace('@"', '').replace('\\"', '"'),
 276             '\n'.join([pkg_resources.resource_string('crunchstat_summary', jsa).decode('utf-8') for jsa in self.JSASSETS]))
 277
 278     def _collate_data(self, tasks, stats):
 279         data = []
 280         nulls = []
 281         # uuid is category for crunch2
 282         for uuid, task in tasks.items():
 283             # All stats in a category are assumed to have the same time base and same number of samples
 284             category = stats[0]
 285             series_names = stats[1]
 286             sn0 = series_names[0]
 287             series = task.series[(category,sn0)]
 288             for i in range(len(series)):
 289                 pt = series[i]
 290                 vals = [task.series[(category,stat)][i][1] for stat in series_names[1:]]
 291                 data.append([pt[0]] + nulls + [pt[1]] + vals)
 292             nulls.append(None)
 293         return sorted(data)
 294
 295
 296 WEBCHART_CLASS = ReportChart
 297
 298
 299 def runtime_str(container_request, containers):
 300     length = ciso8601.parse_datetime(containers[container_request["container_uuid"]]["finished_at"]) - ciso8601.parse_datetime(containers[container_request["container_uuid"]]["started_at"])
 301
 302     hours = length.days * 24 + (length.seconds // 3600)
 303     minutes = (length.seconds // 60) % 60
 304     seconds = length.seconds % 60
 305
 306     return "%i:%02i:%02i" % (hours, minutes, seconds)
 307
 308 def runtime_in_hours(runtime):
 309     sp = runtime.split(":")
 310     hours = float(sp[0])
 311     hours += float(sp[1]) / 60
 312     hours += float(sp[2]) / 3600
 313     return hours
 314
 315 def hours_to_runtime_str(frac_hours):
 316     hours = math.floor(frac_hours)
 317     minutes = (frac_hours - math.floor(frac_hours)) * 60.0
 318     seconds = (minutes - math.floor(minutes)) * 60.0
 319
 320     return "%i:%02i:%02i" % (hours, minutes, seconds)
 321
 322
 323 def csv_dateformat(datestr):
 324     dt = ciso8601.parse_datetime(datestr)
 325     return dt.strftime("%Y-%m-%d %H:%M:%S")
 326
 327
 328 class ClusterActivityReport(object):
 329     def __init__(self, prom_client, label=None, threads=1, **kwargs):
 330         self.threadcount = threads
 331         self.arv_client = arvados.api()
 332         self.prom_client = prom_client
 333         self.cluster = self.arv_client.config()["ClusterID"]
 334
 335         self.active_users = set()
 336         self.project_summary = {}
 337         self.total_hours = 0
 338         self.total_cost = 0
 339         self.total_workflows = 0
 340         self.summarizers = []
 341         self.storage_cost = 0
 342
 343     def run(self):
 344         pass
 345
 346     def collect_graph(self, s1, since, to, taskname, legend, metric, resampleTo, extra=None):
 347         if not self.prom_client:
 348             return
 349
 350         task = s1.tasks[taskname]
 351
 352         for series in get_metric_usage(self.prom_client, since, to, metric % self.cluster, resampleTo=resampleTo):
 353             for t in series.itertuples():
 354                 task.series[taskname, legend].append(t)
 355                 if extra:
 356                     extra(t)
 357
 358     def collect_storage_cost(self, t):
 359         self.storage_cost += aws_monthly_cost(t.y) / (30*24)
 360
 361     def html_report(self, since, to, exclude):
 362
 363         self.label = "Cluster report for %s from %s to %s" % (self.cluster, since.date(), to.date())
 364
 365         for row in self.report_from_api(since, to, False, exclude):
 366             pass
 367
 368         logging.info("Getting container hours time series")
 369         s1 = Summarizer(label="", tasks=collections.defaultdict(Task))
 370         self.collect_graph(s1, since, to, containers_category, "containers",
 371                            "arvados_dispatchcloud_containers_running{cluster='%s'}", resampleTo="5min")
 372
 373         logging.info("Getting data usage time series")
 374         s2 = Summarizer(label="", tasks=collections.defaultdict(Task))
 375         self.collect_graph(s2, since, to, storage_category, "managed",
 376                            "arvados_keep_collection_bytes{cluster='%s'}", resampleTo="60min")
 377
 378         self.collect_graph(s2, since, to, storage_category, "storage used",
 379                            "arvados_keep_total_bytes{cluster='%s'}", resampleTo="60min", extra=self.collect_storage_cost)
 380
 381         managed_data_now = s2.tasks[storage_category].series[storage_category,"managed"][-1]
 382         storage_used_now = s2.tasks[storage_category].series[storage_category,"storage used"][-1]
 383
 384         storage_cost = aws_monthly_cost(storage_used_now.y)
 385         dedup_ratio = managed_data_now.y/storage_used_now.y
 386         dedup_savings = aws_monthly_cost(managed_data_now.y) - storage_cost
 387
 388         self.summarizers = [s1, s2]
 389
 390         tophtml = ""
 391         bottomhtml = ""
 392         label = self.label
 393
 394         tophtml = []
 395
 396         workbench = self.arv_client.config()["Services"]["Workbench2"]["ExternalURL"]
 397         if workbench.endswith("/"):
 398             workbench = workbench[:-1]
 399
 400         if to.date() == date.today():
 401             tophtml.append("""<h2>Cluster status as of {now}</h2>
 402             <table class='aggtable'><tbody>
 403             <tr><td><a href="{workbench}/users">Total users</a></td><td>{total_users}</td></tr>
 404             <tr><td>Total projects</td><td>{total_projects}</td></tr>
 405             <tr><td>Total data under management</td><td>{managed_data_now}</td></tr>
 406             <tr><td>Total storage usage</td><td>{storage_used_now}</td></tr>
 407             <tr><td>Deduplication ratio</td><td>{dedup_ratio:.1f}</td></tr>
 408             <tr><td>Approximate monthly storage cost</td><td>${storage_cost:,.2f}</td></tr>
 409             <tr><td>Monthly savings from storage deduplication</td><td>${dedup_savings:,.2f}</td></tr>
 410             </tbody></table>
 411             """.format(now=date.today(),
 412                        total_users=self.total_users,
 413                        total_projects=self.total_projects,
 414                        managed_data_now=format_with_suffix_base10(managed_data_now.y),
 415                        storage_used_now=format_with_suffix_base10(storage_used_now.y),
 416                        dedup_savings=dedup_savings,
 417                        storage_cost=storage_cost,
 418                        dedup_ratio=dedup_ratio,
 419                            workbench=workbench))
 420
 421         tophtml.append("""<h2>Activity and cost over the {reporting_days} day period {since} to {to}</h2>
 422         <table class='aggtable'><tbody>
 423         <tr><td>Active users</td><td>{active_users}</td></tr>
 424         <tr><td><a href="#Active_Projects">Active projects</a></td><td>{active_projects}</td></tr>
 425         <tr><td>Workflow runs</td><td>{total_workflows:,}</td></tr>
 426         <tr><td>Compute used</td><td>{total_hours:,.1f} hours</td></tr>
 427         <tr><td>Compute cost</td><td>${total_cost:,.2f}</td></tr>
 428         <tr><td>Storage cost</td><td>${storage_cost:,.2f}</td></tr>
 429         </tbody></table>
 430         """.format(active_users=len(self.active_users),
 431                    total_users=self.total_users,
 432                    total_hours=self.total_hours,
 433                    total_cost=self.total_cost,
 434                    total_workflows=self.total_workflows,
 435                    active_projects=len(self.project_summary),
 436                    since=since.date(), to=to.date(),
 437                    reporting_days=(to - since).days,
 438                    storage_cost=self.storage_cost))
 439
 440         bottomhtml = []
 441
 442         projectlist = sorted(self.project_summary.items(), key=lambda x: x[1].cost, reverse=True)
 443
 444         for k, prj in projectlist:
 445             if prj.earliest.date() == prj.latest.date():
 446                 prj.activityspan = "{}".format(prj.earliest.date())
 447             else:
 448                 prj.activityspan = "{} to {}".format(prj.earliest.date(), prj.latest.date())
 449
 450             prj.tablerow = """<td>{users}</td> <td>{active}</td> <td>{hours:,.1f}</td> <td>${cost:,.2f}</td>""".format(
 451                 name=prj.name,
 452                 active=prj.activityspan,
 453                 cost=prj.cost,
 454                 hours=prj.hours,
 455                 users=", ".join(prj.users),
 456             )
 457
 458         bottomhtml.append(
 459             """
 460             <style>{style}</style>
 461             <script>{script}</script>
 462             <a id="Active_Projects"><h2>Active Projects</h2></a>
 463             <table class='sortable'>
 464             <thead><tr><th>Project</th> <th>Users</th> <th>Active</th> <th>Compute usage (hours)</th> <th>Compute cost</th> </tr></thead>
 465             <tbody><tr>{projects}</tr></tbody>
 466             </table>
 467             """.format(projects="</tr>\n<tr>".join("""<td><a href="#{name}">{name}</a></td>{rest}""".format(name=prj.name, rest=prj.tablerow) for k, prj in projectlist),
 468                        script=sortablejs,
 469                        style=sortablecss))
 470
 471         for k, prj in projectlist:
 472             wfsum = []
 473             for k2, r in sorted(prj.runs.items(), key=lambda x: x[1].count, reverse=True):
 474                 wfsum.append("""
 475                 <tr><td>{count}</td> <td>{workflowlink}</td>  <td>{runtime}</td> <td>${cost:,.2f}</td> <td>${totalcost:,.2f}</td></tr>
 476                 """.format(
 477                     count=r.count,
 478                     runtime=hours_to_runtime_str(r.hours/r.count),
 479                     cost=r.cost/r.count,
 480                     totalcost=r.cost,
 481                     workflowlink="""<a href="{workbench}/workflows/{uuid}">{name}</a>""".format(workbench=workbench,uuid=r.uuid,name=r.name)
 482                     if r.uuid != "none" else r.name))
 483
 484                 # <table>
 485                 # <thead><tr><th>Users</th> <th>Active</th> <th>Compute usage</th> <th>Compute cost</th> </tr></thead>
 486                 # <tbody><tr>{projectrow}</tr></tbody>
 487                 # </table>
 488
 489
 490             bottomhtml.append(
 491                 """<a id="{name}"></a><a href="{workbench}/projects/{uuid}"><h2>{name}</h2></a>
 492
 493                 <table>
 494                 <tbody><tr>{projectrow}</tr></tbody>
 495                 </table>
 496
 497                 <table class='sortable'>
 498                 <thead><tr><th>Workflow run count</th> <th>Workflow name</th> <th>Mean runtime</th> <th>Mean cost per run</th> <th>Sum cost over runs</th></tr></thead>
 499                 <tbody>
 500                 {wfsum}
 501                 </tbody></table>
 502                 """.format(name=prj.name,
 503                            users=", ".join(prj.users),
 504                            cost=prj.cost,
 505                            hours=prj.hours,
 506                            wfsum=" ".join(wfsum),
 507                            earliest=prj.earliest.date(),
 508                            latest=prj.latest.date(),
 509                            activity=prj.activityspan,
 510                            userplural='s' if len(prj.users) > 1 else '',
 511                            projectrow=prj.tablerow,
 512                            workbench=workbench,
 513                            uuid=prj.uuid)
 514             )
 515
 516         return WEBCHART_CLASS(label, self.summarizers).html(tophtml, bottomhtml)
 517
 518     def flush_containers(self, pending, include_steps, exclude):
 519         containers = {}
 520
 521         for container in arvados.util.keyset_list_all(
 522             self.arv_client.containers().list,
 523             filters=[
 524                 ["uuid", "in", [c["container_uuid"] for c in pending if c["container_uuid"]]],
 525             ],
 526             select=["uuid", "started_at", "finished_at", "cost"]):
 527
 528             containers[container["uuid"]] = container
 529
 530         workflows = {}
 531         workflows["none"] = "workflow run from command line"
 532
 533         for wf in arvados.util.keyset_list_all(
 534                 self.arv_client.workflows().list,
 535                 filters=[
 536                     ["uuid", "in", list(set(c["properties"]["template_uuid"]
 537                                             for c in pending
 538                                             if "template_uuid" in c["properties"] and c["properties"]["template_uuid"].startswith(self.arv_client.config()["ClusterID"])))],
 539                 ],
 540                 select=["uuid", "name"]):
 541             workflows[wf["uuid"]] = wf["name"]
 542
 543         projects = {}
 544
 545         for pr in arvados.util.keyset_list_all(
 546                 self.arv_client.groups().list,
 547                 filters=[
 548                     ["uuid", "in", list(set(c["owner_uuid"] for c in pending if c["owner_uuid"][6:11] == 'j7d0g'))],
 549                 ],
 550                 select=["uuid", "name"]):
 551             projects[pr["uuid"]] = pr["name"]
 552
 553         for pr in arvados.util.keyset_list_all(
 554                 self.arv_client.users().list,
 555                 filters=[
 556                     ["uuid", "in", list(set(c["owner_uuid"] for c in pending if c["owner_uuid"][6:11] == 'tpzed')|set(c["modified_by_user_uuid"] for c in pending))],
 557                 ],
 558                 select=["uuid", "full_name", "first_name", "last_name"]):
 559             projects[pr["uuid"]] = pr["full_name"]
 560
 561         if include_steps:
 562             name_regex = re.compile(r"(.+)_[0-9]+")
 563             child_crs = {}
 564             child_cr_containers = set()
 565             stepcount = 0
 566
 567             logging.info("Getting workflow steps")
 568             for cr in arvados.util.keyset_list_all(
 569                 self.arv_client.container_requests().list,
 570                 filters=[
 571                     ["requesting_container_uuid", "in", list(containers.keys())],
 572                 ],
 573                 select=["uuid", "name", "cumulative_cost", "requesting_container_uuid", "container_uuid"]):
 574
 575                 if cr["cumulative_cost"] == 0:
 576                     continue
 577
 578                 g = name_regex.fullmatch(cr["name"])
 579                 if g:
 580                     cr["name"] = g[1]
 581
 582                 child_crs.setdefault(cr["requesting_container_uuid"], []).append(cr)
 583                 child_cr_containers.add(cr["container_uuid"])
 584                 if len(child_cr_containers) == 1000:
 585                     stepcount += len(child_cr_containers)
 586                     for container in arvados.util.keyset_list_all(
 587                             self.arv_client.containers().list,
 588                             filters=[
 589                                 ["uuid", "in", list(child_cr_containers)],
 590                             ],
 591                             select=["uuid", "started_at", "finished_at", "cost"]):
 592
 593                         containers[container["uuid"]] = container
 594
 595                     logging.info("Got workflow steps %s - %s", stepcount-len(child_cr_containers), stepcount)
 596                     child_cr_containers.clear()
 597
 598             if child_cr_containers:
 599                 stepcount += len(child_cr_containers)
 600                 for container in arvados.util.keyset_list_all(
 601                         self.arv_client.containers().list,
 602                         filters=[
 603                             ["uuid", "in", list(child_cr_containers)],
 604                         ],
 605                         select=["uuid", "started_at", "finished_at", "cost"]):
 606
 607                     containers[container["uuid"]] = container
 608                 logging.info("Got workflow steps %s - %s", stepcount-len(child_cr_containers), stepcount)
 609
 610         for container_request in pending:
 611             if not container_request["container_uuid"] or not containers[container_request["container_uuid"]]["started_at"] or not containers[container_request["container_uuid"]]["finished_at"]:
 612                 continue
 613
 614             template_uuid = container_request["properties"].get("template_uuid", "none")
 615             workflowname = container_request["name"] if template_uuid == "none" else workflows.get(template_uuid, "workflow missing")
 616
 617             if exclude and re.search(exclude, workflowname, flags=re.IGNORECASE):
 618                 continue
 619
 620             yield {
 621                 "Project": projects.get(container_request["owner_uuid"], "unknown owner"),
 622                 "ProjectUUID": container_request["owner_uuid"],
 623                 "Workflow": workflowname,
 624                 "WorkflowUUID": container_request["properties"].get("template_uuid", "none"),
 625                 "Step": "workflow runner",
 626                 "StepUUID": container_request["uuid"],
 627                 "Sample": container_request["name"],
 628                 "SampleUUID": container_request["uuid"],
 629                 "User": projects.get(container_request["modified_by_user_uuid"], "unknown user"),
 630                 "UserUUID": container_request["modified_by_user_uuid"],
 631                 "Submitted": csv_dateformat(container_request["created_at"]),
 632                 "Started": csv_dateformat(containers[container_request["container_uuid"]]["started_at"]),
 633                 "Finished": csv_dateformat(containers[container_request["container_uuid"]]["finished_at"]),
 634                 "Runtime": runtime_str(container_request, containers),
 635                 "Cost": round(containers[container_request["container_uuid"]]["cost"] if include_steps else container_request["cumulative_cost"], 3),
 636                 "CumulativeCost": round(container_request["cumulative_cost"], 3)
 637                 }
 638
 639             if include_steps:
 640                 for child_cr in child_crs.get(container_request["container_uuid"], []):
 641                     if not child_cr["container_uuid"] or not containers[child_cr["container_uuid"]]["started_at"] or not containers[child_cr["container_uuid"]]["finished_at"]:
 642                         continue
 643                     yield {
 644                         "Project": projects.get(container_request["owner_uuid"], "unknown owner"),
 645                         "ProjectUUID": container_request["owner_uuid"],
 646                         "Workflow": workflows.get(container_request["properties"].get("template_uuid", "none"), "workflow missing"),
 647                         "WorkflowUUID": container_request["properties"].get("template_uuid", "none"),
 648                         "Step": child_cr["name"],
 649                         "StepUUID": child_cr["uuid"],
 650                         "Sample": container_request["name"],
 651                         "SampleUUID": container_request["name"],
 652                         "User": projects.get(container_request["modified_by_user_uuid"], "unknown user"),
 653                         "UserUUID": container_request["modified_by_user_uuid"],
 654                         "Submitted": csv_dateformat(child_cr["created_at"]),
 655                         "Started": csv_dateformat(containers[child_cr["container_uuid"]]["started_at"]),
 656                         "Finished": csv_dateformat(containers[child_cr["container_uuid"]]["finished_at"]),
 657                         "Runtime": runtime_str(child_cr, containers),
 658                         "Cost": round(containers[child_cr["container_uuid"]]["cost"], 3),
 659                         "CumulativeCost": round(containers[child_cr["container_uuid"]]["cost"], 3),
 660                         }
 661
 662
 663     def collect_summary_stats(self, row):
 664         self.active_users.add(row["User"])
 665         self.project_summary.setdefault(row["ProjectUUID"],
 666                                         ProjectSummary(users=set(),
 667                                                        runs={},
 668                                                        uuid=row["ProjectUUID"],
 669                                                        name=row["Project"]))
 670         prj = self.project_summary[row["ProjectUUID"]]
 671         cost = row["Cost"]
 672         prj.cost += cost
 673         prj.count += 1
 674         prj.users.add(row["User"])
 675         hrs = runtime_in_hours(row["Runtime"])
 676         prj.hours += hrs
 677
 678         started = datetime.strptime(row["Started"], "%Y-%m-%d %H:%M:%S")
 679         finished = datetime.strptime(row["Finished"], "%Y-%m-%d %H:%M:%S")
 680
 681         if started < prj.earliest:
 682             prj.earliest = started
 683
 684         if finished > prj.latest:
 685             prj.latest = finished
 686
 687         if row["Step"] == "workflow runner":
 688             prj.runs.setdefault(row["Workflow"], WorkflowRunSummary(name=row["Workflow"],
 689                                                                     uuid=row["WorkflowUUID"]))
 690             wfuuid = row["Workflow"]
 691             prj.runs[wfuuid].count += 1
 692             prj.runs[wfuuid].cost += row["CumulativeCost"]
 693             prj.runs[wfuuid].hours += hrs
 694             self.total_workflows += 1
 695
 696         self.total_hours += hrs
 697         self.total_cost += cost
 698
 699     def report_from_api(self, since, to, include_steps, exclude):
 700         pending = []
 701
 702         count = 0
 703         for container_request in arvados.util.keyset_list_all(
 704                 self.arv_client.container_requests().list,
 705                 filters=[
 706                     ["command", "like", "[\"arvados-cwl-runner%"],
 707                     ["created_at", ">=", since.strftime("%Y%m%dT%H%M%SZ")],
 708                 ],
 709                 select=["uuid", "owner_uuid", "container_uuid", "name", "cumulative_cost", "properties", "modified_by_user_uuid", "created_at"]):
 710
 711             if container_request["cumulative_cost"] == 0:
 712                 continue
 713
 714             if len(pending) < 1000:
 715                 pending.append(container_request)
 716             else:
 717                 count += len(pending)
 718                 logging.info("Exporting workflow runs %s - %s", count-len(pending), count)
 719                 for row in self.flush_containers(pending, include_steps, exclude):
 720                     self.collect_summary_stats(row)
 721                     yield row
 722                 pending.clear()
 723
 724         count += len(pending)
 725         logging.info("Exporting workflow runs %s - %s", count-len(pending), count)
 726         for row in self.flush_containers(pending, include_steps, exclude):
 727             self.collect_summary_stats(row)
 728             yield row
 729
 730         userinfo = self.arv_client.users().list(filters=[["is_active", "=", True]], limit=0).execute()
 731         self.total_users = userinfo["items_available"]
 732
 733         groupinfo = self.arv_client.groups().list(filters=[["group_class", "=", "project"]], limit=0).execute()
 734         self.total_projects = groupinfo["items_available"]
 735
 736     def csv_report(self, since, to, out, include_steps, columns, exclude):
 737         if columns:
 738             columns = columns.split(",")
 739         else:
 740             if include_steps:
 741                 columns = ("Project", "Workflow", "Step", "Sample", "User", "Submitted", "Runtime", "Cost")
 742             else:
 743                 columns = ("Project", "Workflow", "Sample", "User", "Submitted", "Runtime", "Cost")
 744
 745         csvwriter = csv.DictWriter(out, fieldnames=columns, extrasaction="ignore")
 746         csvwriter.writeheader()
 747
 748         for row in self.report_from_api(since, to, include_steps, exclude):
 749             csvwriter.writerow(row)