22273: Remove built-in type subscript
diff --git a/tools/cluster-activity/arvados_cluster_activity/report.py b/tools/cluster-activity/arvados_cluster_activity/report.py
index 13e4df83c4fdda41de88721fadbc2ab4e526b592..5737a2e20e9b36f998c77a80fdc62730458e159e 100644
@@ -12,47 +12,37 @@ import math
 import collections
 import json
 from datetime import date, datetime, timedelta
-import pkg_resources
+from typing import Dict, List
+import statistics
 
 from dataclasses import dataclass
-
-import crunchstat_summary.dygraphs
-from crunchstat_summary.summarizer import Task
-
 from arvados_cluster_activity.prometheus import get_metric_usage, get_data_usage
+from arvados_cluster_activity.reportchart import ReportChart
+
 
 @dataclass
 class WorkflowRunSummary:
     name: str
     uuid: str
+    cost: List[float]
+    hours: List[float]
     count: int = 0
-    cost: float = 0
-    hours: float = 0
+
 
 @dataclass
 class ProjectSummary:
     users: set
     uuid: str
-    runs: dict[str, WorkflowRunSummary]
+    runs: Dict[str, WorkflowRunSummary]
     earliest: datetime = datetime(year=9999, month=1, day=1)
     latest: datetime = datetime(year=1900, month=1, day=1)
     name: str = ""
     cost: float = 0
     count: int = 0
     hours: float = 0
+    activityspan: str = ""
+    tablerow: str = ""
 
-@dataclass
-class Summarizer:
-    label: str
-    tasks: collections.defaultdict[str, Task]
-
-    def long_label(self):
-        return self.label
-
-
-def date_export(item):
-    if isinstance(item, datetime):
-        return """@new Date("{}")@""".format(item.strftime("%Y-%m-%dT%H:%M:%SZ"))
 
 def aws_monthly_cost(value):
     value_gb = value / (1024*1024*1024)
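The hunk cuts off the rest of `aws_monthly_cost`, but the pricing note later in this file spells out the tier schedule it implements. A minimal sketch of that tiered calculation, reconstructed from the quoted prices (the actual function body is not shown in this diff):

```python
# Sketch of tiered AWS "S3 Standard" monthly cost, using the per-GB
# prices quoted in the report's pricing note; not the actual function body.
def aws_monthly_cost_sketch(value_bytes):
    value_gb = value_bytes / (1024*1024*1024)
    first_50tb = min(value_gb, 50*1024)                     # $0.023/GB for the first 50 TB
    next_450tb = min(max(value_gb - 50*1024, 0), 450*1024)  # $0.022/GB for the next 450 TB
    over_500tb = max(value_gb - 500*1024, 0)                # $0.021/GB beyond 500 TB
    return first_50tb*0.023 + next_450tb*0.022 + over_500tb*0.021
```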
@@ -76,44 +66,9 @@ def format_with_suffix_base10(summary_value):
         if summary_value < 1000:
             return "%.3f %s" % (summary_value, scale)
 
-class ReportChart(crunchstat_summary.dygraphs.DygraphsChart):
-    def sections(self):
-        return [
-            {
-                'label': s.long_label(),
-                'charts': [
-                    self.chartdata(s.label, s.tasks, stat)
-                    for stat in (('Concurrent running containers', ['containers']),
-                                 ('Data under management', ['actual storage used']),
-                                 )
-                    ],
-            }
-            for s in self.summarizers]
-
-    def js(self):
-        return 'var chartdata = {};\n{}'.format(
-            json.dumps(self.sections(), default=date_export).replace('"@', '').replace('@"', '').replace('\\"', '"'),
-            '\n'.join([pkg_resources.resource_string('crunchstat_summary', jsa).decode('utf-8') for jsa in self.JSASSETS]))
-
-    def _collate_data(self, tasks, stats):
-        data = []
-        nulls = []
-        # uuid is category for crunch2
-        for uuid, task in tasks.items():
-            # All stats in a category are assumed to have the same time base and same number of samples
-            category = stats[0]
-            series_names = stats[1]
-            sn0 = series_names[0]
-            series = task.series[(category,sn0)]
-            for i in range(len(series)):
-                pt = series[i]
-                vals = [task.series[(category,stat)][i][1] for stat in series_names[1:]]
-                data.append([pt[0]] + nulls + [pt[1]] + vals)
-            nulls.append(None)
-        return sorted(data)
-
-
-WEBCHART_CLASS = ReportChart
+containers_graph = ('Concurrent running containers', 'containers')
+storage_graph = ('Storage usage', 'used')
+managed_graph = ('Data under management', 'managed')
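These module-level tuples act as keys into the report's `self.graphs` dict, pairing a chart title with a short series key. A hedged sketch of the shape `collect_graph` stores under each key and hands to `ReportChart` (the exact `ReportChart` interface lives in `arvados_cluster_activity.reportchart` and is assumed here):

```python
from datetime import datetime

containers_graph = ('Concurrent running containers', 'containers')
managed_graph = ('Data under management', 'managed')

# Illustrative only: each graph key maps to a flat list of
# [timestamp, value] samples produced by collect_graph().
graphs = {
    containers_graph: [                       # 5-minute resampling
        [datetime(2024, 5, 1, 0, 0), 12.0],
        [datetime(2024, 5, 1, 0, 5), 14.0],
    ],
    managed_graph: [                          # 60-minute resampling
        [datetime(2024, 5, 1, 0, 0), 3.2e13],
    ],
}
```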
 
 
 def runtime_str(container_request, containers):
@@ -146,8 +101,7 @@ def csv_dateformat(datestr):
 
 
 class ClusterActivityReport(object):
-    def __init__(self, prom_client, label=None, threads=1, **kwargs):
-        self.threadcount = threads
+    def __init__(self, prom_client):
         self.arv_client = arvados.api()
         self.prom_client = prom_client
         self.cluster = self.arv_client.config()["ClusterID"]
@@ -157,141 +111,315 @@ class ClusterActivityReport(object):
         self.total_hours = 0
         self.total_cost = 0
         self.total_workflows = 0
-        self.summarizers = []
         self.storage_cost = 0
+        self.summary_fetched = False
+        self.graphs = {}
 
-    def run(self):
-        pass
-
-    def collect_graph(self, s1, since, to, taskname, legend, metric, resampleTo, extra=None):
+    def collect_graph(self, since, to, metric, resample_to, extra=None):
         if not self.prom_client:
             return
 
-        task = s1.tasks[taskname]
+        flatdata = []
 
-        for series in get_metric_usage(self.prom_client, since, to, metric % self.cluster, resampleTo=resampleTo):
+        for series in get_metric_usage(self.prom_client, since, to, metric % self.cluster, resampleTo=resample_to):
             for t in series.itertuples():
-                task.series[taskname, legend].append(t)
+                flatdata.append([t[0], t[1]])
                 if extra:
-                    extra(t)
+                    extra(t[0], t[1])
+
+        return flatdata
 
-    def collect_storage_cost(self, t):
-        self.storage_cost += aws_monthly_cost(t.y) / (30*24)
+    def collect_storage_cost(self, timestamp, value):
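+        # This callback is used with 60-minute resampling below, so each
+        # sample accrues one hour's share, 1/(30*24), of the monthly cost.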
+        self.storage_cost += aws_monthly_cost(value) / (30*24)
 
-    def html_report(self, since, to, exclude):
+    def html_report(self, since, to, exclude, include_workflow_steps):
+        """Get a cluster activity report for the desired time period,
+        returning a string containing the report as an HTML document."""
 
         self.label = "Cluster report for %s from %s to %s" % (self.cluster, since.date(), to.date())
 
-        for row in self.report_from_api(since, to, True, exclude):
-            pass
+        if not self.summary_fetched:
+            # If we haven't done it already, we need to fetch everything
+            # from the API to collect summary stats (report_from_api
+            # calls collect_summary_stats on each row).
+            #
+            # Because it is a Python generator, we need to call it in a
+            # loop to process all the rows.  This method also yields
+            # each row, which is used by a different function to create
+            # the CSV report, but for the HTML report we just discard
+            # them.
+            for row in self.report_from_api(since, to, include_workflow_steps, exclude):
+                pass
+
+        container_cumulative_hours = 0
+        def collect_container_hours(timestamp, value):
+            nonlocal container_cumulative_hours
+            # The metric is resampled to 5-minute increments, but we
+            # want a sum of hours, so each sample counts as 1/12 hour.
+            container_cumulative_hours += value / 12
 
         logging.info("Getting container hours time series")
-        s1 = Summarizer(label="", tasks=collections.defaultdict(Task))
-        self.collect_graph(s1, since, to, "Concurrent running containers", "containers",
-                           "arvados_dispatchcloud_containers_running{cluster='%s'}", resampleTo="5min")
+
+        self.graphs[containers_graph] = self.collect_graph(since, to,
+                           "arvados_dispatchcloud_containers_running{cluster='%s'}",
+                           resample_to="5min",
+                           extra=collect_container_hours
+                           )
 
         logging.info("Getting data usage time series")
-        s2 = Summarizer(label="", tasks=collections.defaultdict(Task))
-        self.collect_graph(s2, since, to, "Data under management", "managed",
-                           "arvados_keep_collection_bytes{cluster='%s'}", resampleTo="60min")
+        self.graphs[managed_graph] = self.collect_graph(since, to,
+                           "arvados_keep_collection_bytes{cluster='%s'}", resample_to="60min")
 
-        self.collect_graph(s2, since, to, "Data under management", "actual storage used",
-                           "arvados_keep_total_bytes{cluster='%s'}", resampleTo="60min", extra=self.collect_storage_cost)
+        self.graphs[storage_graph] = self.collect_graph(since, to,
+                           "arvados_keep_total_bytes{cluster='%s'}", resample_to="60min",
+                                                        extra=self.collect_storage_cost)
 
-        managed_data_now = s2.tasks["Data under management"].series["Data under management","managed"][-1]
-        storage_used_now = s2.tasks["Data under management"].series["Data under management","actual storage used"][-1]
+        managed_data_now = None
+        storage_used_now = None
 
-        storage_cost = aws_monthly_cost(storage_used_now.y)
-        dedup_ratio = managed_data_now.y/storage_used_now.y
-        dedup_savings = aws_monthly_cost(managed_data_now.y) - storage_cost
+        if len(self.graphs.get(managed_graph, [])) > 0:
+            managed_data_now = self.graphs[managed_graph][-1][1]
 
-        self.summarizers = [s1, s2]
+        if len(self.graphs.get(storage_graph, [])) > 0:
+            storage_used_now = self.graphs[storage_graph][-1][1]
 
-        tophtml = ""
-        bottomhtml = ""
-        label = self.label
+        if managed_data_now and storage_used_now:
+            storage_cost = aws_monthly_cost(storage_used_now)
+            dedup_ratio = managed_data_now/storage_used_now
 
-        tophtml = []
 
-        if to.date() == date.today():
-            tophtml.append("""<h2>Cluster status as of {now}</h2>
+        label = self.label
+
+        cards = []
+
+        workbench = self.arv_client.config()["Services"]["Workbench2"]["ExternalURL"]
+        if workbench.endswith("/"):
+            workbench = workbench[:-1]
+
+        if to.date() == self.today():
+            # The deduplication ratio overstates things a bit: you can
+            # have collections which reference a small slice of a large
+            # block, and this messes up the intuitive value of this ratio
+            # and exaggerates the effect.
+            #
+            # So for now, as much fun as this is, I'm excluding it from
+            # the report.
+            #
+            # dedup_savings = aws_monthly_cost(managed_data_now) - storage_cost
+            # <tr><th>Monthly savings from storage deduplication</th> <td>${dedup_savings:,.2f}</td></tr>
+
+            data_rows = ""
+            if managed_data_now and storage_used_now:
+                data_rows = """
+            <tr><th>Total data under management</th> <td>{managed_data_now}</td></tr>
+            <tr><th>Total storage usage</th> <td>{storage_used_now}</td></tr>
+            <tr><th>Deduplication ratio</th> <td>{dedup_ratio:.1f}</td></tr>
+            <tr><th>Approximate monthly storage cost</th> <td>${storage_cost:,.2f}</td></tr>
+                """.format(
+                       managed_data_now=format_with_suffix_base10(managed_data_now),
+                       storage_used_now=format_with_suffix_base10(storage_used_now),
+                       storage_cost=storage_cost,
+                       dedup_ratio=dedup_ratio,
+                )
+
+            cards.append("""<h2>Cluster status as of {now}</h2>
             <table class='aggtable'><tbody>
-            <tr><td>Total users</td><td>{total_users}</td></tr>
-            <tr><td>Total projects</td><td>{total_projects}</td></tr>
-            <tr><td>Total data under management</td><td>{managed_data_now}</td></tr>
-            <tr><td>Total storage usage</td><td>{storage_used_now}</td></tr>
-            <tr><td>Deduplication ratio</td><td>{dedup_ratio}</td></tr>
-            <tr><td>Approximate monthly storage cost</td><td>${storage_cost}</td></tr>
-            <tr><td>Monthly savings from storage deduplication</td><td>${dedup_savings}</td></tr>
+            <tr><th><a href="{workbench}/users">Total users</a></th><td>{total_users}</td></tr>
+            <tr><th>Total projects</th><td>{total_projects}</td></tr>
+            {data_rows}
             </tbody></table>
-            """.format(now=date.today(),
+            <p>See <a href="#prices">note on usage and cost calculations</a> for details on how costs are calculated.</p>
+            """.format(now=self.today(),
                        total_users=self.total_users,
                        total_projects=self.total_projects,
-                       managed_data_now=format_with_suffix_base10(managed_data_now.y),
-                       storage_used_now=format_with_suffix_base10(storage_used_now.y),
-                       dedup_savings=round(dedup_savings, 2),
-                       storage_cost=round(storage_cost, 2),
-                       dedup_ratio=round(dedup_ratio, 3)))
-
-        tophtml.append("""<h2>Activity and cost over the {reporting_days} day period {since} to {to}</h2>
+                       workbench=workbench,
+                       data_rows=data_rows))
+
+        # We have a couple of options for getting total container hours
+        #
+        # total_hours=container_cumulative_hours
+        #
+        # calculates the sum from prometheus metrics
+        #
+        # total_hours=self.total_hours
+        #
+        # calculates the sum of the containers that were fetched
+        #
+        # The problem is that these numbers tend not to match,
+        # especially if the report generation was not called with
+        # "include workflow steps".
+        #
+        # I decided to use the sum from the containers fetched,
+        # because it will match the sum of compute time for each
+        # project listed in the report.
+
+        cards.append("""<h2>Activity and cost over the {reporting_days} day period {since} to {to}</h2>
         <table class='aggtable'><tbody>
-        <tr><td>Active users</td><td>{active_users}</td></tr>
-        <tr><td>Active projects</td><td>{active_projects}</td></tr>
-        <tr><td>Workflow runs</td><td>{total_workflows}</td></tr>
-        <tr><td>Compute used</td><td>{total_hours} hours</td></tr>
-        <tr><td>Compute cost</td><td>${total_cost}</td></tr>
-        <tr><td>Storage cost</td><td>${storage_cost}</td></tr>
+        <tr><th>Active users</th> <td>{active_users}</td></tr>
+        <tr><th><a href="#Active_Projects">Active projects</a></th> <td>{active_projects}</td></tr>
+        <tr><th>Workflow runs</th> <td>{total_workflows:,}</td></tr>
+        <tr><th>Compute used</th> <td>{total_hours:,.1f} hours</td></tr>
+        <tr><th>Compute cost</th> <td>${total_cost:,.2f}</td></tr>
+        <tr><th>Storage cost</th> <td>${storage_cost:,.2f}</td></tr>
         </tbody></table>
+        <p>See <a href="#prices">note on usage and cost calculations</a> for details on how costs are calculated.</p>
         """.format(active_users=len(self.active_users),
                    total_users=self.total_users,
-                   total_hours=round(self.total_hours, 1),
-                   total_cost=round(self.total_cost, 2),
+                   total_hours=self.total_hours,
+                   total_cost=self.total_cost,
                    total_workflows=self.total_workflows,
                    active_projects=len(self.project_summary),
                    since=since.date(), to=to.date(),
                    reporting_days=(to - since).days,
-                   storage_cost=round(self.storage_cost, 2)))
+                   storage_cost=self.storage_cost))
 
-        bottomhtml = []
-
-        for k, prj in sorted(self.project_summary.items(), key=lambda x: x[1].cost, reverse=True):
-            wfsum = []
-            for k2, r in sorted(prj.runs.items(), key=lambda x: x[1].count, reverse=True):
-                wfsum.append( """
-                <tr><td>{count}</td> <td>{name}</td>  <td>{runtime}</td> <td>${cost}</td></tr>
-                """.format(name=r.name, count=r.count, runtime=hours_to_runtime_str(r.hours/r.count), cost=round(r.cost/r.count, 2)))
+        projectlist = sorted(self.project_summary.items(), key=lambda x: x[1].cost, reverse=True)
 
+        for k, prj in projectlist:
             if prj.earliest.date() == prj.latest.date():
-                activityspan = "{}".format(prj.earliest.date())
+                prj.activityspan = "{}".format(prj.earliest.date())
             else:
-                activityspan = "{} to {}".format(prj.earliest.date(), prj.latest.date())
-
-            bottomhtml.append(
-                """<h2>{name}</h2>
-                <table class='aggtable'><tbody>
-                <tr><td>User{userplural}</td><td>{users}</td></tr>
-                <tr><td>Active</td><td>{activity}</td></tr>
-                <tr><td>Compute usage</td><td>{hours} hours</td></tr>
-                <tr><td>Compute cost</td><td>${cost}</td></tr>
-                </tbody></table>
-                <table class='aggtable'><tbody>
-                <tr><th>Workflow run count</th> <th>Workflow name</th> <th>Avg runtime</th> <th>Avg cost per run</th></tr>
+                prj.activityspan = "{} to {}".format(prj.earliest.date(), prj.latest.date())
+
+            prj.tablerow = """<td>{users}</td> <td>{active}</td> <td>{hours:,.1f}</td> <td>${cost:,.2f}</td>""".format(
+                active=prj.activityspan,
+                cost=prj.cost,
+                hours=prj.hours,
+                users=", ".join(prj.users),
+            )
+
+        cards.append("""
+                <div id="chart"></div>
+            """)
+
+        cards.append(
+            """
+            <a id="Active_Projects"><h2>Active Projects</h2></a>
+            <table class='sortable active-projects'>
+            <thead><tr><th>Project</th> <th>Users</th> <th>Active</th> <th>Compute usage (hours)</th> <th>Compute cost</th> </tr></thead>
+            <tbody><tr>{projects}</tr></tbody>
+            </table>
+            <p>See <a href="#prices">note on usage and cost calculations</a> for details on how costs are calculated.</p>
+            """.format(projects="</tr>\n<tr>".join("""<td><a href="#{name}">{name}</a></td>{rest}""".format(name=prj.name, rest=prj.tablerow) for k, prj in projectlist)))
+
+        for k, prj in projectlist:
+            wfsum = []
+            for k2, r in sorted(prj.runs.items(), key=lambda x: x[1].count, reverse=True):
+                wfsum.append("""
+                <tr><td>{count}</td> <td>{workflowlink}</td> <td>{median_runtime}</td> <td>{mean_runtime}</td> <td>${median_cost:,.2f}</td> <td>${mean_cost:,.2f}</td> <td>${totalcost:,.2f}</td></tr>
+                """.format(
+                    count=r.count,
+                    mean_runtime=hours_to_runtime_str(statistics.mean(r.hours)),
+                    median_runtime=hours_to_runtime_str(statistics.median(r.hours)),
+                    mean_cost=statistics.mean(r.cost),
+                    median_cost=statistics.median(r.cost),
+                    totalcost=sum(r.cost),
+                    workflowlink="""<a href="{workbench}/workflows/{uuid}">{name}</a>""".format(workbench=workbench,uuid=r.uuid,name=r.name)
+                    if r.uuid != "none" else r.name))
+
+            cards.append(
+                """<a id="{name}"></a><a href="{workbench}/projects/{uuid}"><h2>{name}</h2></a>
+
+                <table class='sortable single-project'>
+                <thead><tr> <th>Users</th> <th>Active</th> <th>Compute usage (hours)</th> <th>Compute cost</th> </tr></thead>
+                <tbody><tr>{projectrow}</tr></tbody>
+                </table>
+
+                <table class='sortable project'>
+                <thead><tr><th>Workflow run count</th> <th>Workflow name</th> <th>Median runtime</th> <th>Mean runtime</th> <th>Median cost per run</th> <th>Mean cost per run</th> <th>Sum cost over runs</th></tr></thead>
+                <tbody>
                 {wfsum}
                 </tbody></table>
                 """.format(name=prj.name,
-                           users=", ".join(prj.users),
-                           cost=round(prj.cost, 2),
-                           hours=round(prj.hours, 1),
-                           wfsum="</p><p>".join(wfsum),
-                           earliest=prj.earliest.date(),
-                           latest=prj.latest.date(),
-                           activity=activityspan,
-                           userplural='s' if len(prj.users) > 1 else '')
+                           wfsum=" ".join(wfsum),
+                           projectrow=prj.tablerow,
+                           workbench=workbench,
+                           uuid=prj.uuid)
             )
 
-        return WEBCHART_CLASS(label, self.summarizers).html(tophtml, bottomhtml)
-
-    def flush_containers(self, pending, include_steps, exclude):
+        # The deduplication ratio overstates things a bit: you can
+        # have collections which reference a small slice of a large
+        # block, and this messes up the intuitive value of this ratio
+        # and exaggerates the effect.
+        #
+        # So for now, as much fun as this is, I'm excluding it from
+        # the report.
+        #
+        # <p>"Monthly savings from storage deduplication" is the
+        # estimated cost difference between "storage usage" and "data
+        # under management" as a way of comparing with other
+        # technologies that do not support data deduplication.</p>
+
+
+        cards.append("""
+        <h2 id="prices">Note on usage and cost calculations</h2>
+
+        <div style="max-width: 60em">
+
+        <p>The numbers presented in this report are estimates and will
+        not perfectly match your cloud bill.  Nevertheless this report
+        should be useful for identifying your main cost drivers.</p>
+
+        <h3>Storage</h3>
+
+        <p>"Total data under management" is what you get if you add up
+        all blocks referenced by all collections in Workbench, without
+        considering deduplication.</p>
+
+        <p>"Total storage usage" is the actual underlying storage
+        usage, accounting for data deduplication.</p>
+
+        <p>Storage costs are based on AWS "S3 Standard"
+        described on the <a href="https://aws.amazon.com/s3/pricing/">Amazon S3 pricing</a> page:</p>
+
+        <ul>
+        <li>$0.023 per GB / Month for the first 50 TB</li>
+        <li>$0.022 per GB / Month for the next 450 TB</li>
+        <li>$0.021 per GB / Month over 500 TB</li>
+        </ul>
+
+        <p>Finally, this is only the base storage cost, and does not
+        include any fees associated with S3 API usage.  However, there
+        are generally no ingress/egress fees if your Arvados instance
+        and S3 bucket are in the same region, which is the normal
+        recommended configuration.</p>
+
+        <h3>Compute</h3>
+
+        <p>"Compute usage" are instance-hours used in running
+        workflows.  Because multiple steps may run in parallel on
+        multiple instances, a workflow that completes in four hours
+        but runs parallel steps on five instances, would be reported
+        as using 20 instance hours.</p>
+
+        <p>"Runtime" is the actual wall clock time that it took to
+        complete a workflow.  This does not include time spent in the
+        queue for the workflow itself, but does include queuing time
+        of individual workflow steps.</p>
+
+        <p>Computational costs are derived from Arvados cost
+        calculations of container runs.  For on-demand instances, this
+        uses the prices from the InstanceTypes section of the Arvados
+        config file, set by the system administrator.  For spot
+        instances, this uses current spot prices retrieved on the fly
+        from the AWS API.</p>
+
+        <p>Be aware that the cost calculations cover only the time
+        the container is running, and do not take into account the
+        overhead of launching instances or idle time between scheduled
+        tasks or prior to automatic shutdown.</p>
+
+        </div>
+        """)
+
+        return ReportChart(label, cards, self.graphs).html()
+
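For reference, a minimal driver for the rewritten entry point, assuming a Prometheus client has already been constructed (its setup lives in `arvados_cluster_activity.prometheus` and is outside this diff):

```python
from datetime import datetime, timedelta

# Hypothetical usage sketch; prom_client construction is assumed.
report = ClusterActivityReport(prom_client)
to = datetime.utcnow()
since = to - timedelta(days=30)
html = report.html_report(since, to, exclude=None,
                          include_workflow_steps=False)
with open("cluster-activity.html", "w") as f:
    f.write(html)
```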
+    def iter_container_info(self, pending, include_steps, exclude):
+        # "pending" is a list of arvados-cwl-runner container requests
+        # returned by the API.  This method fetches detailed
+        # information about the runs and yields report rows.
+
+        # 1. Get container records corresponding to container requests.
         containers = {}
 
         for container in arvados.util.keyset_list_all(
@@ -303,6 +431,8 @@ class ClusterActivityReport(object):
 
             containers[container["uuid"]] = container
 
+        # 2. Look for the template_uuid property and fetch the
+        # corresponding workflow record.
         workflows = {}
         workflows["none"] = "workflow run from command line"
 
@@ -316,6 +446,7 @@ class ClusterActivityReport(object):
                 select=["uuid", "name"]):
             workflows[wf["uuid"]] = wf["name"]
 
+        # 3. Look at owner_uuid and fetch owning projects and users
         projects = {}
 
         for pr in arvados.util.keyset_list_all(
@@ -326,6 +457,7 @@ class ClusterActivityReport(object):
                 select=["uuid", "name"]):
             projects[pr["uuid"]] = pr["name"]
 
+        # 4. Look at owner_uuid and modified_by_user_uuid and get user records
         for pr in arvados.util.keyset_list_all(
                 self.arv_client.users().list,
                 filters=[
@@ -334,12 +466,14 @@ class ClusterActivityReport(object):
                 select=["uuid", "full_name", "first_name", "last_name"]):
             projects[pr["uuid"]] = pr["full_name"]
 
+        # 5. Optionally iterate over individual workflow steps.
         if include_steps:
             name_regex = re.compile(r"(.+)_[0-9]+")
             child_crs = {}
             child_cr_containers = set()
             stepcount = 0
 
+            # 5.1. Go through the container requests owned by the top-level workflow container
             logging.info("Getting workflow steps")
             for cr in arvados.util.keyset_list_all(
                 self.arv_client.container_requests().list,
@@ -355,6 +489,10 @@ class ClusterActivityReport(object):
                 if g:
                     cr["name"] = g[1]
 
+                # 5.2. Get the containers corresponding to the
+                # container requests.  This has the same logic as
+                # report_from_api where we batch it into 1000 items at
+                # a time.
                 child_crs.setdefault(cr["requesting_container_uuid"], []).append(cr)
                 child_cr_containers.add(cr["container_uuid"])
                 if len(child_cr_containers) == 1000:
@@ -371,6 +509,7 @@ class ClusterActivityReport(object):
                     logging.info("Got workflow steps %s - %s", stepcount-len(child_cr_containers), stepcount)
                     child_cr_containers.clear()
 
+            # Get any remaining containers
             if child_cr_containers:
                 stepcount += len(child_cr_containers)
                 for container in arvados.util.keyset_list_all(
@@ -383,12 +522,15 @@ class ClusterActivityReport(object):
                     containers[container["uuid"]] = container
                 logging.info("Got workflow steps %s - %s", stepcount-len(child_cr_containers), stepcount)
 
+        # 6. Now go through the list of workflow runs, yield a row
+        # with all the information we have collected, as well as the
+        # details for each workflow step (if enabled)
         for container_request in pending:
             if not container_request["container_uuid"] or not containers[container_request["container_uuid"]]["started_at"] or not containers[container_request["container_uuid"]]["finished_at"]:
                 continue
 
             template_uuid = container_request["properties"].get("template_uuid", "none")
-            workflowname = container_request["name"] if template_uuid == "none" else workflows.get(template_uuid, "workflow missing")
+            workflowname = container_request["name"] if template_uuid == "none" else workflows.get(template_uuid, template_uuid)
 
             if exclude and re.search(exclude, workflowname, flags=re.IGNORECASE):
                 continue
@@ -462,11 +604,12 @@ class ClusterActivityReport(object):
 
         if row["Step"] == "workflow runner":
             prj.runs.setdefault(row["Workflow"], WorkflowRunSummary(name=row["Workflow"],
-                                                                    uuid=row["WorkflowUUID"]))
+                                                                    uuid=row["WorkflowUUID"],
+                                                                    cost=[], hours=[]))
             wfuuid = row["Workflow"]
             prj.runs[wfuuid].count += 1
-            prj.runs[wfuuid].cost += row["CumulativeCost"]
-            prj.runs[wfuuid].hours += hrs
+            prj.runs[wfuuid].cost.append(row["CumulativeCost"])
+            prj.runs[wfuuid].hours.append(hrs)
             self.total_workflows += 1
 
         self.total_hours += hrs
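With `cost` and `hours` now per-run lists rather than scalars, the per-workflow table can report medians alongside means. A small self-contained sketch of that aggregation, with made-up numbers:

```python
import statistics

# Hypothetical summary with three recorded runs of one workflow.
wf = WorkflowRunSummary(name="demo-wf", uuid="none", cost=[], hours=[])
for c, h in [(5.0, 2.0), (6.0, 2.5), (40.0, 9.0)]:
    wf.count += 1
    wf.cost.append(c)
    wf.hours.append(h)

print(statistics.mean(wf.cost))    # 17.0, skewed by the one expensive run
print(statistics.median(wf.cost))  # 6.0, robust to the outlier
print(sum(wf.cost))                # 51.0, the "Sum cost over runs" column
```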
@@ -481,25 +624,36 @@ class ClusterActivityReport(object):
                 filters=[
                     ["command", "like", "[\"arvados-cwl-runner%"],
                     ["created_at", ">=", since.strftime("%Y%m%dT%H%M%SZ")],
+                    ["created_at", "<=", to.strftime("%Y%m%dT%H%M%SZ")],
                 ],
                 select=["uuid", "owner_uuid", "container_uuid", "name", "cumulative_cost", "properties", "modified_by_user_uuid", "created_at"]):
 
             if container_request["cumulative_cost"] == 0:
                 continue
 
+            # Every 1000 container requests, we fetch the
+            # corresponding container records.
+            #
+            # What's so special about 1000?  It's the maximum Arvados
+            # page size, so when we use ['uuid', 'in', [...]] to fetch
+            # associated records it doesn't make sense to provide more
+            # than 1000 uuids.
+            #
+            # TODO: use the ?include=container_uuid feature so a
+            # separate request to the containers table isn't necessary.
             if len(pending) < 1000:
                 pending.append(container_request)
             else:
                 count += len(pending)
                 logging.info("Exporting workflow runs %s - %s", count-len(pending), count)
-                for row in self.flush_containers(pending, include_steps, exclude):
+                for row in self.iter_container_info(pending, include_steps, exclude):
                     self.collect_summary_stats(row)
                     yield row
                 pending.clear()
 
         count += len(pending)
         logging.info("Exporting workflow runs %s - %s", count-len(pending), count)
-        for row in self.flush_containers(pending, include_steps, exclude):
+        for row in self.iter_container_info(pending, include_steps, exclude):
             self.collect_summary_stats(row)
             yield row
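The batch-of-1000 pattern described in the comments above is generic enough to sketch on its own; names like `parent_records` and `handle` below are illustrative, not from this module:

```python
import arvados
import arvados.util

def fetch_children(arv_client, parent_records, handle):
    # Accumulate parents until we have a full Arvados page (1000) of
    # child uuids, then fetch them with one ['uuid', 'in', ...] filter.
    pending = []
    def flush():
        uuids = [p["container_uuid"] for p in pending]
        for child in arvados.util.keyset_list_all(
                arv_client.containers().list,
                filters=[["uuid", "in", uuids]]):
            handle(child)
        pending.clear()
    for rec in parent_records:
        pending.append(rec)
        if len(pending) == 1000:
            flush()
    if pending:
        flush()
```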
 
@@ -523,3 +677,8 @@ class ClusterActivityReport(object):
 
         for row in self.report_from_api(since, to, include_steps, exclude):
             csvwriter.writerow(row)
+
+        self.summary_fetched = True
+
+    def today(self):
+        return date.today()
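One plausible reading of the new `today()` indirection (used by the "Cluster status as of" card) is that it makes the current date overridable, e.g. for deterministic tests; that intent is an assumption, sketched here:

```python
from datetime import date

class FixedDateReport(ClusterActivityReport):
    """Hypothetical test helper that pins 'today' so the
    'Cluster status as of ...' card renders deterministically."""
    def today(self):
        return date(2024, 5, 1)
```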