import collections
import json
import logging
import re
from datetime import date, datetime, timedelta
-import pkg_resources
+from typing import Dict, List
+import statistics
from dataclasses import dataclass

import arvados
import arvados.util
-import crunchstat_summary.dygraphs
-from crunchstat_summary.summarizer import Task
-
from arvados_cluster_activity.prometheus import get_metric_usage, get_data_usage
+from arvados_cluster_activity.reportchart import ReportChart
+
@dataclass
class WorkflowRunSummary:
name: str
uuid: str
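+    # One entry per run, so the report can compute mean, median,
+    # and total over runs.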
+ cost: List[float]
+ hours: List[float]
count: int = 0
- cost: float = 0
- hours: float = 0
+
@dataclass
class ProjectSummary:
users: set
uuid: str
- runs: dict[str, WorkflowRunSummary]
+ runs: Dict[str, WorkflowRunSummary]
earliest: datetime = datetime(year=9999, month=1, day=1)
latest: datetime = datetime(year=1900, month=1, day=1)
name: str = ""
cost: float = 0
count: int = 0
hours: float = 0
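+    # Display strings filled in by html_report().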
+ activityspan: str = ""
+ tablerow: str = ""
-@dataclass
-class Summarizer:
- label: str
- tasks: collections.defaultdict[str, Task]
-
- def long_label(self):
- return self.label
-
-
-def date_export(item):
- if isinstance(item, datetime):
- return """@new Date("{}")@""".format(item.strftime("%Y-%m-%dT%H:%M:%SZ"))
def aws_monthly_cost(value):
    value_gb = value / (1024*1024*1024)
    first_50tb = min(1024*50, value_gb)
    next_450tb = max(min(1024*450, value_gb - 1024*50), 0)
    over_500tb = max(value_gb - 1024*500, 0)
    return first_50tb * 0.023 + next_450tb * 0.022 + over_500tb * 0.021

def format_with_suffix_base10(summary_value):
    for scale in ["KB", "MB", "GB", "TB", "PB", "EB"]:
        summary_value = summary_value / 1000
        if summary_value < 1000:
            return "%.3f %s" % (summary_value, scale)
-class ReportChart(crunchstat_summary.dygraphs.DygraphsChart):
- def sections(self):
- return [
- {
- 'label': s.long_label(),
- 'charts': [
- self.chartdata(s.label, s.tasks, stat)
- for stat in (('Concurrent running containers', ['containers']),
- ('Data under management', ['actual storage used']),
- )
- ],
- }
- for s in self.summarizers]
-
- def js(self):
- return 'var chartdata = {};\n{}'.format(
- json.dumps(self.sections(), default=date_export).replace('"@', '').replace('@"', '').replace('\\"', '"'),
- '\n'.join([pkg_resources.resource_string('crunchstat_summary', jsa).decode('utf-8') for jsa in self.JSASSETS]))
-
- def _collate_data(self, tasks, stats):
- data = []
- nulls = []
- # uuid is category for crunch2
- for uuid, task in tasks.items():
- # All stats in a category are assumed to have the same time base and same number of samples
- category = stats[0]
- series_names = stats[1]
- sn0 = series_names[0]
- series = task.series[(category,sn0)]
- for i in range(len(series)):
- pt = series[i]
- vals = [task.series[(category,stat)][i][1] for stat in series_names[1:]]
- data.append([pt[0]] + nulls + [pt[1]] + vals)
- nulls.append(None)
- return sorted(data)
-
-
-WEBCHART_CLASS = ReportChart
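+# Each graph is identified by a (label, tag) tuple, used as the key
+# into ClusterActivityReport.graphs and passed through to ReportChart.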
+containers_graph = ('Concurrent running containers', 'containers')
+storage_graph = ('Storage usage', 'used')
+managed_graph = ('Data under management', 'managed')
def runtime_str(container_request, containers):
class ClusterActivityReport(object):
- def __init__(self, prom_client, label=None, threads=1, **kwargs):
- self.threadcount = threads
+ def __init__(self, prom_client):
self.arv_client = arvados.api()
self.prom_client = prom_client
self.cluster = self.arv_client.config()["ClusterID"]
self.total_hours = 0
self.total_cost = 0
self.total_workflows = 0
- self.summarizers = []
self.storage_cost = 0
+ self.summary_fetched = False
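+ # Maps the graph key tuples defined at top level to flat
+ # [[timestamp, value], ...] series collected from Prometheus.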
+ self.graphs = {}
- def run(self):
- pass
-
- def collect_graph(self, s1, since, to, taskname, legend, metric, resampleTo, extra=None):
+ def collect_graph(self, since, to, metric, resample_to, extra=None):
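+ # Returns the metric as a flat [[timestamp, value], ...] list
+ # resampled to resample_to; if given, extra(timestamp, value)
+ # is called on each sample.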
if not self.prom_client:
return
- task = s1.tasks[taskname]
+ flatdata = []
- for series in get_metric_usage(self.prom_client, since, to, metric % self.cluster, resampleTo=resampleTo):
+ for series in get_metric_usage(self.prom_client, since, to, metric % self.cluster, resampleTo=resample_to):
for t in series.itertuples():
- task.series[taskname, legend].append(t)
+ flatdata.append([t[0], t[1]])
if extra:
- extra(t)
+ extra(t[0], t[1])
+
+ return flatdata
- def collect_storage_cost(self, t):
- self.storage_cost += aws_monthly_cost(t.y) / (30*24)
+ def collect_storage_cost(self, timestamp, value):
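+ # Each sample is an instantaneous storage level; estimate its
+ # hourly cost by dividing the estimated monthly cost by
+ # 30 days * 24 hours.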
+ self.storage_cost += aws_monthly_cost(value) / (30*24)
- def html_report(self, since, to, exclude):
+ def html_report(self, since, to, exclude, include_workflow_steps):
+ """Get a cluster activity report for the desired time period,
+ returning a string containing the report as an HTML document."""
self.label = "Cluster report for %s from %s to %s" % (self.cluster, since.date(), to.date())
- for row in self.report_from_api(since, to, True, exclude):
- pass
+ if not self.summary_fetched:
+ # If we haven't done it already, we need to fetch everything
+ # from the API to collect summary stats (report_from_api
+ # calls collect_summary_stats on each row).
+ #
+ # Because report_from_api is a Python generator, we need to
+ # call it in a loop to process all the rows. It also yields
+ # each row for use by the CSV report writer, but for the
+ # HTML report we just discard them.
+ for row in self.report_from_api(since, to, include_workflow_steps, exclude):
+ pass
+
+ container_cumulative_hours = 0
+ def collect_container_hours(timestamp, value):
+ nonlocal container_cumulative_hours
+ # The series is resampled to 5-minute increments, i.e. 12
+ # samples per hour, so divide each sample by 12 to get hours.
+ container_cumulative_hours += value / 12
logging.info("Getting container hours time series")
- s1 = Summarizer(label="", tasks=collections.defaultdict(Task))
- self.collect_graph(s1, since, to, "Concurrent running containers", "containers",
- "arvados_dispatchcloud_containers_running{cluster='%s'}", resampleTo="5min")
+
+ self.graphs[containers_graph] = self.collect_graph(since, to,
+ "arvados_dispatchcloud_containers_running{cluster='%s'}",
+ resample_to="5min",
+ extra=collect_container_hours
+ )
logging.info("Getting data usage time series")
- s2 = Summarizer(label="", tasks=collections.defaultdict(Task))
- self.collect_graph(s2, since, to, "Data under management", "managed",
- "arvados_keep_collection_bytes{cluster='%s'}", resampleTo="60min")
+ self.graphs[managed_graph] = self.collect_graph(since, to,
+ "arvados_keep_collection_bytes{cluster='%s'}", resample_to="60min")
- self.collect_graph(s2, since, to, "Data under management", "actual storage used",
- "arvados_keep_total_bytes{cluster='%s'}", resampleTo="60min", extra=self.collect_storage_cost)
+ self.graphs[storage_graph] = self.collect_graph(since, to,
+ "arvados_keep_total_bytes{cluster='%s'}", resample_to="60min",
+ extra=self.collect_storage_cost)
- managed_data_now = s2.tasks["Data under management"].series["Data under management","managed"][-1]
- storage_used_now = s2.tasks["Data under management"].series["Data under management","actual storage used"][-1]
+ managed_data_now = None
+ storage_used_now = None
- storage_cost = aws_monthly_cost(storage_used_now.y)
- dedup_ratio = managed_data_now.y/storage_used_now.y
- dedup_savings = aws_monthly_cost(managed_data_now.y) - storage_cost
+ if len(self.graphs.get(managed_graph, [])) > 0:
+ managed_data_now = self.graphs[managed_graph][-1][1]
- self.summarizers = [s1, s2]
+ if len(self.graphs.get(storage_graph, [])) > 0:
+ storage_used_now = self.graphs[storage_graph][-1][1]
- tophtml = ""
- bottomhtml = ""
- label = self.label
+ if managed_data_now and storage_used_now:
+ storage_cost = aws_monthly_cost(storage_used_now)
+ dedup_ratio = managed_data_now/storage_used_now
- tophtml = []
- if to.date() == date.today():
- tophtml.append("""<h2>Cluster status as of {now}</h2>
+ label = self.label
+
+ cards = []
+
+ workbench = self.arv_client.config()["Services"]["Workbench2"]["ExternalURL"]
+ if workbench.endswith("/"):
+ workbench = workbench[:-1]
+
+ if to.date() == self.today():
+ # The deduplication ratio overstates things a bit: a
+ # collection can reference a small slice of a large block,
+ # which skews the intuitive value of the ratio and
+ # exaggerates the effect.
+ #
+ # So for now, as much fun as this is, I'm excluding it from
+ # the report.
+ #
+ # dedup_savings = aws_monthly_cost(managed_data_now) - storage_cost
+ # <tr><th>Monthly savings from storage deduplication</th> <td>${dedup_savings:,.2f}</td></tr>
+
+ data_rows = ""
+ if managed_data_now and storage_used_now:
+ data_rows = """
+ <tr><th>Total data under management</th> <td>{managed_data_now}</td></tr>
+ <tr><th>Total storage usage</th> <td>{storage_used_now}</td></tr>
+ <tr><th>Deduplication ratio</th> <td>{dedup_ratio:.1f}</td></tr>
+ <tr><th>Approximate monthly storage cost</th> <td>${storage_cost:,.2f}</td></tr>
+ """.format(
+ managed_data_now=format_with_suffix_base10(managed_data_now),
+ storage_used_now=format_with_suffix_base10(storage_used_now),
+ storage_cost=storage_cost,
+ dedup_ratio=dedup_ratio,
+ )
+
+ cards.append("""<h2>Cluster status as of {now}</h2>
<table class='aggtable'><tbody>
- <tr><td>Total users</td><td>{total_users}</td></tr>
- <tr><td>Total projects</td><td>{total_projects}</td></tr>
- <tr><td>Total data under management</td><td>{managed_data_now}</td></tr>
- <tr><td>Total storage usage</td><td>{storage_used_now}</td></tr>
- <tr><td>Deduplication ratio</td><td>{dedup_ratio}</td></tr>
- <tr><td>Approximate monthly storage cost</td><td>${storage_cost}</td></tr>
- <tr><td>Monthly savings from storage deduplication</td><td>${dedup_savings}</td></tr>
+ <tr><th><a href="{workbench}/users">Total users</a></th><td>{total_users}</td></tr>
+ <tr><th>Total projects</th><td>{total_projects}</td></tr>
+ {data_rows}
</tbody></table>
- """.format(now=date.today(),
+ <p>See <a href="#prices">note on usage and cost calculations</a> for details on how costs are calculated.</p>
+ """.format(now=self.today(),
total_users=self.total_users,
total_projects=self.total_projects,
- managed_data_now=format_with_suffix_base10(managed_data_now.y),
- storage_used_now=format_with_suffix_base10(storage_used_now.y),
- dedup_savings=round(dedup_savings, 2),
- storage_cost=round(storage_cost, 2),
- dedup_ratio=round(dedup_ratio, 3)))
-
- tophtml.append("""<h2>Activity and cost over the {reporting_days} day period {since} to {to}</h2>
+ workbench=workbench,
+ data_rows=data_rows))
+
+ # We have a couple of options for getting total container hours
+ #
+ # total_hours=container_cumulative_hours
+ #
+ # calculates the sum from prometheus metrics
+ #
+ # total_hours=self.total_hours
+ #
+ # calculates the sum of the containers that were fetched
+ #
+ # The problem is that these numbers tend not to match,
+ # especially if the report was not generated with "include
+ # workflow steps" enabled.
+ #
+ # I decided to use the sum from containers fetched, because it
+ # will match the sum of compute time for each project listed
+ # in the report.
+
+ cards.append("""<h2>Activity and cost over the {reporting_days} day period {since} to {to}</h2>
<table class='aggtable'><tbody>
- <tr><td>Active users</td><td>{active_users}</td></tr>
- <tr><td>Active projects</td><td>{active_projects}</td></tr>
- <tr><td>Workflow runs</td><td>{total_workflows}</td></tr>
- <tr><td>Compute used</td><td>{total_hours} hours</td></tr>
- <tr><td>Compute cost</td><td>${total_cost}</td></tr>
- <tr><td>Storage cost</td><td>${storage_cost}</td></tr>
+ <tr><th>Active users</th> <td>{active_users}</td></tr>
+ <tr><th><a href="#Active_Projects">Active projects</a></th> <td>{active_projects}</td></tr>
+ <tr><th>Workflow runs</th> <td>{total_workflows:,}</td></tr>
+ <tr><th>Compute used</th> <td>{total_hours:,.1f} hours</td></tr>
+ <tr><th>Compute cost</th> <td>${total_cost:,.2f}</td></tr>
+ <tr><th>Storage cost</th> <td>${storage_cost:,.2f}</td></tr>
</tbody></table>
+ <p>See <a href="#prices">note on usage and cost calculations</a> for details on how costs are calculated.</p>
""".format(active_users=len(self.active_users),
total_users=self.total_users,
- total_hours=round(self.total_hours, 1),
- total_cost=round(self.total_cost, 2),
+ total_hours=self.total_hours,
+ total_cost=self.total_cost,
total_workflows=self.total_workflows,
active_projects=len(self.project_summary),
since=since.date(), to=to.date(),
reporting_days=(to - since).days,
- storage_cost=round(self.storage_cost, 2)))
+ storage_cost=self.storage_cost))
- bottomhtml = []
-
- for k, prj in sorted(self.project_summary.items(), key=lambda x: x[1].cost, reverse=True):
- wfsum = []
- for k2, r in sorted(prj.runs.items(), key=lambda x: x[1].count, reverse=True):
- wfsum.append( """
- <tr><td>{count}</td> <td>{name}</td> <td>{runtime}</td> <td>${cost}</td></tr>
- """.format(name=r.name, count=r.count, runtime=hours_to_runtime_str(r.hours/r.count), cost=round(r.cost/r.count, 2)))
+ projectlist = sorted(self.project_summary.items(), key=lambda x: x[1].cost, reverse=True)
+ for k, prj in projectlist:
if prj.earliest.date() == prj.latest.date():
- activityspan = "{}".format(prj.earliest.date())
+ prj.activityspan = "{}".format(prj.earliest.date())
else:
- activityspan = "{} to {}".format(prj.earliest.date(), prj.latest.date())
-
- bottomhtml.append(
- """<h2>{name}</h2>
- <table class='aggtable'><tbody>
- <tr><td>User{userplural}</td><td>{users}</td></tr>
- <tr><td>Active</td><td>{activity}</td></tr>
- <tr><td>Compute usage</td><td>{hours} hours</td></tr>
- <tr><td>Compute cost</td><td>${cost}</td></tr>
- </tbody></table>
- <table class='aggtable'><tbody>
- <tr><th>Workflow run count</th> <th>Workflow name</th> <th>Avg runtime</th> <th>Avg cost per run</th></tr>
+ prj.activityspan = "{} to {}".format(prj.earliest.date(), prj.latest.date())
+
+ prj.tablerow = """<td>{users}</td> <td>{active}</td> <td>{hours:,.1f}</td> <td>${cost:,.2f}</td>""".format(
+ active=prj.activityspan,
+ cost=prj.cost,
+ hours=prj.hours,
+ users=", ".join(prj.users),
+ )
+
+ cards.append("""
+ <div id="chart"></div>
+ """)
+
+ cards.append(
+ """
+ <a id="Active_Projects"><h2>Active Projects</h2></a>
+ <table class='sortable active-projects'>
+ <thead><tr><th>Project</th> <th>Users</th> <th>Active</th> <th>Compute usage (hours)</th> <th>Compute cost</th> </tr></thead>
+ <tbody><tr>{projects}</tr></tbody>
+ </table>
+ <p>See <a href="#prices">note on usage and cost calculations</a> for details on how costs are calculated.</p>
+ """.format(projects="</tr>\n<tr>".join("""<td><a href="#{name}">{name}</a></td>{rest}""".format(name=prj.name, rest=prj.tablerow) for k, prj in projectlist)))
+
+ for k, prj in projectlist:
+ wfsum = []
+ for k2, r in sorted(prj.runs.items(), key=lambda x: x[1].count, reverse=True):
+ wfsum.append("""
+ <tr><td>{count}</td> <td>{workflowlink}</td> <td>{median_runtime}</td> <td>{mean_runtime}</td> <td>${median_cost:,.2f}</td> <td>${mean_cost:,.2f}</td> <td>${totalcost:,.2f}</td></tr>
+ """.format(
+ count=r.count,
+ mean_runtime=hours_to_runtime_str(statistics.mean(r.hours)),
+ median_runtime=hours_to_runtime_str(statistics.median(r.hours)),
+ mean_cost=statistics.mean(r.cost),
+ median_cost=statistics.median(r.cost),
+ totalcost=sum(r.cost),
+ workflowlink="""<a href="{workbench}/workflows/{uuid}">{name}</a>""".format(workbench=workbench,uuid=r.uuid,name=r.name)
+ if r.uuid != "none" else r.name))
+
+ cards.append(
+ """<a id="{name}"></a><a href="{workbench}/projects/{uuid}"><h2>{name}</h2></a>
+
+ <table class='sortable single-project'>
+ <thead><tr> <th>Users</th> <th>Active</th> <th>Compute usage (hours)</th> <th>Compute cost</th> </tr></thead>
+ <tbody><tr>{projectrow}</tr></tbody>
+ </table>
+
+ <table class='sortable project'>
+ <thead><tr><th>Workflow run count</th> <th>Workflow name</th> <th>Median runtime</th> <th>Mean runtime</th> <th>Median cost per run</th> <th>Mean cost per run</th> <th>Sum cost over runs</th></tr></thead>
+ <tbody>
{wfsum}
</tbody></table>
""".format(name=prj.name,
- users=", ".join(prj.users),
- cost=round(prj.cost, 2),
- hours=round(prj.hours, 1),
- wfsum="</p><p>".join(wfsum),
- earliest=prj.earliest.date(),
- latest=prj.latest.date(),
- activity=activityspan,
- userplural='s' if len(prj.users) > 1 else '')
+ wfsum=" ".join(wfsum),
+ projectrow=prj.tablerow,
+ workbench=workbench,
+ uuid=prj.uuid)
)
- return WEBCHART_CLASS(label, self.summarizers).html(tophtml, bottomhtml)
-
- def flush_containers(self, pending, include_steps, exclude):
+ # The deduplication ratio overstates things a bit: a
+ # collection can reference a small slice of a large block,
+ # which skews the intuitive value of the ratio and
+ # exaggerates the effect.
+ #
+ # So for now, as much fun as this is, I'm excluding it from
+ # the report.
+ #
+ # <p>"Monthly savings from storage deduplication" is the
+ # estimated cost difference between "storage usage" and "data
+ # under management" as a way of comparing with other
+ # technologies that do not support data deduplication.</p>
+
+
+ cards.append("""
+ <h2 id="prices">Note on usage and cost calculations</h2>
+
+ <div style="max-width: 60em">
+
+ <p>The numbers presented in this report are estimates and will
+ not perfectly match your cloud bill. Nevertheless this report
+ should be useful for identifying your main cost drivers.</p>
+
+ <h3>Storage</h3>
+
+ <p>"Total data under management" is what you get if you add up
+ all blocks referenced by all collections in Workbench, without
+ considering deduplication.</p>
+
+ <p>"Total storage usage" is the actual underlying storage
+ usage, accounting for data deduplication.</p>
+
+ <p>Storage costs are based on the AWS "S3 Standard" rates
+ described on the <a href="https://aws.amazon.com/s3/pricing/">Amazon S3 pricing</a> page:</p>
+
+ <ul>
+ <li>$0.023 per GB / Month for the first 50 TB</li>
+ <li>$0.022 per GB / Month for the next 450 TB</li>
+ <li>$0.021 per GB / Month over 500 TB</li>
+ </ul>
+
+ <p>Finally, this is only the base storage cost, and does not
+ include any fees associated with S3 API usage. However, there
+ are generally no ingress/egress fees if your Arvados instance
+ and S3 bucket are in the same region, which is the normal
+ recommended configuration.</p>
+
+ <h3>Compute</h3>
+
+ <p>"Compute usage" are instance-hours used in running
+ workflows. Because multiple steps may run in parallel on
+ multiple instances, a workflow that completes in four hours
+ but runs parallel steps on five instances, would be reported
+ as using 20 instance hours.</p>
+
+ <p>"Runtime" is the actual wall clock time that it took to
+ complete a workflow. This does not include time spent in the
+ queue for the workflow itself, but does include queuing time
+ of individual workflow steps.</p>
+
+ <p>Computational costs are derived from Arvados cost
+ calculations of container runs. For on-demand instances, this
+ uses the prices from the InstanceTypes section of the Arvados
+ config file, set by the system administrator. For spot
+ instances, this uses current spot prices retrieved on the fly
+ from the AWS API.</p>
+
+ <p>Be aware that the cost calculations only cover the time the
+ container is running, and do not take into account the
+ overhead of launching instances or idle time between scheduled
+ tasks or prior to automatic shutdown.</p>
+
+ </div>
+ """)
+
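+ # Render the prose cards plus the collected Prometheus time
+ # series into a single HTML document.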
+ return ReportChart(label, cards, self.graphs).html()
+
+ def iter_container_info(self, pending, include_steps, exclude):
+ # "pending" is a list of arvados-cwl-runner container requests
+ # returned by the API. This method fetches detailed
+ # information about the runs and yields report rows.
+
+ # 1. Get container records corresponding to container requests.
containers = {}
for container in arvados.util.keyset_list_all(
containers[container["uuid"]] = container
+ # 2. Look for the template_uuid property and fetch the
+ # corresponding workflow record.
workflows = {}
workflows["none"] = "workflow run from command line"
select=["uuid", "name"]):
workflows[wf["uuid"]] = wf["name"]
+ # 3. Look at owner_uuid and fetch owning projects and users
projects = {}
for pr in arvados.util.keyset_list_all(
select=["uuid", "name"]):
projects[pr["uuid"]] = pr["name"]
+ # 4. Look at owner_uuid and modified_by_user_uuid and get user records
for pr in arvados.util.keyset_list_all(
self.arv_client.users().list,
filters=[
select=["uuid", "full_name", "first_name", "last_name"]):
projects[pr["uuid"]] = pr["full_name"]
+ # 5. Optionally iterate over individual workflow steps.
if include_steps:
name_regex = re.compile(r"(.+)_[0-9]+")
child_crs = {}
child_cr_containers = set()
stepcount = 0
+ # 5.1. Go through the container requests owned by the top-level workflow container
logging.info("Getting workflow steps")
for cr in arvados.util.keyset_list_all(
self.arv_client.container_requests().list,
if g:
cr["name"] = g[1]
+ # 5.2. Get the containers corresponding to the container
+ # requests, batched 1000 at a time using the same logic
+ # as report_from_api.
child_crs.setdefault(cr["requesting_container_uuid"], []).append(cr)
child_cr_containers.add(cr["container_uuid"])
if len(child_cr_containers) == 1000:
logging.info("Got workflow steps %s - %s", stepcount-len(child_cr_containers), stepcount)
child_cr_containers.clear()
+ # Get any remaining containers
if child_cr_containers:
stepcount += len(child_cr_containers)
for container in arvados.util.keyset_list_all(
containers[container["uuid"]] = container
logging.info("Got workflow steps %s - %s", stepcount-len(child_cr_containers), stepcount)
+ # 6. Now go through the list of workflow runs, yield a row
+ # with all the information we have collected, as well as the
+ # details for each workflow step (if enabled)
for container_request in pending:
if not container_request["container_uuid"] or not containers[container_request["container_uuid"]]["started_at"] or not containers[container_request["container_uuid"]]["finished_at"]:
continue
template_uuid = container_request["properties"].get("template_uuid", "none")
- workflowname = container_request["name"] if template_uuid == "none" else workflows.get(template_uuid, "workflow missing")
+ workflowname = container_request["name"] if template_uuid == "none" else workflows.get(template_uuid, template_uuid)
if exclude and re.search(exclude, workflowname, flags=re.IGNORECASE):
continue
if row["Step"] == "workflow runner":
prj.runs.setdefault(row["Workflow"], WorkflowRunSummary(name=row["Workflow"],
- uuid=row["WorkflowUUID"]))
+ uuid=row["WorkflowUUID"],
+ cost=[], hours=[]))
wfuuid = row["Workflow"]
prj.runs[wfuuid].count += 1
- prj.runs[wfuuid].cost += row["CumulativeCost"]
- prj.runs[wfuuid].hours += hrs
+ prj.runs[wfuuid].cost.append(row["CumulativeCost"])
+ prj.runs[wfuuid].hours.append(hrs)
self.total_workflows += 1
self.total_hours += hrs
filters=[
["command", "like", "[\"arvados-cwl-runner%"],
["created_at", ">=", since.strftime("%Y%m%dT%H%M%SZ")],
+ ["created_at", "<=", to.strftime("%Y%m%dT%H%M%SZ")],
],
select=["uuid", "owner_uuid", "container_uuid", "name", "cumulative_cost", "properties", "modified_by_user_uuid", "created_at"]):
if container_request["cumulative_cost"] == 0:
continue
+ # Every 1000 container requests, we fetch the
+ # corresponding container records.
+ #
+ # What's so special about 1000? Because that's the
+ # maximum Arvados page size, so when we use ['uuid', 'in',
+ # [...]] to fetch associated records it doesn't make sense
+ # to provide more than 1000 uuids.
+ #
+ # TODO: use the ?include=container_uuid feature so a
+ # separate request to the containers table isn't necessary.
if len(pending) < 1000:
pending.append(container_request)
else:
count += len(pending)
logging.info("Exporting workflow runs %s - %s", count-len(pending), count)
- for row in self.flush_containers(pending, include_steps, exclude):
+ for row in self.iter_container_info(pending, include_steps, exclude):
self.collect_summary_stats(row)
yield row
pending.clear()
count += len(pending)
logging.info("Exporting workflow runs %s - %s", count-len(pending), count)
- for row in self.flush_containers(pending, include_steps, exclude):
+ for row in self.iter_container_info(pending, include_steps, exclude):
self.collect_summary_stats(row)
yield row
for row in self.report_from_api(since, to, include_steps, exclude):
csvwriter.writerow(row)
+
+ self.summary_fetched = True
+
+ def today(self):
+ return date.today()
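+
+# Minimal usage sketch (hypothetical; assumes a Prometheus client set up
+# as in arvados_cluster_activity.prometheus):
+#
+#   report = ClusterActivityReport(prom_client)
+#   html = report.html_report(since=datetime(2024, 1, 1),
+#                             to=datetime(2024, 2, 1),
+#                             exclude=None, include_workflow_steps=False)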