19744: Add --enable/disable-usage-report
[arvados.git] / sdk / cwl / arvados_cwl / __init__.py
index 9cad9bd00d11eb96f50d2a65be5961de06e4d8c6..7e13488758b10f5ec9f2ac5a61ec31dfaa1ba4f8 100644 (file)
@@ -10,11 +10,12 @@ from future.utils import viewitems
 from builtins import str
 
 import argparse
+import importlib.metadata
+import importlib.resources
 import logging
 import os
 import sys
 import re
-import pkg_resources  # part of setuptools
 
 from schema_salad.sourceline import SourceLine
 import schema_salad.validate as validate
@@ -28,10 +29,10 @@ from cwltool.utils import adjustFileObjs, adjustDirObjs, get_listing
 
 import arvados
 import arvados.config
+import arvados.logging
 from arvados.keep import KeepClient
 from arvados.errors import ApiError
 import arvados.commands._util as arv_cmd
-from arvados.api import OrderedJsonModel
 
 from .perf import Perf
 from ._version import __version__
@@ -57,18 +58,18 @@ arvados.log_handler.setFormatter(logging.Formatter(
 
 def versionstring():
     """Print version string of key packages for provenance and debugging."""
-
-    arvcwlpkg = pkg_resources.require("arvados-cwl-runner")
-    arvpkg = pkg_resources.require("arvados-python-client")
-    cwlpkg = pkg_resources.require("cwltool")
-
-    return "%s %s, %s %s, %s %s" % (sys.argv[0], arvcwlpkg[0].version,
-                                    "arvados-python-client", arvpkg[0].version,
-                                    "cwltool", cwlpkg[0].version)
-
+    return "{} {}, arvados-python-client {}, cwltool {}".format(
+        sys.argv[0],
+        importlib.metadata.version('arvados-cwl-runner'),
+        importlib.metadata.version('arvados-python-client'),
+        importlib.metadata.version('cwltool'),
+    )
 
 def arg_parser():  # type: () -> argparse.ArgumentParser
-    parser = argparse.ArgumentParser(description='Arvados executor for Common Workflow Language')
+    parser = argparse.ArgumentParser(
+        description='Arvados executor for Common Workflow Language',
+        parents=[arv_cmd.retry_opt],
+    )
 
     parser.add_argument("--basedir",
                         help="Base directory used to resolve relative references in the input, default to directory of input object file or current directory (if inputs piped/provided on command line).")
@@ -120,6 +121,8 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
     exgroup.add_argument("--create-workflow", action="store_true", help="Register an Arvados workflow that can be run from Workbench")
     exgroup.add_argument("--update-workflow", metavar="UUID", help="Update an existing Arvados workflow with the given UUID.")
 
+    exgroup.add_argument("--print-keep-deps", action="store_true", help="To assist copying, print a list of Keep collections that this workflow depends on.")
+
     exgroup = parser.add_mutually_exclusive_group()
     exgroup.add_argument("--wait", action="store_true", help="After submitting workflow runner, wait for completion.",
                         default=True, dest="wait")
@@ -212,12 +215,25 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
                         action="store_true", default=False,
                         help=argparse.SUPPRESS)
 
+    parser.add_argument("--fast-parser", dest="fast_parser",
+                        action="store_true", default=False,
+                        help=argparse.SUPPRESS)
+
     parser.add_argument("--thread-count", type=int,
                         default=0, help="Number of threads to use for job submit and output collection.")
 
     parser.add_argument("--http-timeout", type=int,
                         default=5*60, dest="http_timeout", help="API request timeout in seconds. Default is 300 seconds (5 minutes).")
 
+    parser.add_argument("--defer-downloads", action="store_true", default=False,
+                        help="When submitting a workflow, defer downloading HTTP URLs to workflow launch instead of downloading to Keep before submit.")
+
+    parser.add_argument("--varying-url-params", type=str, default="",
+                        help="A comma separated list of URL query parameters that should be ignored when storing HTTP URLs in Keep.")
+
+    parser.add_argument("--prefer-cached-downloads", action="store_true", default=False,
+                        help="If a HTTP URL is found in Keep, skip upstream URL freshness check (will not notice if the upstream has changed, but also not error if upstream is unavailable).")
+
     exgroup = parser.add_mutually_exclusive_group()
     exgroup.add_argument("--enable-preemptible", dest="enable_preemptible", default=None, action="store_true", help="Use preemptible instances. Control individual steps with arv:UsePreemptible hint.")
     exgroup.add_argument("--disable-preemptible", dest="enable_preemptible", default=None, action="store_false", help="Don't use preemptible instances.")
@@ -242,6 +258,10 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
                         default=False, dest="trash_intermediate",
                         help="Do not trash intermediate outputs (default).")
 
+    exgroup = parser.add_mutually_exclusive_group()
+    exgroup.add_argument("--enable-usage-report", dest="enable_usage_report", default=None, action="store_true", help="Create usage_report.html with a summary of each step's resource usage.")
+    exgroup.add_argument("--disable-usage-report", dest="enable_usage_report", default=None, action="store_false", help="Disable usage report.")
+
     parser.add_argument("workflow", default=None, help="The workflow to execute")
     parser.add_argument("job_order", nargs=argparse.REMAINDER, help="The input object to the workflow.")
 
@@ -252,10 +272,8 @@ def add_arv_hints():
     cwltool.command_line_tool.ACCEPTLIST_RE = cwltool.command_line_tool.ACCEPTLIST_EN_RELAXED_RE
     supported_versions = ["v1.0", "v1.1", "v1.2"]
     for s in supported_versions:
-        res = pkg_resources.resource_stream(__name__, 'arv-cwl-schema-%s.yml' % s)
-        customschema = res.read().decode('utf-8')
+        customschema = importlib.resources.read_text(__name__, f'arv-cwl-schema-{s}.yml', 'utf-8')
         use_custom_schema(s, "http://arvados.org/cwl", customschema)
-        res.close()
     cwltool.process.supportedProcessRequirements.extend([
         "http://arvados.org/cwl#RunInSingleContainer",
         "http://arvados.org/cwl#OutputDirType",
@@ -271,6 +289,8 @@ def add_arv_hints():
         "http://commonwl.org/cwltool#CUDARequirement",
         "http://arvados.org/cwl#UsePreemptible",
         "http://arvados.org/cwl#OutputCollectionProperties",
+        "http://arvados.org/cwl#KeepCacheTypeRequirement",
+        "http://arvados.org/cwl#OutOfMemoryRetry",
     ])
 
 def exit_signal_handler(sigcode, frame):
@@ -306,7 +326,9 @@ def main(args=sys.argv[1:],
             return 1
         arvargs.work_api = want_api
 
-    if (arvargs.create_workflow or arvargs.update_workflow) and not arvargs.job_order:
+    workflow_op = arvargs.create_workflow or arvargs.update_workflow or arvargs.print_keep_deps
+
+    if workflow_op and not arvargs.job_order:
         job_order_object = ({}, "")
 
     add_arv_hints()
@@ -318,14 +340,32 @@ def main(args=sys.argv[1:],
     try:
         if api_client is None:
             api_client = arvados.safeapi.ThreadSafeApiCache(
-                api_params={"model": OrderedJsonModel(), "timeout": arvargs.http_timeout},
-                keep_params={"num_retries": 4})
+                api_params={
+                    'num_retries': arvargs.retries,
+                    'timeout': arvargs.http_timeout,
+                },
+                keep_params={
+                    'num_retries': arvargs.retries,
+                },
+                version='v1',
+            )
             keep_client = api_client.keep
             # Make an API object now so errors are reported early.
             api_client.users().current().execute()
         if keep_client is None:
-            keep_client = arvados.keep.KeepClient(api_client=api_client, num_retries=4)
-        executor = ArvCwlExecutor(api_client, arvargs, keep_client=keep_client, num_retries=4, stdout=stdout)
+            block_cache = arvados.keep.KeepBlockCache(disk_cache=True)
+            keep_client = arvados.keep.KeepClient(
+                api_client=api_client,
+                block_cache=block_cache,
+                num_retries=arvargs.retries,
+            )
+        executor = ArvCwlExecutor(
+            api_client,
+            arvargs,
+            keep_client=keep_client,
+            num_retries=arvargs.retries,
+            stdout=stdout,
+        )
     except WorkflowException as e:
         logger.error(e, exc_info=(sys.exc_info()[1] if arvargs.debug else False))
         return 1
@@ -335,9 +375,25 @@ def main(args=sys.argv[1:],
 
     # Note that unless in debug mode, some stack traces related to user
     # workflow errors may be suppressed.
+
+    # Set the logging on most modules INFO (instead of default which is WARNING)
+    logger.setLevel(logging.INFO)
+    logging.getLogger('arvados').setLevel(logging.INFO)
+    logging.getLogger('arvados.keep').setLevel(logging.WARNING)
+    # API retries are filtered to the INFO level and can be noisy, but as long as
+    # they succeed we don't need to see warnings about it.
+    googleapiclient_http_logger = logging.getLogger('googleapiclient.http')
+    googleapiclient_http_logger.addFilter(arvados.logging.GoogleHTTPClientFilter())
+    googleapiclient_http_logger.setLevel(logging.WARNING)
+
     if arvargs.debug:
         logger.setLevel(logging.DEBUG)
         logging.getLogger('arvados').setLevel(logging.DEBUG)
+        # In debug mode show logs about retries, but we arn't
+        # debugging the google client so we don't need to see
+        # everything.
+        googleapiclient_http_logger.setLevel(logging.NOTSET)
+        logging.getLogger('googleapiclient').setLevel(logging.INFO)
 
     if arvargs.quiet:
         logger.setLevel(logging.WARN)
@@ -364,9 +420,12 @@ def main(args=sys.argv[1:],
         # unit tests.
         stdout = None
 
-    if arvargs.submit and (arvargs.workflow.startswith("arvwf:") or workflow_uuid_pattern.match(arvargs.workflow)):
+    executor.loadingContext.default_docker_image = arvargs.submit_runner_image or "arvados/jobs:"+__version__
+
+    if arvargs.workflow.startswith("arvwf:") or workflow_uuid_pattern.match(arvargs.workflow) or arvargs.workflow.startswith("keep:"):
         executor.loadingContext.do_validate = False
-        executor.fast_submit = True
+        if arvargs.submit and not workflow_op:
+            executor.fast_submit = True
 
     return cwltool.main.main(args=arvargs,
                              stdout=stdout,
@@ -378,4 +437,4 @@ def main(args=sys.argv[1:],
                              custom_schema_callback=add_arv_hints,
                              loadingContext=executor.loadingContext,
                              runtimeContext=executor.toplevel_runtimeContext,
-                             input_required=not (arvargs.create_workflow or arvargs.update_workflow))
+                             input_required=not workflow_op)