20825: Fix git_info

[arvados.git] / sdk / cwl / arvados_cwl / arvcontainer.py
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py

index b04fb190e963e965524679bb4473767820e8b849..487ab70909a95b5f3849474fae360935e6a2794d 100644 (file)
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -15,9 +15,10 @@ import datetime
  import ciso8601
  import uuid
  import math
+import re
  
  import arvados_cwl.util
-import ruamel.yaml as yaml
+import ruamel.yaml
  
  from cwltool.errors import WorkflowException
  from cwltool.process import UnsupportedRequirement, shortname
@@ -37,6 +38,9 @@ from ._version import __version__
  logger = logging.getLogger('arvados.cwl-runner')
  metrics = logging.getLogger('arvados.cwl-runner.metrics')
  
+def cleanup_name_for_collection(name):  # Return `name` with every "/" replaced by a single space.
+    return name.replace("/", " ")  # NOTE(review): presumably because "/" is not acceptable in a collection name — confirm
+
  class ArvadosContainer(JobBase):
      """Submit and manage a Crunch container request for executing a CWL CommandLineTool."""
  
@@ -53,6 +57,7 @@ class ArvadosContainer(JobBase):
          self.job_runtime = job_runtime
          self.running = False
          self.uuid = None
+        self.attempt_count = 0
  
      def update_pipeline_component(self, r):
          pass
@@ -85,9 +90,11 @@ class ArvadosContainer(JobBase):
          container_request["output_path"] = self.outdir
          container_request["cwd"] = self.outdir
          container_request["priority"] = runtimeContext.priority
-        container_request["state"] = "Committed"
+        container_request["state"] = "Uncommitted"
          container_request.setdefault("properties", {})
  
+        container_request["properties"]["cwl_input"] = self.joborder
+
          runtime_constraints = {}
  
          if runtimeContext.project_uuid:
@@ -146,6 +153,8 @@ class ArvadosContainer(JobBase):
                      mounts[targetdir]["path"] = path
              prevdir = targetdir + "/"
  
+        intermediate_collection_info = arvados_cwl.util.get_intermediate_collection_info(self.name, runtimeContext.current_container, runtimeContext.intermediate_output_ttl)
+
          with Perf(metrics, "generatefiles %s" % self.name):
              if self.generatefiles["listing"]:
                  vwd = arvados.collection.Collection(api_client=self.arvrunner.api,
@@ -197,12 +206,11 @@ class ArvadosContainer(JobBase):
  
                  if not runtimeContext.current_container:
                      runtimeContext.current_container = arvados_cwl.util.get_current_container(self.arvrunner.api, self.arvrunner.num_retries, logger)
-                info = arvados_cwl.util.get_intermediate_collection_info(self.name, runtimeContext.current_container, runtimeContext.intermediate_output_ttl)
-                vwd.save_new(name=info["name"],
+                vwd.save_new(name=intermediate_collection_info["name"],
                               owner_uuid=runtimeContext.project_uuid,
                               ensure_unique_name=True,
-                             trash_at=info["trash_at"],
-                             properties=info["properties"])
+                             trash_at=intermediate_collection_info["trash_at"],
+                             properties=intermediate_collection_info["properties"])
  
                  prev = None
                  for f, p in sorteditems:
@@ -244,10 +252,7 @@ class ArvadosContainer(JobBase):
          container_request["container_image"] = arv_docker_get_image(self.arvrunner.api,
                                                                      docker_req,
                                                                      runtimeContext.pull_image,
-                                                                    runtimeContext.project_uuid,
-                                                                    runtimeContext.force_docker_pull,
-                                                                    runtimeContext.tmp_outdir_prefix,
-                                                                    runtimeContext.match_local_docker)
+                                                                    runtimeContext)
  
          network_req, _ = self.get_requirement("NetworkAccess")
          if network_req:
@@ -257,10 +262,22 @@ class ArvadosContainer(JobBase):
          if api_req:
              runtime_constraints["API"] = True
  
+        use_disk_cache = (self.arvrunner.api.config()["Containers"].get("DefaultKeepCacheRAM", 0) == 0)
+
+        keep_cache_type_req, _ = self.get_requirement("http://arvados.org/cwl#KeepCacheTypeRequirement")
+        if keep_cache_type_req:
+            if "keepCacheType" in keep_cache_type_req:
+                if keep_cache_type_req["keepCacheType"] == "ram_cache":
+                    use_disk_cache = False
+
          runtime_req, _ = self.get_requirement("http://arvados.org/cwl#RuntimeConstraints")
          if runtime_req:
              if "keep_cache" in runtime_req:
-                runtime_constraints["keep_cache_ram"] = math.ceil(runtime_req["keep_cache"] * 2**20)
+                if use_disk_cache:
+                    # If DefaultKeepCacheRAM is zero it means we should use disk cache.
+                    runtime_constraints["keep_cache_disk"] = math.ceil(runtime_req["keep_cache"] * 2**20)
+                else:
+                    runtime_constraints["keep_cache_ram"] = math.ceil(runtime_req["keep_cache"] * 2**20)
              if "outputDirType" in runtime_req:
                  if runtime_req["outputDirType"] == "local_output_dir":
                      # Currently the default behavior.
@@ -318,7 +335,7 @@ class ArvadosContainer(JobBase):
          if runtimeContext.submit_runner_cluster:
              extra_submit_params["cluster_id"] = runtimeContext.submit_runner_cluster
  
-        container_request["output_name"] = "Output for step %s" % (self.name)
+        container_request["output_name"] = cleanup_name_for_collection("Output from step %s" % (self.name))
          container_request["output_ttl"] = self.output_ttl
          container_request["mounts"] = mounts
          container_request["secret_mounts"] = secret_mounts
@@ -340,6 +357,22 @@ class ArvadosContainer(JobBase):
              for pr in properties_req["processProperties"]:
                  container_request["properties"][pr["propertyName"]] = self.builder.do_eval(pr["propertyValue"])
  
+        output_properties_req, _ = self.get_requirement("http://arvados.org/cwl#OutputCollectionProperties")
+        if output_properties_req:
+            if self.arvrunner.api._rootDesc["revision"] >= "20220510":
+                container_request["output_properties"] = {}
+                for pr in output_properties_req["outputProperties"]:
+                    container_request["output_properties"][pr["propertyName"]] = self.builder.do_eval(pr["propertyValue"])
+            else:
+                logger.warning("%s API revision is %s, revision %s is required to support setting properties on output collections.",
+                               self.arvrunner.label(self), self.arvrunner.api._rootDesc["revision"], "20220510")
+
+        ram_multiplier = [1]
+
+        oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+        if oom_retry_req and oom_retry_req.get('memoryRetryMultipler'):
+            ram_multiplier.append(oom_retry_req.get('memoryRetryMultipler'))
+
          if runtimeContext.runnerjob.startswith("arvwf:"):
              wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")]
              wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries)
@@ -347,23 +380,45 @@ class ArvadosContainer(JobBase):
                  container_request["name"] = wfrecord["name"]
              container_request["properties"]["template_uuid"] = wfuuid
  
-        self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
+        if self.attempt_count == 0:
+            self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
  
          try:
-            if runtimeContext.submit_request_uuid:
-                response = self.arvrunner.api.container_requests().update(
-                    uuid=runtimeContext.submit_request_uuid,
-                    body=container_request,
-                    **extra_submit_params
-                ).execute(num_retries=self.arvrunner.num_retries)
-            else:
-                response = self.arvrunner.api.container_requests().create(
-                    body=container_request,
-                    **extra_submit_params
-                ).execute(num_retries=self.arvrunner.num_retries)
+            ram = runtime_constraints["ram"]
+
+            self.uuid = runtimeContext.submit_request_uuid
+
+            for i in ram_multiplier:
+                runtime_constraints["ram"] = ram * i
+
+                if self.uuid:
+                    response = self.arvrunner.api.container_requests().update(
+                        uuid=self.uuid,
+                        body=container_request,
+                        **extra_submit_params
+                    ).execute(num_retries=self.arvrunner.num_retries)
+                else:
+                    response = self.arvrunner.api.container_requests().create(
+                        body=container_request,
+                        **extra_submit_params
+                    ).execute(num_retries=self.arvrunner.num_retries)
+                    self.uuid = response["uuid"]
+
+                if response["container_uuid"] is not None:
+                    break
+
+            if response["container_uuid"] is None:
+                runtime_constraints["ram"] = ram * ram_multiplier[self.attempt_count]
+
+            container_request["state"] = "Committed"
+            response = self.arvrunner.api.container_requests().update(
+                uuid=self.uuid,
+                body=container_request,
+                **extra_submit_params
+            ).execute(num_retries=self.arvrunner.num_retries)
  
-            self.uuid = response["uuid"]
              self.arvrunner.process_submitted(self)
+            self.attempt_count += 1
  
              if response["state"] == "Final":
                  logger.info("%s reused container %s", self.arvrunner.label(self), response["container_uuid"])
@@ -374,8 +429,37 @@ class ArvadosContainer(JobBase):
              logger.debug("Container request was %s", container_request)
              self.output_callback({}, "permanentFail")
  
+    def out_of_memory_retry(self, record, container):  # Return True if the failed container looks like it ran out of memory, else False.
+        oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+        if oom_retry_req is None:  # requirement not found: never report an OOM retry
+            return False
+
+        # Sometimes it gets killed with no warning
+        if container["exit_code"] == 137:  # exit code 137 counts as out-of-memory without inspecting the log
+            return True
+
+        logc = arvados.collection.CollectionReader(record["log_uuid"],
+                                                   api_client=self.arvrunner.api,
+                                                   keep_client=self.arvrunner.keep_client,
+                                                   num_retries=self.arvrunner.num_retries)
+
+        loglines = [""]  # one-element list so the nested callback can store its result here
+        def callback(v1, v2, v3):  # keeps only the third argument; presumably the log text — confirm against done.logtail
+            loglines[0] = v3
+
+        done.logtail(logc, callback, "", maxlen=1000)  # NOTE(review): maxlen presumably bounds how much of the log tail is returned — confirm
+
+        # Check allocation failure
+        oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|memory ?error|container using over 9.% of memory)'  # requirement's regex when set and non-empty, otherwise this default pattern
+        if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):  # case-insensitive search anywhere in the captured log text
+            return True
+
+        return False
+
      def done(self, record):
          outputs = {}
+        retried = False
+        rcode = None
          try:
              container = self.arvrunner.api.containers().get(
                  uuid=record["container_uuid"]
@@ -393,8 +477,17 @@ class ArvadosContainer(JobBase):
                  else:
                      processStatus = "permanentFail"
  
+                if processStatus == "permanentFail" and self.attempt_count == 1 and self.out_of_memory_retry(record, container):
+                    logger.warning("%s Container failed with out of memory error, retrying with more RAM.",
+                                 self.arvrunner.label(self))
+                    self.job_runtime.submit_request_uuid = None
+                    self.uuid = None
+                    self.run(None)
+                    retried = True
+                    return
+
                  if rcode == 137:
-                    logger.warning("%s This container was killed on the compute instance.  The most common reason is that it attempted to allocate too much RAM and was targeted by the Out Of Memory (OOM) killer.  Try resubmitting with a higher 'ramMin'.",
+                    logger.warning("%s Container may have been killed for using too much RAM.  Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.",
                                   self.arvrunner.label(self))
              else:
                  processStatus = "permanentFail"
@@ -407,7 +500,7 @@ class ArvadosContainer(JobBase):
                  label = self.arvrunner.label(self)
                  done.logtail(
                      logc, logger.error,
-                    "%s (%s) error log:" % (label, record["uuid"]), maxlen=40)
+                    "%s (%s) error log:" % (label, record["uuid"]), maxlen=40, include_crunchrun=(rcode is None or rcode > 127))
  
              if record["output_uuid"]:
                  if self.arvrunner.trash_intermediate or self.arvrunner.intermediate_output_ttl:
@@ -422,6 +515,13 @@ class ArvadosContainer(JobBase):
  
              if container["output"]:
                  outputs = done.done_outputs(self, container, "/tmp", self.outdir, "/keep")
+
+            properties = record["properties"].copy()
+            properties["cwl_output"] = outputs
+            self.arvrunner.api.container_requests().update(
+                uuid=self.uuid,
+                body={"container_request": {"properties": properties}}
+            ).execute(num_retries=self.arvrunner.num_retries)
          except WorkflowException as e:
              # Only include a stack trace if in debug mode.
              # A stack trace may obfuscate more useful output about the workflow.
@@ -432,13 +532,14 @@ class ArvadosContainer(JobBase):
              logger.exception("%s while getting output object:", self.arvrunner.label(self))
              processStatus = "permanentFail"
          finally:
-            self.output_callback(outputs, processStatus)
+            if not retried:
+                self.output_callback(outputs, processStatus)
  
  
  class RunnerContainer(Runner):
      """Submit and manage a container that runs arvados-cwl-runner."""
  
-    def arvados_job_spec(self, runtimeContext):
+    def arvados_job_spec(self, runtimeContext, git_info):
          """Create an Arvados container request for this workflow.
  
          The returned dict can be used to create a container passed as
@@ -465,7 +566,7 @@ class RunnerContainer(Runner):
              "cwd": "/var/spool/cwl",
              "priority": self.priority,
              "state": "Committed",
-            "container_image": arvados_jobs_image(self.arvrunner, self.jobs_image),
+            "container_image": arvados_jobs_image(self.arvrunner, self.jobs_image, runtimeContext),
              "mounts": {
                  "/var/lib/cwl/cwl.input.json": {
                      "kind": "json",
@@ -499,15 +600,30 @@ class RunnerContainer(Runner):
                  "kind": "collection",
                  "portable_data_hash": "%s" % workflowcollection
              }
+        elif self.embedded_tool.tool.get("id", "").startswith("arvwf:"):
+            uuid, frg = urllib.parse.urldefrag(self.embedded_tool.tool["id"])
+            workflowpath = "/var/lib/cwl/workflow.json#" + frg
+            packedtxt = self.loadingContext.loader.fetch_text(uuid)
+            yaml = ruamel.yaml.YAML(typ='safe', pure=True)
+            packed = yaml.load(packedtxt)
+            container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
+                "kind": "json",
+                "content": packed
+            }
+            container_req["properties"]["template_uuid"] = self.embedded_tool.tool["id"][6:33]
+        elif self.embedded_tool.tool.get("id", "").startswith("file:"):
+            raise Exception("Tool id '%s' is a local file but expected keep: or arvwf:" % self.embedded_tool.tool.get("id"))
          else:
-            packed = packed_workflow(self.arvrunner, self.embedded_tool, self.merged_map)
+            main = self.loadingContext.loader.idx["_:main"]
+            if main.get("id") == "_:main":
+                del main["id"]
              workflowpath = "/var/lib/cwl/workflow.json#main"
              container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
                  "kind": "json",
-                "content": packed
+                "content": main
              }
-            if self.embedded_tool.tool.get("id", "").startswith("arvwf:"):
-                container_req["properties"]["template_uuid"] = self.embedded_tool.tool["id"][6:33]
+
+        container_req["properties"].update({k.replace("http://arvados.org/cwl#", "arv:"): v for k, v in git_info.items()})
  
          properties_req, _ = self.embedded_tool.get_requirement("http://arvados.org/cwl#ProcessProperties")
          if properties_req:
@@ -550,17 +666,17 @@ class RunnerContainer(Runner):
          if runtimeContext.intermediate_storage_classes != "default" and runtimeContext.intermediate_storage_classes:
              command.append("--intermediate-storage-classes=" + runtimeContext.intermediate_storage_classes)
  
-        if self.on_error:
+        if runtimeContext.on_error:
              command.append("--on-error=" + self.on_error)
  
-        if self.intermediate_output_ttl:
-            command.append("--intermediate-output-ttl=%d" % self.intermediate_output_ttl)
+        if runtimeContext.intermediate_output_ttl:
+            command.append("--intermediate-output-ttl=%d" % runtimeContext.intermediate_output_ttl)
  
-        if self.arvrunner.trash_intermediate:
+        if runtimeContext.trash_intermediate:
              command.append("--trash-intermediate")
  
-        if self.arvrunner.project_uuid:
-            command.append("--project-uuid="+self.arvrunner.project_uuid)
+        if runtimeContext.project_uuid:
+            command.append("--project-uuid="+runtimeContext.project_uuid)
  
          if self.enable_dev:
              command.append("--enable-dev")
@@ -571,6 +687,15 @@ class RunnerContainer(Runner):
          if runtimeContext.enable_preemptible is False:
              command.append("--disable-preemptible")
  
+        if runtimeContext.varying_url_params:
+            command.append("--varying-url-params="+runtimeContext.varying_url_params)
+
+        if runtimeContext.prefer_cached_downloads:
+            command.append("--prefer-cached-downloads")
+
+        if self.fast_parser:
+            command.append("--fast-parser")
+
          command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])
  
          container_req["command"] = command
@@ -580,9 +705,9 @@ class RunnerContainer(Runner):
  
      def run(self, runtimeContext):
          runtimeContext.keepprefix = "keep:"
-        job_spec = self.arvados_job_spec(runtimeContext)
-        if self.arvrunner.project_uuid:
-            job_spec["owner_uuid"] = self.arvrunner.project_uuid
+        job_spec = self.arvados_job_spec(runtimeContext, self.git_info)
+        if runtimeContext.project_uuid:
+            job_spec["owner_uuid"] = runtimeContext.project_uuid
  
          extra_submit_params = {}
          if runtimeContext.submit_runner_cluster: