17301: Report warning about OOM killer when exit code 137
[arvados.git] / sdk / cwl / arvados_cwl / arvcontainer.py
index 560e8979ecd0950f989ba50d03ea44302b8a94ce..c85443a23af10cbe88bdb7bba1f51978d431efcb 100644 (file)
@@ -63,7 +63,7 @@ class ArvadosContainer(JobBase):
         env["TMPDIR"] = self.tmpdir
         return env
 
-    def run(self, runtimeContext):
+    def run(self, toplevelRuntimeContext):
         # ArvadosCommandTool subclasses from cwltool.CommandLineTool,
         # which calls makeJobRunner() to get a new ArvadosContainer
         # object.  The fields that define execution such as
@@ -246,7 +246,8 @@ class ArvadosContainer(JobBase):
                                                                     runtimeContext.pull_image,
                                                                     runtimeContext.project_uuid,
                                                                     runtimeContext.force_docker_pull,
-                                                                    runtimeContext.tmp_outdir_prefix)
+                                                                    runtimeContext.tmp_outdir_prefix,
+                                                                    runtimeContext.match_local_docker)
 
         network_req, _ = self.get_requirement("NetworkAccess")
         if network_req:
@@ -291,6 +292,25 @@ class ArvadosContainer(JobBase):
             else:
                 container_request["output_storage_classes"] = runtimeContext.intermediate_storage_classes.strip().split(",")
 
+        cuda_req, _ = self.get_requirement("http://commonwl.org/cwltool#CUDARequirement")
+        if cuda_req:
+            runtime_constraints["cuda"] = {
+                "device_count": resources.get("cudaDeviceCount", 1),
+                "driver_version": cuda_req["cudaVersionMin"],
+                "hardware_capability": aslist(cuda_req["cudaComputeCapability"])[0]
+            }
+
+        if runtimeContext.enable_preemptible is False:
+            scheduling_parameters["preemptible"] = False
+        else:
+            preemptible_req, _ = self.get_requirement("http://arvados.org/cwl#UsePreemptible")
+            if preemptible_req:
+                scheduling_parameters["preemptible"] = preemptible_req["usePreemptible"]
+            elif runtimeContext.enable_preemptible is True:
+                scheduling_parameters["preemptible"] = True
+            elif runtimeContext.enable_preemptible is None:
+                pass
+
         if self.timelimit is not None and self.timelimit > 0:
             scheduling_parameters["max_run_time"] = self.timelimit
 
@@ -372,6 +392,10 @@ class ArvadosContainer(JobBase):
                     processStatus = "success"
                 else:
                     processStatus = "permanentFail"
+
+                if rcode == 137:
+                    logger.warning("%s job was killed on the compute instance.  The most common reason is that it attempted to allocate too much RAM and was targeted by the Out Of Memory (OOM) killer.  Try resubmitting with a higher 'ramMin'.",
+                                 self.arvrunner.label(self))
             else:
                 processStatus = "permanentFail"
 
@@ -520,10 +544,10 @@ class RunnerContainer(Runner):
         if runtimeContext.debug:
             command.append("--debug")
 
-        if runtimeContext.storage_classes != "default":
+        if runtimeContext.storage_classes != "default" and runtimeContext.storage_classes:
             command.append("--storage-classes=" + runtimeContext.storage_classes)
 
-        if runtimeContext.intermediate_storage_classes != "default":
+        if runtimeContext.intermediate_storage_classes != "default" and runtimeContext.intermediate_storage_classes:
             command.append("--intermediate-storage-classes=" + runtimeContext.intermediate_storage_classes)
 
         if self.on_error:
@@ -541,6 +565,12 @@ class RunnerContainer(Runner):
         if self.enable_dev:
             command.append("--enable-dev")
 
+        if runtimeContext.enable_preemptible is True:
+            command.append("--enable-preemptible")
+
+        if runtimeContext.enable_preemptible is False:
+            command.append("--disable-preemptible")
+
         command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])
 
         container_req["command"] = command