X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/39fc2f223fae40dc4fb160758e76ca39304b44af..9f86e39c7720f1291194fc8eea867a175e94e07f:/sdk/cwl/arvados_cwl/arvcontainer.py diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py index 4d0fde7440..be8e557bd8 100644 --- a/sdk/cwl/arvados_cwl/arvcontainer.py +++ b/sdk/cwl/arvados_cwl/arvcontainer.py @@ -367,6 +367,12 @@ class ArvadosContainer(JobBase): logger.warning("%s API revision is %s, revision %s is required to support setting properties on output collections.", self.arvrunner.label(self), self.arvrunner.api._rootDesc["revision"], "20220510") + ram_multiplier = [1] + + oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry") + if oom_retry_req and oom_retry_req.get('memoryRetryMultipler'): + ram_multiplier.append(oom_retry_req.get('memoryRetryMultipler')) + if runtimeContext.runnerjob.startswith("arvwf:"): wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")] wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries) @@ -374,17 +380,20 @@ class ArvadosContainer(JobBase): container_request["name"] = wfrecord["name"] container_request["properties"]["template_uuid"] = wfuuid - self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback) + if self.attempt_count == 0: + self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback) try: ram = runtime_constraints["ram"] - for i in range(1, 4): + self.uuid = runtimeContext.submit_request_uuid + + for i in ram_multiplier: runtime_constraints["ram"] = ram * i - if runtimeContext.submit_request_uuid: + if self.uuid: response = self.arvrunner.api.container_requests().update( - uuid=runtimeContext.submit_request_uuid, + uuid=self.uuid, body=container_request, **extra_submit_params ).execute(num_retries=self.arvrunner.num_retries) @@ -393,22 +402,21 @@ class ArvadosContainer(JobBase): body=container_request, **extra_submit_params ).execute(num_retries=self.arvrunner.num_retries) - runtimeContext.submit_request_uuid = response["uuid"] + self.uuid = response["uuid"] if response["container_uuid"] is not None: break if response["container_uuid"] is None: - runtime_constraints["ram"] = ram * (self.attempt_count+1) + runtime_constraints["ram"] = ram * ram_multiplier[self.attempt_count] container_request["state"] = "Committed" response = self.arvrunner.api.container_requests().update( - uuid=runtimeContext.submit_request_uuid, + uuid=self.uuid, body=container_request, **extra_submit_params ).execute(num_retries=self.arvrunner.num_retries) - self.uuid = response["uuid"] self.arvrunner.process_submitted(self) self.attempt_count += 1 @@ -422,6 +430,11 @@ class ArvadosContainer(JobBase): self.output_callback({}, "permanentFail") def out_of_memory_retry(self, record, container): + oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry") + if oom_retry_req is None: + return False + + # Sometimes it gets killed with no warning if container["exit_code"] == 137: return True @@ -434,14 +447,10 @@ class ArvadosContainer(JobBase): def callback(v1, v2, v3): loglines[0] = v3 - done.logtail(logc, callback, "", maxlen=200) - - oom_matches = r'(bad_alloc|out ?of ?memory|Container using over 95% of memory)' - - print("Checking loglines", loglines[0]) - - print("Match", re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE)) + done.logtail(logc, callback, "", maxlen=1000) + # Check allocation failure + oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|memory ?error|container using over 9.% of memory)' if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE): return True @@ -467,16 +476,17 @@ class ArvadosContainer(JobBase): else: processStatus = "permanentFail" - if processStatus == "permanentFail" and self.out_of_memory_retry(record, container): - logger.info("%s Container failed with out of memory error, retrying with more RAM.", + if processStatus == "permanentFail" and self.attempt_count == 1 and self.out_of_memory_retry(record, container): + logger.warning("%s Container failed with out of memory error, retrying with more RAM.", self.arvrunner.label(self)) self.job_runtime.submit_request_uuid = None + self.uuid = None self.run(None) retried = True return if rcode == 137: - logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin'.", + logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.", self.arvrunner.label(self)) else: processStatus = "permanentFail"