"http://arvados.org/cwl#UsePreemptible",
"http://arvados.org/cwl#OutputCollectionProperties",
"http://arvados.org/cwl#KeepCacheTypeRequirement",
+ "http://arvados.org/cwl#OutOfMemoryRetry",
])
def exit_signal_handler(sigcode, frame):
only or written to disk and memory-mapped. The disk cache
leverages the kernel's virtual memory system so "hot" data will
generally still be kept in RAM.
+
+- name: OutOfMemoryRetry
+ type: record
+ extends: cwl:ProcessRequirement
+ inVocab: false
+ doc: |
+ Detect when a failed tool run may have run out of memory, and
+ re-submit the container with more RAM.
+ fields:
+ - name: class
+ type: string
+      doc: "Always 'arv:OutOfMemoryRetry'"
+ jsonldPredicate:
+ _id: "@type"
+ _type: "@vocab"
+ - name: memoryErrorRegex
+ type: string?
+      doc: |
+        A regular expression applied to the text of stdout and stderr
+        produced by the tool to determine whether a failed job should
+        be retried with more RAM.  By default, searches for the
+        substrings 'bad_alloc' and 'OutOfMemory'.
+ - name: memoryRetryMultipler
+ type: float
+ doc: |
+ If the container failed on its first run, re-submit the
+ container with the RAM request multiplied by this factor.
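As a rough, standalone sketch (not part of the patch itself), the snippet below shows how the default error pattern and a hypothetical memoryRetryMultipler of 2 would interact. The pattern is the same one out_of_memory_retry falls back to below; the log line, variable names, and RAM figures are invented purely for illustration.

import re

# Default pattern used when memoryErrorRegex is not set (copied from
# out_of_memory_retry below).
DEFAULT_OOM_REGEX = r'(bad_alloc|out ?of ?memory|container using over 9.% of memory)'

# Hypothetical inputs for the example.
sample_log = "terminate called after throwing an instance of 'std::bad_alloc'"
base_ram = 4 * 1024**3           # original runtime_constraints["ram"]: 4 GiB
memory_retry_multipler = 2       # value of the hint's memoryRetryMultipler field

if re.search(DEFAULT_OOM_REGEX, sample_log, re.IGNORECASE | re.MULTILINE):
    retry_ram = int(base_ram * memory_retry_multipler)   # 8 GiB requested on the retry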
logger.warning("%s API revision is %s, revision %s is required to support setting properties on output collections.",
self.arvrunner.label(self), self.arvrunner.api._rootDesc["revision"], "20220510")
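+        # The first attempt always uses the requested RAM (multiplier 1).  If the
+        # OutOfMemoryRetry hint is present, its memoryRetryMultipler is appended
+        # and applied only when the container is resubmitted after an OOM failure.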
+ ramMultiplier = [1]
+
+ oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+ if oom_retry_req and oom_retry_req.get('memoryRetryMultipler'):
+ ramMultiplier.append(oom_retry_req.get('memoryRetryMultipler'))
+
if runtimeContext.runnerjob.startswith("arvwf:"):
wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")]
wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries)
try:
ram = runtime_constraints["ram"]
- for i in range(1, 4):
+ for i in ramMultiplier:
runtime_constraints["ram"] = ram * i
if runtimeContext.submit_request_uuid:
break
if response["container_uuid"] is None:
- runtime_constraints["ram"] = ram * (self.attempt_count+1)
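+                # attempt_count selects the multiplier for this submission, so a
+                # retry picks up the memoryRetryMultipler value appended above.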
+ runtime_constraints["ram"] = ram * ramMultiplier[self.attempt_count]
container_request["state"] = "Committed"
response = self.arvrunner.api.container_requests().update(
self.output_callback({}, "permanentFail")
def out_of_memory_retry(self, record, container):
+ oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+ if oom_retry_req is None:
+ return False
+
+        # An exit code of 137 means the container was killed with SIGKILL,
+        # typically by the kernel OOM killer, possibly without any warning
+        # reaching the logs, so treat it as an out-of-memory failure.
+ if container["exit_code"] == 137:
+ return True
+
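+        # The container exited on its own; scan the tail of its log for an
+        # out-of-memory signature instead.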
logc = arvados.collection.CollectionReader(record["log_uuid"],
api_client=self.arvrunner.api,
keep_client=self.arvrunner.keep_client,
done.logtail(logc, callback, "", maxlen=1000)
- # Check OOM killed
- oom_matches = r'container using over 9.% of memory'
- if container["exit_code"] == 137 and re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
- return True
-
# Check allocation failure
- bad_alloc_matches = r'(bad_alloc|out ?of ?memory)'
- if re.search(bad_alloc_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+ oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|container using over 9.% of memory)'
+ if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
return True
return False
else:
processStatus = "permanentFail"
- if processStatus == "permanentFail" and self.out_of_memory_retry(record, container):
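+            # Attempt the out-of-memory retry at most once, right after the
+            # first attempt has failed.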
+ if processStatus == "permanentFail" and self.attempt_count == 1 and self.out_of_memory_retry(record, container):
logger.info("%s Container failed with out of memory error, retrying with more RAM.",
self.arvrunner.label(self))
self.job_runtime.submit_request_uuid = None