import ciso8601
import uuid
import math
+import re
import arvados_cwl.util
-import ruamel.yaml as yaml
+import ruamel.yaml
from cwltool.errors import WorkflowException
from cwltool.process import UnsupportedRequirement, shortname
logger = logging.getLogger('arvados.cwl-runner')
metrics = logging.getLogger('arvados.cwl-runner.metrics')
+def cleanup_name_for_collection(name):
+    """Return *name* with every '/' replaced by a space.
+
+    NOTE(review): presumably '/' is not permitted (or is ambiguous) in
+    Arvados collection names -- confirm against the collections API.
+    Used below to sanitize step names before they become output
+    collection names.
+    """
+    return name.replace("/", " ")
+
class ArvadosContainer(JobBase):
"""Submit and manage a Crunch container request for executing a CWL CommandLineTool."""
self.job_runtime = job_runtime
self.running = False
self.uuid = None
+ self.attempt_count = 0
def update_pipeline_component(self, r):
pass
container_request["output_path"] = self.outdir
container_request["cwd"] = self.outdir
container_request["priority"] = runtimeContext.priority
- container_request["state"] = "Committed"
+ container_request["state"] = "Uncommitted"
container_request.setdefault("properties", {})
+ container_request["properties"]["cwl_input"] = self.joborder
+
runtime_constraints = {}
if runtimeContext.project_uuid:
mounts[targetdir]["path"] = path
prevdir = targetdir + "/"
+ intermediate_collection_info = arvados_cwl.util.get_intermediate_collection_info(self.name, runtimeContext.current_container, runtimeContext.intermediate_output_ttl)
+
with Perf(metrics, "generatefiles %s" % self.name):
if self.generatefiles["listing"]:
vwd = arvados.collection.Collection(api_client=self.arvrunner.api,
if not runtimeContext.current_container:
runtimeContext.current_container = arvados_cwl.util.get_current_container(self.arvrunner.api, self.arvrunner.num_retries, logger)
- info = arvados_cwl.util.get_intermediate_collection_info(self.name, runtimeContext.current_container, runtimeContext.intermediate_output_ttl)
- vwd.save_new(name=info["name"],
+ vwd.save_new(name=intermediate_collection_info["name"],
owner_uuid=runtimeContext.project_uuid,
ensure_unique_name=True,
- trash_at=info["trash_at"],
- properties=info["properties"])
+ trash_at=intermediate_collection_info["trash_at"],
+ properties=intermediate_collection_info["properties"])
prev = None
for f, p in sorteditems:
container_request["container_image"] = arv_docker_get_image(self.arvrunner.api,
docker_req,
runtimeContext.pull_image,
- runtimeContext.project_uuid,
- runtimeContext.force_docker_pull,
- runtimeContext.tmp_outdir_prefix,
- runtimeContext.match_local_docker)
+ runtimeContext)
network_req, _ = self.get_requirement("NetworkAccess")
if network_req:
if api_req:
runtime_constraints["API"] = True
+ use_disk_cache = (self.arvrunner.api.config()["Containers"].get("DefaultKeepCacheRAM", 0) == 0)
+
+ keep_cache_type_req, _ = self.get_requirement("http://arvados.org/cwl#KeepCacheTypeRequirement")
+ if keep_cache_type_req:
+ if "keepCacheType" in keep_cache_type_req:
+ if keep_cache_type_req["keepCacheType"] == "ram_cache":
+ use_disk_cache = False
+
runtime_req, _ = self.get_requirement("http://arvados.org/cwl#RuntimeConstraints")
if runtime_req:
if "keep_cache" in runtime_req:
- runtime_constraints["keep_cache_ram"] = math.ceil(runtime_req["keep_cache"] * 2**20)
+ if use_disk_cache:
+ # If DefaultKeepCacheRAM is zero it means we should use disk cache.
+ runtime_constraints["keep_cache_disk"] = math.ceil(runtime_req["keep_cache"] * 2**20)
+ else:
+ runtime_constraints["keep_cache_ram"] = math.ceil(runtime_req["keep_cache"] * 2**20)
if "outputDirType" in runtime_req:
if runtime_req["outputDirType"] == "local_output_dir":
# Currently the default behavior.
if runtimeContext.submit_runner_cluster:
extra_submit_params["cluster_id"] = runtimeContext.submit_runner_cluster
- container_request["output_name"] = "Output for step %s" % (self.name)
+ container_request["output_name"] = cleanup_name_for_collection("Output from step %s" % (self.name))
container_request["output_ttl"] = self.output_ttl
container_request["mounts"] = mounts
container_request["secret_mounts"] = secret_mounts
for pr in properties_req["processProperties"]:
container_request["properties"][pr["propertyName"]] = self.builder.do_eval(pr["propertyValue"])
+ output_properties_req, _ = self.get_requirement("http://arvados.org/cwl#OutputCollectionProperties")
+ if output_properties_req:
+ if self.arvrunner.api._rootDesc["revision"] >= "20220510":
+ container_request["output_properties"] = {}
+ for pr in output_properties_req["outputProperties"]:
+ container_request["output_properties"][pr["propertyName"]] = self.builder.do_eval(pr["propertyValue"])
+ else:
+ logger.warning("%s API revision is %s, revision %s is required to support setting properties on output collections.",
+ self.arvrunner.label(self), self.arvrunner.api._rootDesc["revision"], "20220510")
+
+ ram_multiplier = [1]
+
+ oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+ if oom_retry_req and oom_retry_req.get('memoryRetryMultipler'):
+ ram_multiplier.append(oom_retry_req.get('memoryRetryMultipler'))
+
if runtimeContext.runnerjob.startswith("arvwf:"):
wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")]
wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries)
container_request["name"] = wfrecord["name"]
container_request["properties"]["template_uuid"] = wfuuid
- self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
+ if self.attempt_count == 0:
+ self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
try:
- if runtimeContext.submit_request_uuid:
- response = self.arvrunner.api.container_requests().update(
- uuid=runtimeContext.submit_request_uuid,
- body=container_request,
- **extra_submit_params
- ).execute(num_retries=self.arvrunner.num_retries)
- else:
- response = self.arvrunner.api.container_requests().create(
- body=container_request,
- **extra_submit_params
- ).execute(num_retries=self.arvrunner.num_retries)
+ ram = runtime_constraints["ram"]
+
+ self.uuid = runtimeContext.submit_request_uuid
+
+ for i in ram_multiplier:
+ runtime_constraints["ram"] = ram * i
+
+ if self.uuid:
+ response = self.arvrunner.api.container_requests().update(
+ uuid=self.uuid,
+ body=container_request,
+ **extra_submit_params
+ ).execute(num_retries=self.arvrunner.num_retries)
+ else:
+ response = self.arvrunner.api.container_requests().create(
+ body=container_request,
+ **extra_submit_params
+ ).execute(num_retries=self.arvrunner.num_retries)
+ self.uuid = response["uuid"]
+
+ if response["container_uuid"] is not None:
+ break
+
+ if response["container_uuid"] is None:
+ runtime_constraints["ram"] = ram * ram_multiplier[self.attempt_count]
+
+ container_request["state"] = "Committed"
+ response = self.arvrunner.api.container_requests().update(
+ uuid=self.uuid,
+ body=container_request,
+ **extra_submit_params
+ ).execute(num_retries=self.arvrunner.num_retries)
- self.uuid = response["uuid"]
self.arvrunner.process_submitted(self)
+ self.attempt_count += 1
if response["state"] == "Final":
logger.info("%s reused container %s", self.arvrunner.label(self), response["container_uuid"])
logger.debug("Container request was %s", container_request)
self.output_callback({}, "permanentFail")
+    def out_of_memory_retry(self, record, container):
+        """Decide whether a failed container should be retried with more RAM.
+
+        record: the container request record; its "log_uuid" is read to
+            fetch the log collection.
+        container: the finished container record; its "exit_code" is
+            inspected.
+        Returns True only when the tool declared the
+        arv:OutOfMemoryRetry requirement and the failure looks like an
+        out-of-memory kill.
+        """
+        oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+        if oom_retry_req is None:
+            # Feature was not requested by the tool; never retry.
+            return False
+
+        # Exit status 137 usually means the process got SIGKILL (128+9),
+        # e.g. from the kernel OOM killer, which can leave no message in
+        # the logs at all -- so treat it as OOM without reading the log.
+        if container["exit_code"] == 137:
+            return True
+
+        logc = arvados.collection.CollectionReader(record["log_uuid"],
+                                                   api_client=self.arvrunner.api,
+                                                   keep_client=self.arvrunner.keep_client,
+                                                   num_retries=self.arvrunner.num_retries)
+
+        # done.logtail delivers the tail text through a callback; capture
+        # the final value (up to 1000 lines) in a one-element list closure.
+        loglines = [""]
+        def callback(v1, v2, v3):
+            loglines[0] = v3
+
+        done.logtail(logc, callback, "", maxlen=1000)
+
+        # Scan the log tail for allocation-failure messages.  The tool may
+        # supply its own pattern via memoryErrorRegex; the default also
+        # matches crunch-run's "container using over 9X% of memory" report
+        # (i.e. 90-99%).
+        oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|memory ?error|container using over 9.% of memory)'
+        if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+            return True
+
+        return False
+
def done(self, record):
outputs = {}
+ retried = False
+ rcode = None
try:
container = self.arvrunner.api.containers().get(
uuid=record["container_uuid"]
else:
processStatus = "permanentFail"
+ if processStatus == "permanentFail" and self.attempt_count == 1 and self.out_of_memory_retry(record, container):
+ logger.warning("%s Container failed with out of memory error, retrying with more RAM.",
+ self.arvrunner.label(self))
+ self.job_runtime.submit_request_uuid = None
+ self.uuid = None
+ self.run(None)
+ retried = True
+ return
+
if rcode == 137:
- logger.warning("%s This container was killed on the compute instance. The most common reason is that it attempted to allocate too much RAM and was targeted by the Out Of Memory (OOM) killer. Try resubmitting with a higher 'ramMin'.",
+ logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.",
self.arvrunner.label(self))
else:
processStatus = "permanentFail"
label = self.arvrunner.label(self)
done.logtail(
logc, logger.error,
- "%s (%s) error log:" % (label, record["uuid"]), maxlen=40)
+ "%s (%s) error log:" % (label, record["uuid"]), maxlen=40, include_crunchrun=(rcode is None or rcode > 127))
if record["output_uuid"]:
if self.arvrunner.trash_intermediate or self.arvrunner.intermediate_output_ttl:
if container["output"]:
outputs = done.done_outputs(self, container, "/tmp", self.outdir, "/keep")
+
+ properties = record["properties"].copy()
+ properties["cwl_output"] = outputs
+ self.arvrunner.api.container_requests().update(
+ uuid=self.uuid,
+ body={"container_request": {"properties": properties}}
+ ).execute(num_retries=self.arvrunner.num_retries)
except WorkflowException as e:
# Only include a stack trace if in debug mode.
# A stack trace may obfuscate more useful output about the workflow.
logger.exception("%s while getting output object:", self.arvrunner.label(self))
processStatus = "permanentFail"
finally:
- self.output_callback(outputs, processStatus)
+ if not retried:
+ self.output_callback(outputs, processStatus)
class RunnerContainer(Runner):
"""Submit and manage a container that runs arvados-cwl-runner."""
- def arvados_job_spec(self, runtimeContext):
+ def arvados_job_spec(self, runtimeContext, git_info):
"""Create an Arvados container request for this workflow.
The returned dict can be used to create a container passed as
"cwd": "/var/spool/cwl",
"priority": self.priority,
"state": "Committed",
- "container_image": arvados_jobs_image(self.arvrunner, self.jobs_image),
+ "container_image": arvados_jobs_image(self.arvrunner, self.jobs_image, runtimeContext),
"mounts": {
"/var/lib/cwl/cwl.input.json": {
"kind": "json",
"kind": "collection",
"portable_data_hash": "%s" % workflowcollection
}
+ elif self.embedded_tool.tool.get("id", "").startswith("arvwf:"):
+ uuid, frg = urllib.parse.urldefrag(self.embedded_tool.tool["id"])
+ workflowpath = "/var/lib/cwl/workflow.json#" + frg
+ packedtxt = self.loadingContext.loader.fetch_text(uuid)
+ yaml = ruamel.yaml.YAML(typ='safe', pure=True)
+ packed = yaml.load(packedtxt)
+ container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
+ "kind": "json",
+ "content": packed
+ }
+ container_req["properties"]["template_uuid"] = self.embedded_tool.tool["id"][6:33]
+ elif self.embedded_tool.tool.get("id", "").startswith("file:"):
+ raise Exception("Tool id '%s' is a local file but expected keep: or arvwf:" % self.embedded_tool.tool.get("id"))
else:
- packed = packed_workflow(self.arvrunner, self.embedded_tool, self.merged_map)
+ main = self.loadingContext.loader.idx["_:main"]
+ if main.get("id") == "_:main":
+ del main["id"]
workflowpath = "/var/lib/cwl/workflow.json#main"
container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
"kind": "json",
- "content": packed
+ "content": main
}
- if self.embedded_tool.tool.get("id", "").startswith("arvwf:"):
- container_req["properties"]["template_uuid"] = self.embedded_tool.tool["id"][6:33]
+
+ container_req["properties"].update({k.replace("http://arvados.org/cwl#", "arv:"): v for k, v in git_info.items()})
properties_req, _ = self.embedded_tool.get_requirement("http://arvados.org/cwl#ProcessProperties")
if properties_req:
if runtimeContext.intermediate_storage_classes != "default" and runtimeContext.intermediate_storage_classes:
command.append("--intermediate-storage-classes=" + runtimeContext.intermediate_storage_classes)
- if self.on_error:
+ if runtimeContext.on_error:
command.append("--on-error=" + self.on_error)
- if self.intermediate_output_ttl:
- command.append("--intermediate-output-ttl=%d" % self.intermediate_output_ttl)
+ if runtimeContext.intermediate_output_ttl:
+ command.append("--intermediate-output-ttl=%d" % runtimeContext.intermediate_output_ttl)
- if self.arvrunner.trash_intermediate:
+ if runtimeContext.trash_intermediate:
command.append("--trash-intermediate")
- if self.arvrunner.project_uuid:
- command.append("--project-uuid="+self.arvrunner.project_uuid)
+ if runtimeContext.project_uuid:
+ command.append("--project-uuid="+runtimeContext.project_uuid)
if self.enable_dev:
command.append("--enable-dev")
if runtimeContext.enable_preemptible is False:
command.append("--disable-preemptible")
+ if runtimeContext.varying_url_params:
+ command.append("--varying-url-params="+runtimeContext.varying_url_params)
+
+ if runtimeContext.prefer_cached_downloads:
+ command.append("--prefer-cached-downloads")
+
+ if self.fast_parser:
+ command.append("--fast-parser")
+
command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])
container_req["command"] = command
def run(self, runtimeContext):
runtimeContext.keepprefix = "keep:"
- job_spec = self.arvados_job_spec(runtimeContext)
- if self.arvrunner.project_uuid:
- job_spec["owner_uuid"] = self.arvrunner.project_uuid
+ job_spec = self.arvados_job_spec(runtimeContext, self.git_info)
+ if runtimeContext.project_uuid:
+ job_spec["owner_uuid"] = runtimeContext.project_uuid
extra_submit_params = {}
if runtimeContext.submit_runner_cluster: