11907: Fix tests
[arvados.git] sdk/cwl/arvados_cwl/__init__.py
index 73901555acc4a3cc115fe9187abca80c34cbc75d..5b29ae517e8b15c33781cd70247c36d2ae831b94 100644
@@ -18,6 +18,9 @@ import re
 from functools import partial
 import pkg_resources  # part of setuptools
 import Queue
+import time
+import signal
+import thread
 
 from cwltool.errors import WorkflowException
 import cwltool.main
@@ -30,15 +33,17 @@ import arvados
 import arvados.config
 from arvados.keep import KeepClient
 from arvados.errors import ApiError
+import arvados.commands._util as arv_cmd
 
 from .arvcontainer import ArvadosContainer, RunnerContainer
 from .arvjob import ArvadosJob, RunnerJob, RunnerTemplate
-from. runner import Runner, upload_docker, upload_job_order, upload_workflow_deps, upload_dependencies
+from. runner import Runner, upload_docker, upload_job_order, upload_workflow_deps
 from .arvtool import ArvadosCommandTool
 from .arvworkflow import ArvadosWorkflow, upload_workflow
 from .fsaccess import CollectionFsAccess, CollectionFetcher, collectionResolver, CollectionCache
 from .perf import Perf
 from .pathmapper import NoFollowPathMapper
+from .task_queue import TaskQueue
 from ._version import __version__
 
 from cwltool.pack import pack
@@ -63,14 +68,14 @@ class ArvCwlRunner(object):
 
     """
 
-    def __init__(self, api_client, work_api=None, keep_client=None, output_name=None, output_tags=None, num_retries=4):
+    def __init__(self, api_client, work_api=None, keep_client=None,
+                 output_name=None, output_tags=None, num_retries=4,
+                 thread_count=4):
         self.api = api_client
         self.processes = {}
-        self.in_flight = 0
         self.workflow_eval_lock = threading.Condition(threading.RLock())
         self.final_output = None
         self.final_status = None
-        self.uploaded = {}
         self.num_retries = num_retries
         self.uuid = None
         self.stop_polling = threading.Event()
@@ -83,7 +88,8 @@ class ArvCwlRunner(object):
         self.intermediate_output_ttl = 0
         self.intermediate_output_collections = []
         self.trash_intermediate = False
-        self.runnable_queue = Queue.Queue()
+        self.thread_count = thread_count
+        self.poll_interval = 12
 
         if keep_client is not None:
             self.keep_client = keep_client
@@ -92,6 +98,11 @@ class ArvCwlRunner(object):
 
         self.collection_cache = CollectionCache(self.api, self.keep_client, self.num_retries)
 
+        self.fetcher_constructor = partial(CollectionFetcher,
+                                           api_client=self.api,
+                                           fs_access=CollectionFsAccess("", collection_cache=self.collection_cache),
+                                           num_retries=self.num_retries)
+
         self.work_api = None
         expected_api = ["jobs", "containers"]
         for api in expected_api:
@@ -112,10 +123,7 @@ class ArvCwlRunner(object):
 
     def arv_make_tool(self, toolpath_object, **kwargs):
         kwargs["work_api"] = self.work_api
-        kwargs["fetcher_constructor"] = partial(CollectionFetcher,
-                                                api_client=self.api,
-                                                fs_access=CollectionFsAccess("", collection_cache=self.collection_cache),
-                                                num_retries=self.num_retries)
+        kwargs["fetcher_constructor"] = self.fetcher_constructor
         kwargs["resolver"] = partial(collectionResolver, self.api, num_retries=self.num_retries)
         if "class" in toolpath_object and toolpath_object["class"] == "CommandLineTool":
             return ArvadosCommandTool(self, toolpath_object, **kwargs)
@@ -138,52 +146,43 @@ class ArvCwlRunner(object):
                                                          body={"state": "Failed"}).execute(num_retries=self.num_retries)
             self.final_status = processStatus
             self.final_output = out
+            self.workflow_eval_lock.notifyAll()
 
-    def runnable_queue_thread(self):
-        while True:
-            runnable, kwargs = self.runnable_queue.get()
-            runnable.run(**kwargs)
 
     def start_run(self, runnable, kwargs):
-        with self.workflow_eval_lock:
-            self.in_flight += 1
-        self.runnable_queue.put((runnable, kwargs))
+        self.task_queue.add(partial(runnable.run, **kwargs))
 
     def process_submitted(self, container):
         with self.workflow_eval_lock:
             self.processes[container.uuid] = container
-            self.in_flight -= 1
 
-    def process_done(self, uuid):
+    def process_done(self, uuid, record):
         with self.workflow_eval_lock:
-            if uuid in self.processes:
-                del self.processes[uuid]
+            j = self.processes[uuid]
+            logger.info("%s %s is %s", self.label(j), uuid, record["state"])
+            self.task_queue.add(partial(j.done, record))
+            del self.processes[uuid]
 
     def wrapped_callback(self, cb, obj, st):
         with self.workflow_eval_lock:
             cb(obj, st)
+            self.workflow_eval_lock.notifyAll()
 
     def get_wrapped_callback(self, cb):
         return partial(self.wrapped_callback, cb)
 
     def on_message(self, event):
-        if "object_uuid" in event:
-            if event["object_uuid"] in self.processes and event["event_type"] == "update":
-                if event["properties"]["new_attributes"]["state"] == "Running" and self.processes[event["object_uuid"]].running is False:
-                    uuid = event["object_uuid"]
-                    with self.workflow_eval_lock:
-                        j = self.processes[uuid]
-                        logger.info("%s %s is Running", self.label(j), uuid)
+        if event.get("object_uuid") in self.processes and event["event_type"] == "update":
+            uuid = event["object_uuid"]
+            if event["properties"]["new_attributes"]["state"] == "Running":
+                with self.workflow_eval_lock:
+                    j = self.processes[uuid]
+                    if j.running is False:
                         j.running = True
                         j.update_pipeline_component(event["properties"]["new_attributes"])
-                elif event["properties"]["new_attributes"]["state"] in ("Complete", "Failed", "Cancelled", "Final"):
-                    uuid = event["object_uuid"]
-                    with self.workflow_eval_lock:
-                        j = self.processes[uuid]
-                        logger.info("%s %s is %s", self.label(j), uuid, event["properties"]["new_attributes"]["state"])
-                        with Perf(metrics, "done %s" % j.name):
-                            j.done(event["properties"]["new_attributes"])
-                        self.workflow_eval_lock.notify()
+                        logger.info("%s %s is Running", self.label(j), uuid)
+            elif event["properties"]["new_attributes"]["state"] in ("Complete", "Failed", "Cancelled", "Final"):
+                self.process_done(uuid, event["properties"]["new_attributes"])
 
     def label(self, obj):
         return "[%s %s]" % (self.work_api[0:-1], obj.name)
@@ -195,15 +194,19 @@ class ArvCwlRunner(object):
         """
 
         try:
+            remain_wait = self.poll_interval
             while True:
-                self.stop_polling.wait(15)
+                if remain_wait > 0:
+                    self.stop_polling.wait(remain_wait)
                 if self.stop_polling.is_set():
                     break
                 with self.workflow_eval_lock:
                     keys = list(self.processes.keys())
                 if not keys:
+                    remain_wait = self.poll_interval
                     continue
 
+                begin_poll = time.time()
                 if self.work_api == "containers":
                     table = self.poll_api.container_requests()
                 elif self.work_api == "jobs":
@@ -213,6 +216,7 @@ class ArvCwlRunner(object):
                     proc_states = table.list(filters=[["uuid", "in", keys]]).execute(num_retries=self.num_retries)
                 except Exception as e:
                     logger.warn("Error checking states on API server: %s", e)
+                    remain_wait = self.poll_interval
                     continue
 
                 for p in proc_states["items"]:
@@ -223,20 +227,16 @@ class ArvCwlRunner(object):
                             "new_attributes": p
                         }
                     })
+                finish_poll = time.time()
+                remain_wait = self.poll_interval - (finish_poll - begin_poll)
         except:
-            logger.error("Fatal error in state polling thread.", exc_info=(sys.exc_info()[1] if self.debug else False))
-            with workflow_eval_lock:
+            logger.exception("Fatal error in state polling thread.")
+            with self.workflow_eval_lock:
                 self.processes.clear()
-                self.workflow_eval_lock.notify()
+                self.workflow_eval_lock.notifyAll()
         finally:
             self.stop_polling.set()
 
-    def get_uploaded(self):
-        return self.uploaded.copy()
-
-    def add_uploaded(self, src, pair):
-        self.uploaded[src] = pair
-
     def add_intermediate_output(self, uuid):
         if uuid:
             self.intermediate_output_collections.append(uuid)
@@ -248,7 +248,7 @@ class ArvCwlRunner(object):
                 self.api.collections().delete(uuid=i).execute(num_retries=self.num_retries)
             except:
                 logger.warn("Failed to delete intermediate output: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
-            if sys.exc_info()[0] is KeyboardInterrupt:
+            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                 break
 
     def check_features(self, obj):
@@ -394,6 +394,9 @@ class ArvCwlRunner(object):
         if self.intermediate_output_ttl < 0:
             raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)
 
+        if kwargs.get("submit_request_uuid") and self.work_api != "containers":
+            raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))
+
         if not kwargs.get("name"):
             kwargs["name"] = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])
 
@@ -504,14 +507,16 @@ class ArvCwlRunner(object):
             logger.info("Pipeline instance %s", self.pipeline["uuid"])
 
         if runnerjob and not kwargs.get("wait"):
-            runnerjob.run(wait=kwargs.get("wait"))
+            submitargs = kwargs.copy()
+            submitargs['submit'] = False
+            runnerjob.run(**submitargs)
             return (runnerjob.uuid, "success")
 
         self.poll_api = arvados.api('v1')
         self.polling_thread = threading.Thread(target=self.poll_states)
         self.polling_thread.start()
 
-        threading.Thread(target=self.runnable_queue_thread).start()
+        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)
 
         if runnerjob:
             jobiter = iter((runnerjob,))
@@ -537,26 +542,31 @@ class ArvCwlRunner(object):
                 if self.stop_polling.is_set():
                     break
 
+                if self.task_queue.error is not None:
+                    raise self.task_queue.error
+
                 if runnable:
                     with Perf(metrics, "run"):
                         self.start_run(runnable, kwargs)
                 else:
-                    if (self.in_flight + len(self.processes)) > 0:
-                        self.workflow_eval_lock.wait(1)
+                    if (self.task_queue.in_flight + len(self.processes)) > 0:
+                        self.workflow_eval_lock.wait(3)
                     else:
-                        logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.")
+                        logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
                         break
                 loopperf.__enter__()
             loopperf.__exit__()
 
-            while self.processes:
-                self.workflow_eval_lock.wait(1)
+            while (self.task_queue.in_flight + len(self.processes)) > 0:
+                if self.task_queue.error is not None:
+                    raise self.task_queue.error
+                self.workflow_eval_lock.wait(3)
 
         except UnsupportedRequirement:
             raise
         except:
-            if sys.exc_info()[0] is KeyboardInterrupt:
-                logger.error("Interrupted, marking pipeline as failed")
+            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
+                logger.error("Interrupted, workflow will be cancelled")
             else:
                 logger.error("Execution failed: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
             if self.pipeline:
@@ -567,8 +577,10 @@ class ArvCwlRunner(object):
                                                      body={"priority": "0"}).execute(num_retries=self.num_retries)
         finally:
             self.workflow_eval_lock.release()
+            self.task_queue.drain()
             self.stop_polling.set()
             self.polling_thread.join()
+            self.task_queue.join()
 
         if self.final_status == "UnsupportedRequirement":
             raise UnsupportedRequirement("Check log for details.")
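The run loop above drives the new TaskQueue helper (imported from .task_queue, which is not part of this diff). For orientation, here is a minimal sketch of the interface the loop assumes -- add(), in_flight, error, drain() and join(). It is an illustration only, not the actual sdk/cwl/arvados_cwl/task_queue.py implementation.

# Sketch only: the interface the run loop above assumes from TaskQueue.
# The real module is added elsewhere on this branch and is not shown in this diff.
import Queue
import threading

class TaskQueue(object):
    def __init__(self, lock, thread_count):
        self.lock = lock                  # the shared workflow_eval_lock Condition
        self.in_flight = 0                # tasks submitted but not yet finished
        self.error = None                 # first exception raised by a task, if any
        self.task_queue = Queue.Queue()
        self.task_queue_threads = []
        for r in xrange(0, thread_count):
            t = threading.Thread(target=self.task_queue_func)
            self.task_queue_threads.append(t)
            t.start()

    def task_queue_func(self):
        while True:
            task = self.task_queue.get()
            if task is None:
                return
            try:
                task()
            except Exception as e:
                if self.error is None:
                    self.error = e
            finally:
                with self.lock:
                    self.in_flight -= 1
                    self.lock.notifyAll()

    def add(self, task):
        with self.lock:
            self.in_flight += 1
        self.task_queue.put(task)

    def drain(self):
        # Discard work that has not started yet.
        try:
            while True:
                self.task_queue.get(block=False)
                with self.lock:
                    self.in_flight -= 1
        except Queue.Empty:
            pass

    def join(self):
        # Tell each worker thread to exit, then wait for them.
        for t in self.task_queue_threads:
            self.task_queue.put(None)
        for t in self.task_queue_threads:
            t.join()

In the loop above, start_run() and process_submitted()/process_done() feed work in through add(), error surfaces the first worker exception back to the main thread, and the drain()/join() pair in the finally block stops the workers once the workflow loop exits.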
@@ -690,6 +702,10 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
                         help="Docker image for workflow runner job, default arvados/jobs:%s" % __version__,
                         default=None)
 
+    parser.add_argument("--submit-request-uuid", type=str,
+                        default=None,
+                        help="Update and commit supplied container request instead of creating a new one (containers API only).")
+
     parser.add_argument("--name", type=str,
                         help="Name to use for workflow execution instance.",
                         default=None)
@@ -718,6 +734,9 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
                         action="store_true", default=False,
                         help=argparse.SUPPRESS)
 
+    parser.add_argument("--thread-count", type=int,
+                        default=4, help="Number of threads to use for job submit and output collection.")
+
     exgroup = parser.add_mutually_exclusive_group()
     exgroup.add_argument("--trash-intermediate", action="store_true",
                         default=False, dest="trash_intermediate",
@@ -748,12 +767,20 @@ def add_arv_hints():
         "http://arvados.org/cwl#ReuseRequirement"
     ])
 
-def main(args, stdout, stderr, api_client=None, keep_client=None):
+def exit_signal_handler(sigcode, frame):
+    logger.error("Caught signal {}, exiting.".format(sigcode))
+    sys.exit(-sigcode)
+
+def main(args, stdout, stderr, api_client=None, keep_client=None,
+         install_sig_handlers=True):
     parser = arg_parser()
 
     job_order_object = None
     arvargs = parser.parse_args(args)
 
+    if install_sig_handlers:
+        arv_cmd.install_signal_handlers()
+
     if arvargs.update_workflow:
         if arvargs.update_workflow.find('-7fd4e-') == 5:
             want_api = 'containers'
@@ -780,7 +807,8 @@ def main(args, stdout, stderr, api_client=None, keep_client=None):
             keep_client = arvados.keep.KeepClient(api_client=api_client, num_retries=4)
         runner = ArvCwlRunner(api_client, work_api=arvargs.work_api, keep_client=keep_client,
                               num_retries=4, output_name=arvargs.output_name,
-                              output_tags=arvargs.output_tags)
+                              output_tags=arvargs.output_tags,
+                              thread_count=arvargs.thread_count)
     except Exception as e:
         logger.error(e)
         return 1
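A note on the new signal handling: exit_signal_handler() defined above turns a caught signal into SystemExit via sys.exit(), which is why the run loop and the intermediate-output cleanup now check for SystemExit alongside KeyboardInterrupt. The sketch below shows the idea wired up by hand; the exact signal set registered by arv_cmd.install_signal_handlers() is not shown in this diff and is an assumption here.

# Illustration only: registering exit_signal_handler by hand.
# main() delegates this to arv_cmd.install_signal_handlers(); the precise set of
# signals it covers is assumed here, not taken from this diff.
import signal

for sigcode in (signal.SIGINT, signal.SIGTERM):
    signal.signal(sigcode, exit_signal_handler)
# A caught signal now raises SystemExit (via sys.exit in the handler), so the
# workflow is cancelled -- e.g. the container request priority is set to "0" in
# the run loop's exception path -- instead of the process dying with work still queued.

Passing install_sig_handlers=False lets callers such as the test suite run main() without modifying process-wide signal state.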