self.project_uuid = runtimeContext.project_uuid
# Upload local file references in the job order.
- job_order = upload_job_order(self, "%s input" % runtimeContext.name,
- updated_tool, job_order, runtimeContext)
+ with Perf(metrics, "upload_job_order"):
+ job_order = upload_job_order(self, "%s input" % runtimeContext.name,
+ updated_tool, job_order, runtimeContext)
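+ # Perf (imported from .perf below) times the enclosed block and reports
+ # to the metrics logger. A minimal sketch of the idea, which may differ
+ # from the real implementation:
+ #
+ #   import time
+ #   class Perf:
+ #       def __init__(self, logger, name):
+ #           self.logger, self.name = logger, name
+ #       def __enter__(self):
+ #           self.start = time.time()
+ #       def __exit__(self, *exc):
+ #           self.logger.debug("%s: %.3fs", self.name, time.time() - self.start)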
# the last clause means: if it is a command line tool, and we
# are going to wait for the result, and always_submit_runner
loadingContext = self.loadingContext.copy()
loadingContext.do_validate = False
+ loadingContext.disable_js_validation = True
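+ # The document was already validated when it was first loaded, so
+ # javascript expression validation (like do_validate above) can be
+ # skipped on this reload to save time.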
if submitting:
loadingContext.do_update = False
# Document may have been auto-updated. Reload the original
# document with updating disabled because we want to
# submit the document with its original CWL version, not
# the auto-updated one.
- tool = load_tool(updated_tool.tool["id"], loadingContext)
+ with Perf(metrics, "load_tool original"):
+ tool = load_tool(updated_tool.tool["id"], loadingContext)
else:
tool = updated_tool
# Upload direct dependencies of workflow steps, get back mapping of files to keep references.
# Also uploads docker images.
- merged_map = upload_workflow_deps(self, tool, runtimeContext)
+ logger.info("Uploading workflow dependencies")
+ with Perf(metrics, "upload_workflow_deps"):
+ merged_map = upload_workflow_deps(self, tool, runtimeContext)
# Recreate process object (ArvadosWorkflow or
# ArvadosCommandTool) because tool document may have been
loadingContext.loader = tool.doc_loader
loadingContext.avsc_names = tool.doc_schema
loadingContext.metadata = tool.metadata
- tool = load_tool(tool.tool, loadingContext)
+ with Perf(metrics, "load_tool"):
+ tool = load_tool(tool.tool, loadingContext)
if runtimeContext.update_workflow or runtimeContext.create_workflow:
# Create a pipeline template or workflow record and exit.
import copy
from collections import namedtuple
from io import StringIO
-from typing import Mapping, Sequence
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ Iterator,
+ List,
+ Mapping,
+ MutableMapping,
+ MutableSequence,
+ Optional,
+ Sequence,
+ Set,
+ Sized,
+ Tuple,
+ Type,
+ Union,
+ cast,
+)
+from cwltool.utils import (
+ CWLObjectType,
+ CWLOutputAtomType,
+ CWLOutputType,
+)
if os.name == "posix" and sys.version_info[0] < 3:
import subprocess32 as subprocess
from ._version import __version__
from . import done
from . context import ArvRuntimeContext
+from .perf import Perf
logger = logging.getLogger('arvados.cwl-runner')
+metrics = logging.getLogger('arvados.cwl-runner.metrics')
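+# The metrics logger is a child of the main logger, so timing output can
+# be switched on independently of normal log output; e.g. (assuming a
+# --metrics command line option is wired up to do this):
+#
+#   logging.getLogger('arvados.cwl-runner.metrics').setLevel(logging.DEBUG)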
def trim_anonymous_location(obj):
"""Remove 'location' field from File and Directory literals.
if sfname is None:
continue
- p_location = primary["location"]
- if "/" in p_location:
- sfpath = (
- p_location[0 : p_location.rindex("/") + 1]
- + sfname
- )
+ if isinstance(sfname, str):
+ p_location = primary["location"]
+ if "/" in p_location:
+ sfpath = (
+ p_location[0 : p_location.rindex("/") + 1]
+ + sfname
+ )
required = builder.do_eval(required, context=primary)
- if fsaccess.exists(sfpath):
- if pattern is not None:
- found.append({"location": sfpath, "class": "File"})
- else:
- found.append(sf)
- elif required:
- raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
- "Required secondary file '%s' does not exist" % sfpath)
+ if isinstance(sfname, (list, dict)):
+ each = aslist(sfname)
+ for e in each:
+ if required and not fsaccess.exists(e.get("location")):
+ raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
+ "Required secondary file '%s' does not exist" % e.get("location"))
+ found.extend(each)
+
+ if isinstance(sfname, str):
+ if fsaccess.exists(sfpath):
+ if pattern is not None:
+ found.append({"location": sfpath, "class": "File"})
+ else:
+ found.append(sf)
+ elif required:
+ raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
+ "Required secondary file '%s' does not exist" % sfpath)
primary["secondaryFiles"] = cmap(found)
if discovered is not None:
def upload_dependencies(arvrunner, name, document_loader,
workflowobj, uri, loadref_run, runtimeContext,
- include_primary=True, discovered_secondaryfiles=None):
+ include_primary=True, discovered_secondaryfiles=None,
+ cache=None):
"""Upload the dependencies of the workflowobj document to Keep.
Returns a pathmapper object mapping local paths to keep references. Also
defrg, _ = urllib.parse.urldefrag(joined)
if defrg not in loaded:
loaded.add(defrg)
+ if cache is not None and defrg in cache:
+ return cache[defrg]
# Use fetch_text to get raw file (before preprocessing).
text = document_loader.fetch_text(defrg)
if isinstance(text, bytes):
else:
textIO = StringIO(text)
yamlloader = YAML(typ='safe', pure=True)
- return yamlloader.load(textIO)
+ result = yamlloader.load(textIO)
+ if cache is not None:
+ cache[defrg] = result
+ return result
else:
return {}
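+ # 'cache' memoizes raw document loads keyed by defragmented URI. Within
+ # a single upload_dependencies call the 'loaded' set already prevents
+ # re-reads; the cache pays off when one dict is shared across calls, as
+ # upload_workflow_deps does with tool_dep_cache:
+ #
+ #   shared = {}
+ #   upload_dependencies(..., cache=shared)  # first call fetches + parses
+ #   upload_dependencies(..., cache=shared)  # later calls reuse the result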
scanobj = workflowobj
if "id" in workflowobj and not workflowobj["id"].startswith("_:"):
- # Need raw file content (before preprocessing) to ensure
- # that external references in $include and $mixin are captured.
- scanobj = loadref("", workflowobj["id"])
+ defrg, _ = urllib.parse.urldefrag(workflowobj["id"])
+ if cache is not None and defrg not in cache:
+ # If we haven't seen this file before, we want the raw file
+ # content (before preprocessing) to ensure that external
+ # references like $include haven't already been inlined.
+ scanobj = loadref("", workflowobj["id"])
metadata = scanobj
- sc_result = scandeps(uri, scanobj,
- loadref_fields,
- set(("$include", "location")),
- loadref, urljoin=document_loader.fetcher.urljoin,
- nestdirs=False)
+ with Perf(metrics, "scandeps include, location"):
+ sc_result = scandeps(uri, scanobj,
+ loadref_fields,
+ set(("$include", "location")),
+ loadref, urljoin=document_loader.fetcher.urljoin,
+ nestdirs=False)
- optional_deps = scandeps(uri, scanobj,
- loadref_fields,
- set(("$schemas",)),
- loadref, urljoin=document_loader.fetcher.urljoin,
- nestdirs=False)
+ with Perf(metrics, "scandeps $schemas"):
+ optional_deps = scandeps(uri, scanobj,
+ loadref_fields,
+ set(("$schemas",)),
+ loadref, urljoin=document_loader.fetcher.urljoin,
+ nestdirs=False)
- sc_result.extend(optional_deps)
+ if sc_result is None:
+ sc_result = []
+
+ if optional_deps is None:
+ optional_deps = []
+
+ if optional_deps:
+ sc_result.extend(optional_deps)
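+ # $schemas references are scanned separately and handed to ArvPathMapper
+ # as optional_deps (below), so an unresolvable schema does not fail the
+ # upload the way a missing input file would. The None checks guard
+ # against scandeps returning None rather than an empty list.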
sc = []
uuids = {}
sc.append(obj)
collect_uuids(obj)
- visit_class(workflowobj, ("File", "Directory"), collect_uuids)
- visit_class(sc_result, ("File", "Directory"), collect_uploads)
+ with Perf(metrics, "collect uuids"):
+ visit_class(workflowobj, ("File", "Directory"), collect_uuids)
+
+ with Perf(metrics, "collect uploads"):
+ visit_class(sc_result, ("File", "Directory"), collect_uploads)
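+ # visit_class recursively walks a document and applies the callback to
+ # every node whose "class" is in the given set, e.g. (hypothetical doc):
+ #
+ #   visit_class({"inp": {"class": "File", "location": "keep:..."}},
+ #               ("File", "Directory"), collect_uuids)
+ #   # -> collect_uuids is called once, with the File node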
# Resolve any collection uuids we found to portable data hashes
# and assign them to uuid_map
uuid_map = {}
fetch_uuids = list(uuids.keys())
- while fetch_uuids:
- # For a large number of fetch_uuids, API server may limit
- # response size, so keep fetching from API server has nothing
- # more to give us.
- lookups = arvrunner.api.collections().list(
- filters=[["uuid", "in", fetch_uuids]],
- count="none",
- select=["uuid", "portable_data_hash"]).execute(
- num_retries=arvrunner.num_retries)
+ with Perf(metrics, "fetch_uuids"):
+ while fetch_uuids:
+ # For a large number of fetch_uuids, the API server may limit
+ # response size, so keep fetching until the API server has
+ # nothing more to give us.
+ lookups = arvrunner.api.collections().list(
+ filters=[["uuid", "in", fetch_uuids]],
+ count="none",
+ select=["uuid", "portable_data_hash"]).execute(
+ num_retries=arvrunner.num_retries)
- if not lookups["items"]:
- break
+ if not lookups["items"]:
+ break
- for l in lookups["items"]:
- uuid_map[l["uuid"]] = l["portable_data_hash"]
+ for l in lookups["items"]:
+ uuid_map[l["uuid"]] = l["portable_data_hash"]
- fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]
+ fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]
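+ # Lookup pattern: rather than paging by offset, each pass asks only for
+ # the uuids still unresolved and stops on an empty response, e.g.:
+ #
+ #   pass 1: filters=[["uuid", "in", all uuids]]    -> first batch of items
+ #   pass 2: filters only uuids not yet in uuid_map -> next batch
+ #   pass 3: empty "items" -> done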
normalizeFilesDirs(sc)
- if include_primary and "id" in workflowobj:
- sc.append({"class": "File", "location": workflowobj["id"]})
+ if "id" in workflowobj:
+ defrg, _ = urllib.parse.urldefrag(workflowobj["id"])
+ if include_primary:
+ # make sure it's included
+ sc.append({"class": "File", "location": defrg})
+ else:
+ # make sure it's excluded
+ sc = [d for d in sc if d.get("location") != defrg]
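+ # workflowobj["id"] may carry a fragment (e.g. "wf.cwl#main"); comparing
+ # defragmented locations ensures the exclusion actually matches the
+ # entries in sc, which refer to the file as a whole.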
def visit_default(obj):
def defaults_are_optional(f):
else:
del discovered[d]
- mapper = ArvPathMapper(arvrunner, sc, "",
- "keep:%s",
- "keep:%s/%s",
- name=name,
- single_collection=True,
- optional_deps=optional_deps)
+ with Perf(metrics, "mapper"):
+ mapper = ArvPathMapper(arvrunner, sc, "",
+ "keep:%s",
+ "keep:%s/%s",
+ name=name,
+ single_collection=True,
+ optional_deps=optional_deps)
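+ # single_collection=True asks ArvPathMapper to pack the uploaded
+ # dependencies into one new Keep collection instead of one per file,
+ # keeping the number of collection records and API round trips down.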
keeprefs = set()
def addkeepref(k):
p["location"] = "keep:%s%s" % (uuid_map[uuid], gp.groups()[1] if gp.groups()[1] else "")
p[collectionUUID] = uuid
- visit_class(workflowobj, ("File", "Directory"), setloc)
- visit_class(discovered, ("File", "Directory"), setloc)
+ with Perf(metrics, "setloc"):
+ visit_class(workflowobj, ("File", "Directory"), setloc)
+ visit_class(discovered, ("File", "Directory"), setloc)
if discovered_secondaryfiles is not None:
for d in discovered:
def upload_workflow_deps(arvrunner, tool, runtimeContext):
# Ensure that Docker images needed by this workflow are available
- upload_docker(arvrunner, tool, runtimeContext)
+ with Perf(metrics, "upload_docker"):
+ upload_docker(arvrunner, tool, runtimeContext)
document_loader = tool.doc_loader
merged_map = {}
-
+ tool_dep_cache = {}
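+ # tool_dep_cache is shared by every upload_tool_deps call below, so a
+ # document referenced from several steps (a common requirements file, a
+ # subworkflow) is fetched and parsed only once.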
def upload_tool_deps(deptool):
if "id" in deptool:
discovered_secondaryfiles = {}
- pm = upload_dependencies(arvrunner,
- "%s dependencies" % (shortname(deptool["id"])),
- document_loader,
- deptool,
- deptool["id"],
- False,
- runtimeContext,
- include_primary=False,
- discovered_secondaryfiles=discovered_secondaryfiles)
+ with Perf(metrics, "upload_dependencies %s" % shortname(deptool["id"])):
+ pm = upload_dependencies(arvrunner,
+ "%s dependencies" % (shortname(deptool["id"])),
+ document_loader,
+ deptool,
+ deptool["id"],
+ False,
+ runtimeContext,
+ include_primary=False,
+ discovered_secondaryfiles=discovered_secondaryfiles,
+ cache=tool_dep_cache)
document_loader.idx[deptool["id"]] = deptool
toolmap = {}
for k,v in pm.items():