20462: Refactor "common prefix" function & fix fencepost errors 20462-workflow-prefix
authorPeter Amstutz <peter.amstutz@curii.com>
Mon, 1 May 2023 14:58:25 +0000 (10:58 -0400)
committerPeter Amstutz <peter.amstutz@curii.com>
Mon, 1 May 2023 14:58:25 +0000 (10:58 -0400)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

sdk/cwl/arvados_cwl/arvworkflow.py
sdk/cwl/tests/test_util.py

index 895676565d53f6b817ac0ad555330aa4b12781e4..bd32e9f81b7f4f714c26ce9989e26e41bb7349b9 100644 (file)
@@ -235,6 +235,7 @@ def fix_schemadef(req, baseuri, urlexpander, merged_map, jobmapper, pdh):
             merged_map[mm].resolved[r] = rename
     return req
 
+
 def drop_ids(d):
     if isinstance(d, MutableSequence):
         for i, s in enumerate(d):
@@ -247,6 +248,32 @@ def drop_ids(d):
             drop_ids(d[field])
 
 
+def common_prefix(firstfile, all_files):
+    n = 0
+    allmatch = True
+    if not firstfile:
+        return ""
+
+    while allmatch and n < len(firstfile)-1:
+        n += 1
+        for f in all_files:
+            if len(f)-1 < n:
+                n -= 1
+                allmatch = False
+                break
+            if f[n] != firstfile[n]:
+                allmatch = False
+                break
+
+    while n > 0 and firstfile[n] != "/":
+        n -= 1
+
+    if firstfile[n] == "/":
+        n += 1
+
+    return firstfile[:n]
+
+
 def upload_workflow(arvRunner, tool, job_order, project_uuid,
                         runtimeContext,
                         uuid=None,
@@ -280,22 +307,7 @@ def upload_workflow(arvRunner, tool, job_order, project_uuid,
     # Find the longest common prefix among all the file names.  We'll
     # use this to recreate the directory structure in a keep
     # collection with correct relative references.
-    n = 7
-    allmatch = True
-    if firstfile:
-        while allmatch:
-            n += 1
-            for f in all_files:
-                if len(f)-1 < n:
-                    n -= 1
-                    allmatch = False
-                    break
-                if f[n] != firstfile[n]:
-                    allmatch = False
-                    break
-
-        while firstfile[n] != "/":
-            n -= 1
+    prefix = common_prefix(firstfile, all_files)
 
     col = arvados.collection.Collection(api_client=arvRunner.api)
 
@@ -332,22 +344,22 @@ def upload_workflow(arvRunner, tool, job_order, project_uuid,
         update_refs(result, w, tool.doc_loader.expand_url, merged_map, jobmapper, runtimeContext, "", "")
 
         # Write the updated file to the collection.
-        with col.open(w[n+1:], "wt") as f:
+        with col.open(w[len(prefix):], "wt") as f:
             if export_as_json:
                 json.dump(result, f, indent=4, separators=(',',': '))
             else:
                 yamlloader.dump(result, stream=f)
 
         # Also store a verbatim copy of the original files
-        with col.open(os.path.join("original", w[n+1:]), "wt") as f:
+        with col.open(os.path.join("original", w[len(prefix):]), "wt") as f:
             f.write(text)
 
 
     # Upload files referenced by $include directives, these are used
     # unchanged and don't need to be updated.
     for w in include_files:
-        with col.open(w[n+1:], "wb") as f1:
-            with col.open(os.path.join("original", w[n+1:]), "wb") as f3:
+        with col.open(w[len(prefix):], "wb") as f1:
+            with col.open(os.path.join("original", w[len(prefix):]), "wb") as f3:
                 with open(uri_file_path(w), "rb") as f2:
                     dat = f2.read(65536)
                     while dat:
@@ -361,7 +373,7 @@ def upload_workflow(arvRunner, tool, job_order, project_uuid,
     if git_info and git_info.get("http://arvados.org/cwl#gitDescribe"):
         toolname = "%s (%s)" % (toolname, git_info.get("http://arvados.org/cwl#gitDescribe"))
 
-    toolfile = tool.tool["id"][n+1:]
+    toolfile = tool.tool["id"][len(prefix):]
 
     properties = {
         "type": "workflow",
index 1209b88d8eb6d4a2d70d5632dfbc34e367ccf257..183e0fcc698eccf85613590ccd2fa90fb93e3ca4 100644 (file)
@@ -11,6 +11,7 @@ import httplib2
 
 from arvados_cwl.util import *
 from arvados.errors import ApiError
+from arvados_cwl.arvworkflow import common_prefix
 
 class MockDateTime(datetime.datetime):
     @classmethod
@@ -53,4 +54,19 @@ class TestUtil(unittest.TestCase):
         logger = mock.MagicMock()
 
         current_container = get_current_container(api, num_retries=0, logger=logger)
-        self.assertEqual(current_container, None)
\ No newline at end of file
+        self.assertEqual(current_container, None)
+
+    def test_common_prefix(self):
+        self.assertEqual(common_prefix("file:///foo/bar", ["file:///foo/bar/baz"]), "file:///foo/")
+        self.assertEqual(common_prefix("file:///foo", ["file:///foo", "file:///foo/bar", "file:///foo/bar/"]), "file:///")
+        self.assertEqual(common_prefix("file:///foo/", ["file:///foo/", "file:///foo/bar", "file:///foo/bar/"]), "file:///foo/")
+        self.assertEqual(common_prefix("file:///foo/bar", ["file:///foo/bar", "file:///foo/baz", "file:///foo/quux/q2"]), "file:///foo/")
+        self.assertEqual(common_prefix("file:///foo/bar/", ["file:///foo/bar/", "file:///foo/baz", "file:///foo/quux/q2"]), "file:///foo/")
+        self.assertEqual(common_prefix("file:///foo/bar/splat", ["file:///foo/bar/splat", "file:///foo/baz", "file:///foo/quux/q2"]), "file:///foo/")
+        self.assertEqual(common_prefix("file:///foo/bar/splat", ["file:///foo/bar/splat", "file:///nope", "file:///foo/quux/q2"]), "file:///")
+        self.assertEqual(common_prefix("file:///blub/foo", ["file:///blub/foo", "file:///blub/foo/bar", "file:///blub/foo/bar/"]), "file:///blub/")
+
+        # sanity check, the subsequent code strips off the prefix so
+        # just confirm the logic doesn't have a fencepost error
+        prefix = "file:///"
+        self.assertEqual("file:///foo/bar"[len(prefix):], "foo/bar")