15332: Updated federation demo to do something a little more interesting.
authorPeter Amstutz <pamstutz@veritasgenetics.com>
Mon, 10 Jun 2019 21:25:17 +0000 (17:25 -0400)
committerPeter Amstutz <pamstutz@veritasgenetics.com>
Thu, 13 Jun 2019 19:13:34 +0000 (15:13 -0400)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz@veritasgenetics.com>

19 files changed:
.licenseignore
doc/_includes/_federated_cwl.liquid
doc/user/cwl/federated-workflows.html.textile.liquid
doc/user/cwl/federated/FileOnCluster.yml [new file with mode: 0644]
doc/user/cwl/federated/cat.cwl [deleted file]
doc/user/cwl/federated/colors_to_select.txt [new file with mode: 0644]
doc/user/cwl/federated/extract.cwl [new file with mode: 0644]
doc/user/cwl/federated/extract.py [new file with mode: 0644]
doc/user/cwl/federated/feddemo.cwl [moved from doc/user/cwl/federated/federated.cwl with 51% similarity]
doc/user/cwl/federated/file-on-clsr1.dat [deleted file]
doc/user/cwl/federated/file-on-clsr2.dat [deleted file]
doc/user/cwl/federated/file-on-clsr3.dat [deleted file]
doc/user/cwl/federated/items1.csv [new file with mode: 0644]
doc/user/cwl/federated/items2.csv [new file with mode: 0644]
doc/user/cwl/federated/items3.csv [new file with mode: 0644]
doc/user/cwl/federated/md5sum.cwl [deleted file]
doc/user/cwl/federated/merge.cwl [new file with mode: 0644]
doc/user/cwl/federated/merge.py [new file with mode: 0644]
doc/user/cwl/federated/shards.yml

index a9b6f5f6cafdbedb1a202f5f9c0e3526e7a54549..28ddf9c290a2a77adcb1f60b8ecbb806a81d48fd 100644 (file)
@@ -15,6 +15,7 @@ build/package-test-dockerfiles/ubuntu1604/etc-apt-preferences.d-arvados
 doc/fonts/*
 doc/_includes/_config_default_yml.liquid
 doc/user/cwl/federated/*
+doc/_includes/_federated_cwl.liquid
 */docker_image
 docker/jobs/apt.arvados.org*.list
 docker/jobs/1078ECD7.key
index 59a629c5acf0d6db2ff8d9121fb8f16b26b5a715..cfe8407e2f75af222e5ffe34de0dd10d65b8d6fc 120000 (symlink)
@@ -1 +1 @@
-../user/cwl/federated/federated.cwl
\ No newline at end of file
+../user/cwl/federated/feddemo.cwl
\ No newline at end of file
index 7e2150dccb20e7784785b36f6bd4abe1905ff25a..01d656dd1519ffa337d10c9bc1ce047c6a133f2e 100644 (file)
@@ -36,7 +36,7 @@ At this time, remote steps of a workflow on Workbench are not displayed.  As a w
 Run it like any other workflow:
 
 <notextile>
-<pre><code>~$ <span class="userinput">arvados-cwl-runner federated.cwl shards.cwl</span>
+<pre><code>~$ <span class="userinput">arvados-cwl-runner feddemo.cwl shards.yml</span>
 </code></pre>
 </notextile>
 
diff --git a/doc/user/cwl/federated/FileOnCluster.yml b/doc/user/cwl/federated/FileOnCluster.yml
new file mode 100644 (file)
index 0000000..363d071
--- /dev/null
@@ -0,0 +1,5 @@
+name: FileOnCluster
+type: record
+fields:
+  file: File
+  cluster: string
\ No newline at end of file
diff --git a/doc/user/cwl/federated/cat.cwl b/doc/user/cwl/federated/cat.cwl
deleted file mode 100644 (file)
index 17132fe..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-cwlVersion: v1.0
-class: CommandLineTool
-inputs:
-  inp:
-    type: File[]
-    inputBinding: {}
-outputs:
-  joined: stdout
-stdout: joined.txt
-baseCommand: cat
diff --git a/doc/user/cwl/federated/colors_to_select.txt b/doc/user/cwl/federated/colors_to_select.txt
new file mode 100644 (file)
index 0000000..620b008
--- /dev/null
@@ -0,0 +1,2 @@
+green
+blue
diff --git a/doc/user/cwl/federated/extract.cwl b/doc/user/cwl/federated/extract.cwl
new file mode 100644 (file)
index 0000000..f8fdedb
--- /dev/null
@@ -0,0 +1,22 @@
+cwlVersion: v1.0
+class: CommandLineTool
+requirements:
+  SchemaDefRequirement:
+    types:
+      - $import: FileOnCluster.yml
+inputs:
+  select_column: string
+  select_values: File
+  dataset: 'FileOnCluster.yml#FileOnCluster'
+  extract_py:
+    type: File
+    default:
+      class: File
+      location: extract.py
+outputs:
+  out:
+    type: File
+    outputBinding:
+      glob: extracted.csv
+
+arguments: [python, $(inputs.extract_py), $(inputs.select_column), $(inputs.select_values), $(inputs.dataset.file), $(inputs.dataset.cluster)]
diff --git a/doc/user/cwl/federated/extract.py b/doc/user/cwl/federated/extract.py
new file mode 100644 (file)
index 0000000..2d2c49d
--- /dev/null
@@ -0,0 +1,45 @@
+# Extract the rows of a CSV file where one column holds a selected value.
+#
+# Usage: extract.py SELECT_COLUMN SELECT_VALUES DATASET CLUSTER
+#
+# Writes "extracted.csv" in the current directory.  It holds the dataset's
+# header and matching rows, each with a leading "cluster" column.
+
+import csv
+import sys
+
+select_column = sys.argv[1]
+select_values = sys.argv[2]
+dataset = sys.argv[3]
+cluster = sys.argv[4]
+
+# One accepted value per line; surrounding whitespace is stripped.
+with open(select_values, "rt") as sv:
+    selectvals = [s.strip() for s in sv]
+
+print("selectvals", selectvals)
+
+# "with" closes both files on every exit path, so the rows are flushed to
+# extracted.csv before the script ends.
+with open(dataset, "rt") as dsfile:
+    ds = csv.reader(dsfile)
+    header = next(ds)
+    print("header is", header)
+    columnindex = None
+    for i, v in enumerate(header):
+        if v == select_column:
+            columnindex = i
+    if columnindex is None:
+        raise Exception("Column %s not found" % select_column)
+
+    print("column index", columnindex)
+
+    with open("extracted.csv", "wt") as exfile:
+        ex = csv.writer(exfile)
+        ex.writerow(["cluster"]+list(header))
+
+        for row in ds:
+            # Blank or short lines have no value in the selected column;
+            # skip them instead of failing with IndexError.
+            if len(row) > columnindex and row[columnindex] in selectvals:
+                ex.writerow([cluster]+list(row))
similarity index 51%
rename from doc/user/cwl/federated/federated.cwl
rename to doc/user/cwl/federated/feddemo.cwl
index 5314a7675b2e6f64c08351cec9e2ccb893a77bab..a68ff444a67dd40db13b6aa47dce20e8afc67922 100644 (file)
@@ -1,8 +1,11 @@
-#
-# Demonstrate Arvados federation features.  This performs a parallel
-# scatter over some arbitrary number of files and federated clusters,
-# then joins the results.
-#
+# Demonstrate Arvados federation features.  This example searches a
+# list of CSV files that are hosted on different Arvados clusters.
+# For each file, send a task to the remote cluster which will scan the
+# file and extract the rows where the column "select_column" has one
+# of the values appearing in the "select_values" file.  The home
+# cluster then runs a task which pulls the results from the remote
+# clusters and merges the results to produce a final report.
+
 cwlVersion: v1.0
 class: Workflow
 $namespaces:
@@ -19,50 +22,45 @@ requirements:
     dockerPull: arvados/jobs
 
   # Define a record type so we can conveniently associate the input
-  # file, the cluster on which the file lives, and the project on that
-  # cluster that will own the container requests and intermediate
-  # outputs.
+  # file and the cluster where the task should run.
   SchemaDefRequirement:
     types:
-      - name: FileOnCluster
-        type: record
-        fields:
-          file: File
-          cluster: string
-          project: string
+      - $import: FileOnCluster.yml
 
 inputs:
-  # Expect an array of FileOnCluster records (defined above)
-  # as our input.
-  shards:
+  select_column: string
+  select_values: File
+
+  datasets:
     type:
       type: array
-      items: FileOnCluster
+      items: FileOnCluster.yml#FileOnCluster
+
+  intermediate_projects: string[]
 
 outputs:
   # Will produce an output file with the results of the distributed
-  # analysis jobs joined together.
+  # analysis jobs merged together.
   joined:
     type: File
-    outputSource: gather-results/joined
+    outputSource: gather-results/out
 
 steps:
   distributed-analysis:
     in:
-      # Take "shards" array as input, we scatter over it below.
-      shard: shards
-
-      # Use an expression to extract the "file" field to assign to the
-      # "inp" parameter of the tool.
-      inp: {valueFrom: $(inputs.shard.file)}
+      select_column: select_column
+      select_values: select_values
+      dataset: datasets
+      intermediate_projects: intermediate_projects
 
     # Scatter over the datasets and their intermediate projects in
     # lockstep; this creates a parallel task for each pair of elements.
     # Expressions are evaluated for each pair.
-    scatter: shard
+    scatter: [dataset, intermediate_projects]
+    scatterMethod: dotproduct
 
-    # Specify the cluster target for this job.  This means each
-    # separate scatter job will execute on the cluster that was
+    # Specify the cluster target for this task.  This means each
+    # separate scatter task will execute on the cluster that was
     # specified in the "cluster" field.
     #
     # Arvados handles streaming data between clusters, for example,
@@ -71,17 +69,17 @@ steps:
     # the federation.
     hints:
       arv:ClusterTarget:
-        cluster_id: $(inputs.shard.cluster)
-        project_uuid: $(inputs.shard.project)
+        cluster_id: $(inputs.dataset.cluster)
+        project_uuid: $(inputs.intermediate_projects)
 
     out: [out]
-    run: md5sum.cwl
+    run: extract.cwl
 
   # Collect the results of the distributed step and join them into a
   # single output file.  Arvados handles streaming inputs,
   # intermediate results, and outputs between clusters on demand.
   gather-results:
     in:
-      inp: distributed-analysis/out
-    out: [joined]
-    run: cat.cwl
+      dataset: distributed-analysis/out
+    out: [out]
+    run: merge.cwl
diff --git a/doc/user/cwl/federated/file-on-clsr1.dat b/doc/user/cwl/federated/file-on-clsr1.dat
deleted file mode 100644 (file)
index e79f152..0000000
+++ /dev/null
@@ -1 +0,0 @@
-file-on-clsr1.dat
diff --git a/doc/user/cwl/federated/file-on-clsr2.dat b/doc/user/cwl/federated/file-on-clsr2.dat
deleted file mode 100644 (file)
index 9179dc8..0000000
+++ /dev/null
@@ -1 +0,0 @@
-file-on-clsr2.dat
diff --git a/doc/user/cwl/federated/file-on-clsr3.dat b/doc/user/cwl/federated/file-on-clsr3.dat
deleted file mode 100644 (file)
index 58b5902..0000000
+++ /dev/null
@@ -1 +0,0 @@
-file-on-clsr3.dat
diff --git a/doc/user/cwl/federated/items1.csv b/doc/user/cwl/federated/items1.csv
new file mode 100644 (file)
index 0000000..59d2d32
--- /dev/null
@@ -0,0 +1,29 @@
+color,item
+blue,ball
+yellow,ball
+red,ball
+green,book
+purple,book
+red,book
+yellow,flower
+purple,flower
+red,bicycle
+red,ball
+green,picture
+yellow,ball
+purple,flower
+yellow,ball
+green,bicycle
+orange,book
+green,book
+orange,picture
+blue,book
+orange,car
+yellow,flower
+purple,ball
+blue,book
+orange,book
+orange,book
+yellow,book
+orange,car
+yellow,car
diff --git a/doc/user/cwl/federated/items2.csv b/doc/user/cwl/federated/items2.csv
new file mode 100644 (file)
index 0000000..566dab7
--- /dev/null
@@ -0,0 +1,33 @@
+color,item
+green,bicycle
+red,flower
+blue,bicycle
+yellow,flower
+green,ball
+red,book
+red,bicycle
+yellow,ball
+blue,picture
+green,book
+orange,flower
+blue,ball
+orange,car
+green,book
+yellow,car
+orange,picture
+orange,car
+yellow,flower
+green,ball
+orange,car
+purple,book
+green,ball
+red,flower
+blue,car
+orange,flower
+blue,book
+blue,bicycle
+red,picture
+orange,flower
+orange,book
+blue,flower
+orange,book
diff --git a/doc/user/cwl/federated/items3.csv b/doc/user/cwl/federated/items3.csv
new file mode 100644 (file)
index 0000000..e820e45
--- /dev/null
@@ -0,0 +1,41 @@
+color,item
+purple,book
+green,book
+red,bicycle
+yellow,book
+orange,book
+green,car
+green,car
+blue,ball
+yellow,bicycle
+orange,book
+green,bicycle
+blue,flower
+red,bicycle
+purple,bicycle
+green,bicycle
+orange,ball
+yellow,car
+orange,ball
+red,ball
+red,car
+green,picture
+green,flower
+blue,picture
+green,car
+yellow,flower
+purple,flower
+green,ball
+yellow,bicycle
+orange,bicycle
+orange,flower
+yellow,picture
+purple,flower
+green,picture
+orange,car
+orange,picture
+yellow,car
+yellow,picture
+purple,picture
+purple,picture
+purple,flower
diff --git a/doc/user/cwl/federated/md5sum.cwl b/doc/user/cwl/federated/md5sum.cwl
deleted file mode 100644 (file)
index 9c78dc2..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-cwlVersion: v1.0
-class: CommandLineTool
-$namespaces:
-  arv: "http://arvados.org/cwl#"
-requirements:
-  InlineJavascriptRequirement: {}
-inputs:
-  inp:
-    type: File
-outputs:
-  out:
-    type: File
-    outputBinding:
-      glob: out.txt
-stdin: $(inputs.inp.path)
-stdout: out.txt
-arguments: ["md5sum", "-"]
diff --git a/doc/user/cwl/federated/merge.cwl b/doc/user/cwl/federated/merge.cwl
new file mode 100644 (file)
index 0000000..a60d619
--- /dev/null
@@ -0,0 +1,23 @@
+cwlVersion: v1.0
+class: CommandLineTool
+requirements:
+  SchemaDefRequirement:
+    types:
+      - $import: FileOnCluster.yml
+inputs:
+  dataset:
+    type: File[]
+    inputBinding:
+      position: 1
+  merge_py:
+    type: File
+    default:
+      class: File
+      location: merge.py
+outputs:
+  out:
+    type: File
+    outputBinding:
+      glob: merged.csv
+
+arguments: [python, $(inputs.merge_py)]
diff --git a/doc/user/cwl/federated/merge.py b/doc/user/cwl/federated/merge.py
new file mode 100644 (file)
index 0000000..03c79f2
--- /dev/null
@@ -0,0 +1,25 @@
+# Concatenate CSV files that share a header line into "merged.csv".
+#
+# Usage: merge.py FILE [FILE ...]
+#
+# The header is copied from the first non-empty input only; the first line
+# of every other input is dropped.
+
+import sys
+import csv
+
+wroteheader = False
+# "with" closes every file on all exit paths, flushing merged.csv.
+with open("merged.csv", "wt") as merged:
+    for s in sys.argv[1:]:
+        with open(s, "rt") as f:
+            # An empty input has no header; skip it instead of letting
+            # next() raise StopIteration.
+            header = next(f, None)
+            if header is None:
+                continue
+            if not wroteheader:
+                merged.write(header)
+                wroteheader = True
+            for l in f:
+                merged.write(l)
index ed8a83ab3f6ba19527b4dbd926f0bb6e838aacc0..14e346248dc3828c357a9c451bc30a0d2461c626 100644 (file)
@@ -1,18 +1,25 @@
-shards:
+select_column: color
+select_values:
+  class: File
+  location: colors_to_select.txt
+
+datasets:
   - cluster: clsr1
-    project: clsr1-j7d0g-qxc4jcji7n4lafx
     file:
       class: File
-      location: keep:485df2c5cec3207a32f49c42f1cdcca9+61/file-on-clsr1.dat
+      location: keep:0dcf9310e5bf0c07270416d3a0cd6a43+56/items1.csv
 
   - cluster: clsr2
-    project: clsr2-j7d0g-ivdrm1hyym21vkq
     file:
       class: File
-      location: keep:ae6e9c3e9bfa52a0122ecb489d8198ff+61/file-on-clsr2.dat
+      location: keep:12707d325a3f4687674b858bd32beae9+56/items2.csv
 
   - cluster: clsr3
-    project: clsr3-j7d0g-e3njz2s53lyb0ka
     file:
       class: File
-      location: keep:0b43a0ef9ea592d5d7b299978dfa8643+61/file-on-clsr3.dat
+      location: keep:dbff6bb7fc43176527af5eb9dec28871+56/items3.csv
+
+intermediate_projects:
+  - clsr1-j7d0g-qxc4jcji7n4lafx
+  - clsr2-j7d0g-e7r20egb8hlgn53
+  - clsr3-j7d0g-vrl00zoku9spnen