From 7ffc69693f4fd7f5db67fba3102d99cfba4b8fef Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Mon, 10 Jun 2019 17:25:17 -0400 Subject: [PATCH] 15332: Updated federation demo to do something a little more interesting. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- .licenseignore | 1 + doc/_includes/_federated_cwl.liquid | 2 +- .../federated-workflows.html.textile.liquid | 2 +- doc/user/cwl/federated/FileOnCluster.yml | 5 ++ doc/user/cwl/federated/cat.cwl | 14 ---- doc/user/cwl/federated/colors_to_select.txt | 2 + doc/user/cwl/federated/extract.cwl | 22 ++++++ doc/user/cwl/federated/extract.py | 31 +++++++++ .../federated/{federated.cwl => feddemo.cwl} | 68 +++++++++---------- doc/user/cwl/federated/file-on-clsr1.dat | 1 - doc/user/cwl/federated/file-on-clsr2.dat | 1 - doc/user/cwl/federated/file-on-clsr3.dat | 1 - doc/user/cwl/federated/items1.csv | 29 ++++++++ doc/user/cwl/federated/items2.csv | 33 +++++++++ doc/user/cwl/federated/items3.csv | 41 +++++++++++ doc/user/cwl/federated/md5sum.cwl | 21 ------ doc/user/cwl/federated/merge.cwl | 23 +++++++ doc/user/cwl/federated/merge.py | 15 ++++ doc/user/cwl/federated/shards.yml | 21 ++++-- 19 files changed, 251 insertions(+), 82 deletions(-) create mode 100644 doc/user/cwl/federated/FileOnCluster.yml delete mode 100644 doc/user/cwl/federated/cat.cwl create mode 100644 doc/user/cwl/federated/colors_to_select.txt create mode 100644 doc/user/cwl/federated/extract.cwl create mode 100644 doc/user/cwl/federated/extract.py rename doc/user/cwl/federated/{federated.cwl => feddemo.cwl} (51%) delete mode 100644 doc/user/cwl/federated/file-on-clsr1.dat delete mode 100644 doc/user/cwl/federated/file-on-clsr2.dat delete mode 100644 doc/user/cwl/federated/file-on-clsr3.dat create mode 100644 doc/user/cwl/federated/items1.csv create mode 100644 doc/user/cwl/federated/items2.csv create mode 100644 doc/user/cwl/federated/items3.csv delete mode 100644 doc/user/cwl/federated/md5sum.cwl create mode 100644 doc/user/cwl/federated/merge.cwl create mode 100644 doc/user/cwl/federated/merge.py diff --git a/.licenseignore b/.licenseignore index a9b6f5f6ca..28ddf9c290 100644 --- a/.licenseignore +++ b/.licenseignore @@ -15,6 +15,7 @@ build/package-test-dockerfiles/ubuntu1604/etc-apt-preferences.d-arvados doc/fonts/* doc/_includes/_config_default_yml.liquid doc/user/cwl/federated/* +doc/_includes/_federated_cwl.liquid */docker_image docker/jobs/apt.arvados.org*.list docker/jobs/1078ECD7.key diff --git a/doc/_includes/_federated_cwl.liquid b/doc/_includes/_federated_cwl.liquid index 59a629c5ac..cfe8407e2f 120000 --- a/doc/_includes/_federated_cwl.liquid +++ b/doc/_includes/_federated_cwl.liquid @@ -1 +1 @@ -../user/cwl/federated/federated.cwl \ No newline at end of file +../user/cwl/federated/feddemo.cwl \ No newline at end of file diff --git a/doc/user/cwl/federated-workflows.html.textile.liquid b/doc/user/cwl/federated-workflows.html.textile.liquid index 7e2150dccb..01d656dd15 100644 --- a/doc/user/cwl/federated-workflows.html.textile.liquid +++ b/doc/user/cwl/federated-workflows.html.textile.liquid @@ -36,7 +36,7 @@ At this time, remote steps of a workflow on Workbench are not displayed. As a w Run it like any other workflow: -
~$ arvados-cwl-runner federated.cwl shards.cwl
+
~$ arvados-cwl-runner feddemo.cwl shards.cwl
 
diff --git a/doc/user/cwl/federated/FileOnCluster.yml b/doc/user/cwl/federated/FileOnCluster.yml new file mode 100644 index 0000000000..363d0717a9 --- /dev/null +++ b/doc/user/cwl/federated/FileOnCluster.yml @@ -0,0 +1,5 @@ +name: FileOnCluster +type: record +fields: + file: File + cluster: string \ No newline at end of file diff --git a/doc/user/cwl/federated/cat.cwl b/doc/user/cwl/federated/cat.cwl deleted file mode 100644 index 17132fe61c..0000000000 --- a/doc/user/cwl/federated/cat.cwl +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (C) The Arvados Authors. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -cwlVersion: v1.0 -class: CommandLineTool -inputs: - inp: - type: File[] - inputBinding: {} -outputs: - joined: stdout -stdout: joined.txt -baseCommand: cat diff --git a/doc/user/cwl/federated/colors_to_select.txt b/doc/user/cwl/federated/colors_to_select.txt new file mode 100644 index 0000000000..620b0084bd --- /dev/null +++ b/doc/user/cwl/federated/colors_to_select.txt @@ -0,0 +1,2 @@ +green +blue diff --git a/doc/user/cwl/federated/extract.cwl b/doc/user/cwl/federated/extract.cwl new file mode 100644 index 0000000000..f8fdedbd88 --- /dev/null +++ b/doc/user/cwl/federated/extract.cwl @@ -0,0 +1,22 @@ +cwlVersion: v1.0 +class: CommandLineTool +requirements: + SchemaDefRequirement: + types: + - $import: FileOnCluster.yml +inputs: + select_column: string + select_values: File + dataset: 'FileOnCluster.yml#FileOnCluster' + extract_py: + type: File + default: + class: File + location: extract.py +outputs: + out: + type: File + outputBinding: + glob: extracted.csv + +arguments: [python, $(inputs.extract_py), $(inputs.select_column), $(inputs.select_values), $(inputs.dataset.file), $(inputs.dataset.cluster)] diff --git a/doc/user/cwl/federated/extract.py b/doc/user/cwl/federated/extract.py new file mode 100644 index 0000000000..2d2c49dcb3 --- /dev/null +++ b/doc/user/cwl/federated/extract.py @@ -0,0 +1,31 @@ +import csv +import sys + +select_column = sys.argv[1] +select_values = sys.argv[2] +dataset = sys.argv[3] +cluster = sys.argv[4] + +sv = open(select_values, "rt") +selectvals = [s.strip() for s in sv] + +print("selectvals", selectvals) + +ds = csv.reader(open(dataset, "rt")) +header = next(ds) +print("header is", header) +columnindex = None +for i,v in enumerate(header): + if v == select_column: + columnindex = i +if columnindex is None: + raise Exception("Column %s not found" % select_column) + +print("column index", columnindex) + +ex = csv.writer(open("extracted.csv", "wt")) +ex.writerow(["cluster"]+list(header)) + +for row in ds: + if row[columnindex] in selectvals: + ex.writerow([cluster]+list(row)) diff --git a/doc/user/cwl/federated/federated.cwl b/doc/user/cwl/federated/feddemo.cwl similarity index 51% rename from doc/user/cwl/federated/federated.cwl rename to doc/user/cwl/federated/feddemo.cwl index 5314a7675b..a68ff444a6 100644 --- a/doc/user/cwl/federated/federated.cwl +++ b/doc/user/cwl/federated/feddemo.cwl @@ -1,8 +1,11 @@ -# -# Demonstrate Arvados federation features. This performs a parallel -# scatter over some arbitrary number of files and federated clusters, -# then joins the results. -# +# Demonstrate Arvados federation features. This example searches a +# list of CSV files that are hosted on different Arvados clusters. +# For each file, send a task to the remote cluster which will scan +# file and extracts the rows where the column "select_column" has one +# of the values appearing in the "select_values" file. The home +# cluster then runs a task which pulls the results from the remote +# clusters and merges the results to produce a final report. + cwlVersion: v1.0 class: Workflow $namespaces: @@ -19,50 +22,45 @@ requirements: dockerPull: arvados/jobs # Define a record type so we can conveniently associate the input - # file, the cluster on which the file lives, and the project on that - # cluster that will own the container requests and intermediate - # outputs. + # file and the cluster where the task should run. SchemaDefRequirement: types: - - name: FileOnCluster - type: record - fields: - file: File - cluster: string - project: string + - $import: FileOnCluster.yml inputs: - # Expect an array of FileOnCluster records (defined above) - # as our input. - shards: + select_column: string + select_values: File + + datasets: type: type: array - items: FileOnCluster + items: FileOnCluster.yml#FileOnCluster + + intermediate_projects: string[] outputs: # Will produce an output file with the results of the distributed - # analysis jobs joined together. + # analysis jobs merged together. joined: type: File - outputSource: gather-results/joined + outputSource: gather-results/out steps: distributed-analysis: in: - # Take "shards" array as input, we scatter over it below. - shard: shards - - # Use an expression to extract the "file" field to assign to the - # "inp" parameter of the tool. - inp: {valueFrom: $(inputs.shard.file)} + select_column: select_column + select_values: select_values + dataset: datasets + intermediate_projects: intermediate_projects # Scatter over shards, this means creating a parallel job for each # element in the "shards" array. Expressions are evaluated for # each element. - scatter: shard + scatter: [dataset, intermediate_projects] + scatterMethod: dotproduct - # Specify the cluster target for this job. This means each - # separate scatter job will execute on the cluster that was + # Specify the cluster target for this task. This means each + # separate scatter task will execute on the cluster that was # specified in the "cluster" field. # # Arvados handles streaming data between clusters, for example, @@ -71,17 +69,17 @@ steps: # the federation. hints: arv:ClusterTarget: - cluster_id: $(inputs.shard.cluster) - project_uuid: $(inputs.shard.project) + cluster_id: $(inputs.dataset.cluster) + project_uuid: $(inputs.intermediate_projects) out: [out] - run: md5sum.cwl + run: extract.cwl # Collect the results of the distributed step and join them into a # single output file. Arvados handles streaming inputs, # intermediate results, and outputs between clusters on demand. gather-results: in: - inp: distributed-analysis/out - out: [joined] - run: cat.cwl + dataset: distributed-analysis/out + out: [out] + run: merge.cwl diff --git a/doc/user/cwl/federated/file-on-clsr1.dat b/doc/user/cwl/federated/file-on-clsr1.dat deleted file mode 100644 index e79f1526c6..0000000000 --- a/doc/user/cwl/federated/file-on-clsr1.dat +++ /dev/null @@ -1 +0,0 @@ -file-on-clsr1.dat diff --git a/doc/user/cwl/federated/file-on-clsr2.dat b/doc/user/cwl/federated/file-on-clsr2.dat deleted file mode 100644 index 9179dc8a5b..0000000000 --- a/doc/user/cwl/federated/file-on-clsr2.dat +++ /dev/null @@ -1 +0,0 @@ -file-on-clsr2.dat diff --git a/doc/user/cwl/federated/file-on-clsr3.dat b/doc/user/cwl/federated/file-on-clsr3.dat deleted file mode 100644 index 58b590233a..0000000000 --- a/doc/user/cwl/federated/file-on-clsr3.dat +++ /dev/null @@ -1 +0,0 @@ -file-on-clsr3.dat diff --git a/doc/user/cwl/federated/items1.csv b/doc/user/cwl/federated/items1.csv new file mode 100644 index 0000000000..59d2d322b4 --- /dev/null +++ b/doc/user/cwl/federated/items1.csv @@ -0,0 +1,29 @@ +color,item +blue,ball +yellow,ball +red,ball +green,book +purple,book +red,book +yellow,flower +purple,flower +red,bicycle +red,ball +green,picture +yellow,ball +purple,flower +yellow,ball +green,bicycle +orange,book +green,book +orange,picture +blue,book +orange,car +yellow,flower +purple,ball +blue,book +orange,book +orange,book +yellow,book +orange,car +yellow,car diff --git a/doc/user/cwl/federated/items2.csv b/doc/user/cwl/federated/items2.csv new file mode 100644 index 0000000000..566dab7751 --- /dev/null +++ b/doc/user/cwl/federated/items2.csv @@ -0,0 +1,33 @@ +color,item +green,bicycle +red,flower +blue,bicycle +yellow,flower +green,ball +red,book +red,bicycle +yellow,ball +blue,picture +green,book +orange,flower +blue,ball +orange,car +green,book +yellow,car +orange,picture +orange,car +yellow,flower +green,ball +orange,car +purple,book +green,ball +red,flower +blue,car +orange,flower +blue,book +blue,bicycle +red,picture +orange,flower +orange,book +blue,flower +orange,book diff --git a/doc/user/cwl/federated/items3.csv b/doc/user/cwl/federated/items3.csv new file mode 100644 index 0000000000..e820e45372 --- /dev/null +++ b/doc/user/cwl/federated/items3.csv @@ -0,0 +1,41 @@ +color,item +purple,book +green,book +red,bicycle +yellow,book +orange,book +green,car +green,car +blue,ball +yellow,bicycle +orange,book +green,bicycle +blue,flower +red,bicycle +purple,bicycle +green,bicycle +orange,ball +yellow,car +orange,ball +red,ball +red,car +green,picture +green,flower +blue,picture +green,car +yellow,flower +purple,flower +green,ball +yellow,bicycle +orange,bicycle +orange,flower +yellow,picture +purple,flower +green,picture +orange,car +orange,picture +yellow,car +yellow,picture +purple,picture +purple,picture +purple,flower diff --git a/doc/user/cwl/federated/md5sum.cwl b/doc/user/cwl/federated/md5sum.cwl deleted file mode 100644 index 9c78dc2685..0000000000 --- a/doc/user/cwl/federated/md5sum.cwl +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (C) The Arvados Authors. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -cwlVersion: v1.0 -class: CommandLineTool -$namespaces: - arv: "http://arvados.org/cwl#" -requirements: - InlineJavascriptRequirement: {} -inputs: - inp: - type: File -outputs: - out: - type: File - outputBinding: - glob: out.txt -stdin: $(inputs.inp.path) -stdout: out.txt -arguments: ["md5sum", "-"] diff --git a/doc/user/cwl/federated/merge.cwl b/doc/user/cwl/federated/merge.cwl new file mode 100644 index 0000000000..a60d619f9f --- /dev/null +++ b/doc/user/cwl/federated/merge.cwl @@ -0,0 +1,23 @@ +cwlVersion: v1.0 +class: CommandLineTool +requirements: + SchemaDefRequirement: + types: + - $import: FileOnCluster.yml +inputs: + dataset: + type: File[] + inputBinding: + position: 1 + merge_py: + type: File + default: + class: File + location: merge.py +outputs: + out: + type: File + outputBinding: + glob: merged.csv + +arguments: [python, $(inputs.merge_py)] diff --git a/doc/user/cwl/federated/merge.py b/doc/user/cwl/federated/merge.py new file mode 100644 index 0000000000..03c79f23cf --- /dev/null +++ b/doc/user/cwl/federated/merge.py @@ -0,0 +1,15 @@ +import sys +import csv + +merged = open("merged.csv", "wt") + +wroteheader = False +for s in sys.argv[1:]: + f = open(s, "rt") + header = next(f) + if not wroteheader: + merged.write(header) + wroteheader = True + for l in f: + merged.write(l) + f.close() diff --git a/doc/user/cwl/federated/shards.yml b/doc/user/cwl/federated/shards.yml index ed8a83ab3f..14e346248d 100644 --- a/doc/user/cwl/federated/shards.yml +++ b/doc/user/cwl/federated/shards.yml @@ -1,18 +1,25 @@ -shards: +select_column: color +select_values: + class: File + location: colors_to_select.txt + +datasets: - cluster: clsr1 - project: clsr1-j7d0g-qxc4jcji7n4lafx file: class: File - location: keep:485df2c5cec3207a32f49c42f1cdcca9+61/file-on-clsr1.dat + location: keep:0dcf9310e5bf0c07270416d3a0cd6a43+56/items1.csv - cluster: clsr2 - project: clsr2-j7d0g-ivdrm1hyym21vkq file: class: File - location: keep:ae6e9c3e9bfa52a0122ecb489d8198ff+61/file-on-clsr2.dat + location: keep:12707d325a3f4687674b858bd32beae9+56/items2.csv - cluster: clsr3 - project: clsr3-j7d0g-e3njz2s53lyb0ka file: class: File - location: keep:0b43a0ef9ea592d5d7b299978dfa8643+61/file-on-clsr3.dat + location: keep:dbff6bb7fc43176527af5eb9dec28871+56/items3.csv + +intermediate_projects: + - clsr1-j7d0g-qxc4jcji7n4lafx + - clsr2-j7d0g-e7r20egb8hlgn53 + - clsr3-j7d0g-vrl00zoku9spnen -- 2.30.2