+#
+# Demonstrate Arvados federation features. This performs a parallel
+# scatter over some arbitrary number of files and federated clusters,
+# then joins the results.
+#
cwlVersion: v1.0
class: Workflow
$namespaces:
+ # When using Arvados extensions to CWL, must declare the 'arv' namespace
arv: "http://arvados.org/cwl#"
+
requirements:
InlineJavascriptRequirement: {}
- DockerRequirement:
- dockerPull: arvados/fed-test:scatter-gather
ScatterFeatureRequirement: {}
StepInputExpressionRequirement: {}
+
+ DockerRequirement:
+ # Replace this with your own Docker container
+ dockerPull: arvados/jobs
+
+ # Define a record type so we can conveniently associate the input
+ # file, the cluster on which the file lives, and the project on that
+ # cluster that will own the container requests and intermediate
+ # outputs.
SchemaDefRequirement:
types:
- name: FileOnCluster
file: File
cluster: string
project: string
+
inputs:
+ # Expect an array of FileOnCluster records (defined above)
+ # as our input.
shards:
type:
type: array
items: FileOnCluster
+
outputs:
+ # Will produce an output file with the results of the distributed
+ # analysis jobs joined together.
joined:
type: File
outputSource: gather-results/joined
+
steps:
distributed-analysis:
in:
- shards: shards
- inp: {valueFrom: $(inputs.shards.file)}
- scatter: shards
+ # Take "shards" array as input, we scatter over it below.
+ shard: shards
+
+ # Use an expression to extract the "file" field to assign to the
+ # "inp" parameter of the tool.
+ inp: {valueFrom: $(inputs.shard.file)}
+
+ # Scatter over shards, this means creating a parallel job for each
+ # element in the "shards" array. Expressions are evaluated for
+ # each element.
+ scatter: shard
+
+ # Specify the cluster target for this job. This means each
+ # separate scatter job will execute on the cluster that was
+ # specified in the "cluster" field.
+ #
+ # Arvados handles streaming data between clusters, for example,
+ # the Docker image containing the code for a particular tool will
+ # be fetched on demand, as long as it is available somewhere in
+ # the federation.
hints:
arv:ClusterTarget:
- cluster_id: $(inputs.shards.cluster)
- project_uuid: $(inputs.shards.project)
+ cluster_id: $(inputs.shard.cluster)
+ project_uuid: $(inputs.shard.project)
+
out: [out]
run: md5sum.cwl
+
+ # Collect the results of the distributed step and join them into a
+ # single output file. Arvados handles streaming inputs,
+ # intermediate results, and outputs between clusters on demand.
gather-results:
in:
inp: distributed-analysis/out