#
# Demonstrate Arvados federation features. This workflow performs a
# parallel scatter over an arbitrary number of files and federated
# clusters, then joins the results into a single output file.
#
cwlVersion: v1.0
class: Workflow

$namespaces:
  # The 'arv' namespace must be declared in order to use the Arvados
  # extensions to CWL (used by the "arv:ClusterTarget" hint below).
  arv: "http://arvados.org/cwl#"

requirements:
  InlineJavascriptRequirement: {}
  # Needed for the "scatter" on the distributed-analysis step.
  ScatterFeatureRequirement: {}
  # Needed for the "valueFrom" expression on a step input.
  StepInputExpressionRequirement: {}

  DockerRequirement:
    # Replace this with your own Docker container.
    dockerPull: arvados/jobs

  # Define a record type so we can conveniently associate the input
  # file, the cluster on which the file lives, and the project on that
  # cluster that will own the container requests and intermediate
  # outputs.
  SchemaDefRequirement:
    types:
      - name: FileOnCluster
        type: record
        fields:
          file: File
          cluster: string
          project: string

inputs:
  # Expects an array of FileOnCluster records (defined above) as input.
  shards:
    type:
      type: array
      items: FileOnCluster

outputs:
  # Produces one output file containing the results of the distributed
  # analysis jobs joined together.
  joined:
    type: File
    outputSource: gather-results/joined

steps:
  distributed-analysis:
    in:
      # Take the "shards" array as input; we scatter over it below, so
      # inside this step "shard" is a single FileOnCluster record.
      shard: shards

      # Use an expression to extract the "file" field of the record and
      # assign it to the "inp" parameter of the tool.
      inp:
        valueFrom: $(inputs.shard.file)

    # Scatter over shards: this creates a parallel job for each element
    # of the "shards" array. Expressions are evaluated once per element.
    scatter: shard

    # Specify the cluster target for this job. Each separate scatter
    # job will execute on the cluster named in the record's "cluster"
    # field, under the project named in its "project" field.
    #
    # Arvados handles streaming data between clusters; for example, the
    # Docker image containing the code for a particular tool will be
    # fetched on demand, as long as it is available somewhere in the
    # federation.
    hints:
      arv:ClusterTarget:
        cluster_id: $(inputs.shard.cluster)
        project_uuid: $(inputs.shard.project)

    out: [out]
    # NOTE(review): md5sum.cwl is defined outside this file; it is
    # assumed to accept "inp" and produce "out" -- confirm.
    run: md5sum.cwl

  # Collect the results of the distributed step and join them into a
  # single output file. Arvados handles streaming inputs, intermediate
  # results, and outputs between clusters on demand.
  gather-results:
    in:
      inp: distributed-analysis/out
    out: [joined]
    # NOTE(review): cat.cwl is defined outside this file; it is assumed
    # to accept the array "inp" and produce "joined" -- confirm.
    run: cat.cwl