1 # Demonstrate Arvados federation features. This example searches a
2 # list of CSV files that are hosted on different Arvados clusters.
3 # For each file, send a task to the remote cluster which will scan the
4 # file and extract the rows where the column "select_column" has one
5 # of the values appearing in the "select_values" file. The home
6 # cluster then runs a task which pulls the results from the remote
7 # clusters and merges the results to produce a final report.
12 # When using Arvados extensions to CWL, you must declare the 'arv' namespace
13 arv: "http://arvados.org/cwl#"
16 InlineJavascriptRequirement: {}
17 ScatterFeatureRequirement: {}
18 StepInputExpressionRequirement: {}
21 # Replace this with your own Docker container
22 dockerPull: arvados/jobs
24 # Define a record type so we can conveniently associate the input
25 # file and the cluster where the task should run.
28 - $import: FileOnCluster.yml
37 items: FileOnCluster.yml#FileOnCluster
39 intermediate_projects: string[]
42 # Will produce an output file with the results of the distributed
43 # analysis jobs merged together.
46 outputSource: gather-results/out
51 select_column: select_column
52 select_values: select_values
54 intermediate_projects: intermediate_projects
56 # Scatter over the "dataset" and "intermediate_projects" arrays, creating a
57 # parallel job for each pair of elements. Expressions are evaluated for
59 scatter: [dataset, intermediate_projects]
60 scatterMethod: dotproduct
62 # Specify the cluster target for this task. This means each
63 # separate scatter task will execute on the cluster that was
64 # specified in the "cluster" field.
66 # Arvados handles streaming data between clusters; for example,
67 # the Docker image containing the code for a particular tool will
68 # be fetched on demand, as long as it is available somewhere in
72 cluster_id: $(inputs.dataset.cluster)
73 project_uuid: $(inputs.intermediate_projects)
78 # Collect the results of the distributed step and join them into a
79 # single output file. Arvados handles streaming inputs,
80 # intermediate results, and outputs between clusters on demand.
83 dataset: distributed-analysis/out