# Example workflow demonstrating Arvados federation features.
#
# The inputs are CSV files hosted on different Arvados clusters. For
# each file, a task is sent to the remote cluster; that task scans the
# file and extracts the rows where the column "select_column" has one of
# the values appearing in the "select_values" file. The home cluster
# then runs a task that pulls the results from the remote clusters and
# merges them to produce a final report.

cwlVersion: v1.0
class: Workflow

$namespaces:
  # The 'arv' namespace must be declared to use Arvados extensions to CWL.
  arv: 'http://arvados.org/cwl#'

requirements:
  InlineJavascriptRequirement: {}
  ScatterFeatureRequirement: {}
  StepInputExpressionRequirement: {}

  DockerRequirement:
    # Substitute your own Docker container here.
    dockerPull: arvados/jobs

  # Record type that associates an input file with the cluster where
  # its task should run.
  SchemaDefRequirement:
    types:
      - $import: FileOnCluster.yml

inputs:
  select_column:
    type: string

  select_values:
    type: File

  datasets:
    type:
      type: array
      items: FileOnCluster.yml#FileOnCluster

  intermediate_projects:
    type:
      type: array
      items: string

outputs:
  # A single output file holding the merged results of the distributed
  # analysis jobs.
  joined:
    type: File
    outputSource: gather-results/out

steps:
  distributed-analysis:
    run: extract.cwl

    in:
      select_column: select_column
      select_values: select_values
      dataset: datasets
      intermediate_projects: intermediate_projects

    # Scatter over "dataset" and "intermediate_projects" together
    # (dotproduct): one parallel job is created per pair of elements,
    # and expressions are evaluated for each element.
    scatter:
      - dataset
      - intermediate_projects
    scatterMethod: dotproduct

    # Specify the cluster target for this task. Each separate scatter
    # task executes on the cluster named in the "cluster" field of its
    # dataset record.
    #
    # Arvados handles streaming data between clusters; for example, the
    # Docker image containing the code for a particular tool is fetched
    # on demand, as long as it is available somewhere in the federation.
    hints:
      arv:ClusterTarget:
        cluster_id: $(inputs.dataset.cluster)
        project_uuid: $(inputs.intermediate_projects)

    out:
      - out

  # Collect the results of the distributed step and join them into a
  # single output file. Arvados handles streaming inputs, intermediate
  # results, and outputs between clusters on demand.
  gather-results:
    run: merge.cwl

    in:
      dataset: distributed-analysis/out

    out:
      - out