# Source: git.arvados.org - arvados.git - doc/user/cwl/federated/feddemo.cwl

# Demonstrate Arvados federation features.  This example searches a
# list of CSV files that are hosted on different Arvados clusters.
# For each file, it sends a task to the remote cluster, which scans
# the file and extracts the rows where the column "select_column" has
# one of the values appearing in the "select_values" file.  The home
# cluster then runs a task which pulls the results from the remote
# clusters and merges them to produce a final report.

# Document preamble: a CWL v1.0 Workflow.
cwlVersion: v1.0
class: Workflow

# Namespace prefixes used by extension fields in this document.
$namespaces:
  # When using Arvados extensions to CWL, must declare the 'arv' namespace
  arv: "http://arvados.org/cwl#"

# CWL features and runtime environment this workflow depends on.
requirements:
  # Feature flags: JavaScript expressions, scattered steps, and
  # expressions on step inputs.
  InlineJavascriptRequirement: {}
  ScatterFeatureRequirement: {}
  StepInputExpressionRequirement: {}

  DockerRequirement:
    # Replace this with your own Docker container
    dockerPull: arvados/jobs

  # Define a record type so we can conveniently associate the input
  # file and the cluster where the task should run.
  SchemaDefRequirement:
    types:
      - $import: FileOnCluster.yml

# Workflow-level inputs.
inputs:
  # Name of the CSV column to match against.
  select_column: string
  # File listing the values to select in "select_column".
  select_values: File

  # One FileOnCluster record per input dataset (record type imported
  # from FileOnCluster.yml).
  datasets:
    type:
      type: array
      items: FileOnCluster.yml#FileOnCluster

  # Scattered together with "datasets" (dotproduct) in the
  # distributed-analysis step, so entries are paired by position.
  intermediate_projects: string[]

# Workflow-level outputs.
outputs:
  # Will produce an output file with the results of the distributed
  # analysis jobs merged together.
  joined:
    type: File
    # Taken from the "out" output of the gather-results step.
    outputSource: gather-results/out

# Two steps: a scattered extraction step, followed by a single merge step.
steps:
  distributed-analysis:
    in:
      select_column: select_column
      select_values: select_values
      dataset: datasets
      intermediate_projects: intermediate_projects

    # Scatter over the "dataset" and "intermediate_projects" inputs
    # together.  With the dotproduct method the two arrays are paired
    # element by element, creating one parallel job per
    # (dataset, intermediate project) pair.  Expressions are evaluated
    # for each pair.
    scatter: [dataset, intermediate_projects]
    scatterMethod: dotproduct

    # Specify the cluster target for this task.  This means each
    # separate scatter task will execute on the cluster that was
    # specified in the "cluster" field of its dataset record, using
    # the paired intermediate project as project_uuid.
    #
    # Arvados handles streaming data between clusters, for example,
    # the Docker image containing the code for a particular tool will
    # be fetched on demand, as long as it is available somewhere in
    # the federation.
    hints:
      arv:ClusterTarget:
        cluster_id: $(inputs.dataset.cluster)
        project_uuid: $(inputs.intermediate_projects)

    out: [out]
    run: extract.cwl

  # Collect the results of the distributed step and join them into a
  # single output file.  Arvados handles streaming inputs,
  # intermediate results, and outputs between clusters on demand.
  gather-results:
    in:
      # The scattered step's "out" results feed the merge tool's
      # "dataset" input.
      dataset: distributed-analysis/out
    out: [out]
    run: merge.cwl