git.arvados.org - arvados.git/blob - doc/user/cwl/federated/federated.cwl

#
# Demonstrate Arvados federation features. This performs a parallel
# scatter over some arbitrary number of files and federated clusters,
# then joins the results.
#

# Document header: CWL spec version and document class.
cwlVersion: v1.0
class: Workflow

$namespaces:
  # When using Arvados extensions to CWL, the 'arv' namespace must be
  # declared so that 'arv:'-prefixed names resolve.
  arv: "http://arvados.org/cwl#"

requirements:
  # CWL feature flags used by this workflow. The steps section uses
  # `scatter` (ScatterFeatureRequirement) and `valueFrom` on a step
  # input (StepInputExpressionRequirement).
  InlineJavascriptRequirement: {}
  ScatterFeatureRequirement: {}
  StepInputExpressionRequirement: {}

  DockerRequirement:
    # Replace this with your own Docker container
    dockerPull: arvados/jobs

  # Define a record type so we can conveniently associate the input
  # file, the cluster on which the file lives, and the project on that
  # cluster that will own the container requests and intermediate
  # outputs.
  SchemaDefRequirement:
    types:
      - name: FileOnCluster
        type: record
        fields:
          file: File
          cluster: string
          project: string

inputs:
  # Expect an array of FileOnCluster records (the record type declared
  # under SchemaDefRequirement) as our input.
  shards:
    type:
      type: array
      items: FileOnCluster

outputs:
  # Produces a single output file containing the results of the
  # distributed analysis jobs joined together. The value is taken from
  # the "joined" output of the "gather-results" step.
  joined:
    type: File
    outputSource: gather-results/joined

steps:
  distributed-analysis:
    in:
      # Take the "shards" array as input; we scatter over it below.
      shard: shards

      # Use an expression to extract the "file" field to assign to the
      # "inp" parameter of the tool.
      inp: {valueFrom: $(inputs.shard.file)}

    # Scatter over shards: this creates a parallel job for each
    # element in the "shards" array. Expressions are evaluated for
    # each element.
    scatter: shard

    # Specify the cluster target for this job. This means each
    # separate scatter job will execute on the cluster that was
    # specified in the "cluster" field, owned by the project given in
    # the "project" field.
    #
    # Arvados handles streaming data between clusters; for example,
    # the Docker image containing the code for a particular tool will
    # be fetched on demand, as long as it is available somewhere in
    # the federation.
    hints:
      arv:ClusterTarget:
        cluster_id: $(inputs.shard.cluster)
        project_uuid: $(inputs.shard.project)

    out: [out]
    run: md5sum.cwl

  # Collect the results of the distributed step and join them into a
  # single output file. Arvados handles streaming inputs,
  # intermediate results, and outputs between clusters on demand.
  gather-results:
    in:
      inp: distributed-analysis/out
    out: [joined]
    run: cat.cwl