doc/user/cwl/federated/federated.cwl

   1 #
   2 # Demonstrate Arvados federation features.  This workflow performs a
   3 # parallel scatter over an arbitrary number of files and federated
   4 # clusters, then joins the results into a single file.
   5 #
   6 cwlVersion: v1.0  # CWL specification version this document targets
   7 class: Workflow  # multi-step document; the steps are listed under "steps"
   8 $namespaces:
   9   # Arvados extensions to CWL (arv:ClusterTarget below) need the 'arv' namespace
  10   arv: "http://arvados.org/cwl#"
  11
  12 requirements:
  13   InlineJavascriptRequirement: {}  # allow inline JavaScript expressions
  14   ScatterFeatureRequirement: {}  # allow "scatter" on workflow steps
  15   StepInputExpressionRequirement: {}  # allow "valueFrom" on step inputs
  16
  17   DockerRequirement:
  18     # Replace this with your own Docker image
  19     dockerPull: arvados/jobs
  20
  21   # Define a record type so we can conveniently associate the input
  22   # file, the cluster on which the file lives, and the project on that
  23   # cluster that will own the container requests and intermediate
  24   # outputs.
  25   SchemaDefRequirement:
  26     types:
  27       - name: FileOnCluster
  28         type: record
  29         fields:
  30           file: File  # the input file for one scatter job
  31           cluster: string  # used as cluster_id of arv:ClusterTarget
  32           project: string  # used as project_uuid of arv:ClusterTarget
  33
  34 inputs:
  35   # An array of FileOnCluster records (see SchemaDefRequirement above);
  36   # the distributed-analysis step creates one job per element.
  37   shards:
  38     type:
  39       type: array
  40       items: FileOnCluster  # record type declared in SchemaDefRequirement
  41
  42 outputs:
  43   # A single output file containing the results of all the distributed
  44   # analysis jobs joined together.
  45   joined:
  46     type: File
  47     outputSource: gather-results/joined  # "joined" output of the gather-results step
  48
  49 steps:
  50   distributed-analysis:
  51     in:
  52       # Take the "shards" array as input; we scatter over it below.
  53       shard: shards
  54
  55       # Use an expression to extract the "file" field of the current
  56       # record and assign it to the "inp" parameter of the tool.
  57       inp: {valueFrom: $(inputs.shard.file)}
  58
  59     # Scatter over "shard": this creates a parallel job for each
  60     # element of the "shards" array.  Expressions are evaluated
  61     # separately for each element.
  62     scatter: shard
  63
  64     # Specify the cluster target for this job.  Each scatter job runs
  65     # on the cluster given by its "cluster" field; the project given by
  66     # "project" owns its container requests and intermediate outputs.
  67     #
  68     # Arvados handles streaming data between clusters.  For example,
  69     # the Docker image containing the code for a particular tool will
  70     # be fetched on demand, as long as it is available somewhere in
  71     # the federation.
  72     hints:
  73       arv:ClusterTarget:
  74         cluster_id: $(inputs.shard.cluster)
  75         project_uuid: $(inputs.shard.project)
  76
  77     out: [out]
  78     run: md5sum.cwl  # defined in a separate file (not shown here)
  79
  80   # Collect the results of the distributed step and join them into a
  81   # single output file.  Arvados handles streaming inputs,
  82   # intermediate results, and outputs between clusters on demand.
  83   gather-results:
  84     in:
  85       inp: distributed-analysis/out  # outputs of all the scatter jobs
  86     out: [joined]
  87     run: cat.cwl  # defined in a separate file (not shown here)