}
}
- @@notification_tests.push lambda { |controller, current_user|
- Job.limit(1).where(created_by: current_user.uuid).each do
- return nil
- end
- return lambda { |view|
- view.render partial: 'notifications/jobs_notification'
- }
- }
+ #@@notification_tests.push lambda { |controller, current_user|
+ # Job.limit(1).where(created_by: current_user.uuid).each do
+ # return nil
+ # end
+ # return lambda { |view|
+ # view.render partial: 'notifications/jobs_notification'
+ # }
+ #}
@@notification_tests.push lambda { |controller, current_user|
Collection.limit(1).where(created_by: current_user.uuid).each do
<p><%= image_tag "dax.png", class: "dax" %>
Hi, I noticed you haven't run a pipeline yet.
<%= link_to "Click here to learn how to run an Arvados Crunch pipeline.",
- "#{Rails.configuration.arvados_docsite}/user/tutorials/tutorial-new-pipeline.html",
+ "#{Rails.configuration.arvados_docsite}/user/tutorials/tutorial-pipeline-workbench.html",
style: "font-weight: bold",
target: "_blank" %>
</p>
--- /dev/null
+Arvados Documentation
+
+0. Install dependencies
+
+ $ bundle install
+
+
+1. To build or update documentation:
+ $ rake generate
+
+
+2. To view documentation:
+ $ rake run
+[2014-03-10 09:03:41] INFO WEBrick 1.3.1
+[2014-03-10 09:03:41] INFO ruby 2.1.1 (2014-02-24) [x86_64-linux]
+[2014-03-10 09:03:41] INFO WEBrick::HTTPServer#start: pid=8926 port=8000
+
+ Then go to http://localhost:8000
+
+
+3. You can set 'baseurl' (the URL prefix for all internal links),
+'arvados_api_host' and 'arvados_workbench_host' without changing _config.yml:
+
+ $ rake generate baseurl=/example arvados_api_host=example.com
+
+
+4. To delete generated files:
+ $ rake realclean
require "rubygems"
require "colorize"
+task :generate do
+ vars = ['baseurl', 'arvados_api_host', 'arvados_workbench_host']
+ vars.each do |v|
+ if ENV[v]
+ website.config.h[v] = ENV[v]
+ end
+ end
+end
+
require "zenweb/tasks"
load "zenweb-textile.rb"
load "zenweb-liquid.rb"
# file:///tmp/arvados/doc/.site). To make docs show up inside
# workbench, use /doc here and add a symlink at
# apps/workbench/public/doc pointing to ../../../doc/.site
+# You can also set these on the command line:
+# $ rake generate baseurl=/example arvados_api_host=example.com
-baseurl: /doc
+baseurl:
+arvados_api_host: localhost
+arvados_workbench_host: localhost
exclude: ["Rakefile", "tmp", "vendor"]
-arvados_api_host: qr1hi.arvadosapi.com
navbar:
userguide:
- user/getting_started/community.html.textile.liquid
- Tutorials:
- user/tutorials/tutorial-keep.html.textile.liquid
- - user/tutorials/tutorial-job1.html.textile.liquid
+ - user/tutorials/intro-crunch.html.textile.liquid
+ - user/tutorials/tutorial-pipeline-workbench.html.textile.liquid
- user/tutorials/tutorial-firstscript.html.textile.liquid
- - user/tutorials/tutorial-job-debug.html.textile.liquid
- - user/tutorials/tutorial-parallel.html.textile.liquid
- user/tutorials/tutorial-new-pipeline.html.textile.liquid
- - user/tutorials/tutorial-trait-search.html.textile.liquid
- - user/tutorials/tutorial-gatk-variantfiltration.html.textile.liquid
- user/tutorials/running-external-program.html.textile.liquid
+ - Intermediate topics:
+ - user/topics/running-pipeline-command-line.html.textile.liquid
+ - user/topics/tutorial-job1.html.textile.liquid
+ - user/topics/tutorial-job-debug.html.textile.liquid
+ - user/topics/tutorial-parallel.html.textile.liquid
+ - user/topics/tutorial-trait-search.html.textile.liquid
+ - user/topics/tutorial-gatk-variantfiltration.html.textile.liquid
+ - user/topics/keep.html.textile.liquid
- Examples:
- user/examples/crunch-examples.html.textile.liquid
- Reference:
import arvados
-arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)
-this_task = arvados.current_task()
+# Automatically parallelize this job by running one task per file.
+arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, input_as_path=True)
-# Get the input collection for this task
-this_task_input = this_task['parameters']['input']
+# Get the input file for the task
+input_file = arvados.get_task_param_mount('input')
-# Create a CollectionReader to access the collection
-input_collection = arvados.CollectionReader(this_task_input)
+# Run the external 'md5sum' program on the input file
+stdoutdata, stderrdata = arvados.util.run_command(['md5sum', input_file])
-# Get the name of the first file in the collection
-input_file = list(input_collection.all_files())[0].name()
-
-# Extract the file to a temporary directory
-# Returns the directory that the file was written to
-input_dir = arvados.util.collection_extract(this_task_input,
- 'tmp',
- files=[input_file],
- decompress=False)
-
-# Run the external 'md5sum' program on the input file, with the current working
-# directory set to the location the input file was extracted to.
-stdoutdata, stderrdata = arvados.util.run_command(
- ['md5sum', input_file],
- cwd=input_dir)
-
-# Save the standard output (stdoutdata) "md5sum.txt" in the output collection
+# Save the standard output (stdoutdata) to "md5sum.txt" in the output collection
out = arvados.CollectionWriter()
out.set_current_file_name("md5sum.txt")
out.write(stdoutdata)
-
-this_task.set_output(out.finish())
+arvados.current_task().set_output(out.finish())
#!/usr/bin/env python
-# Import the hashlib module (part of the Python standard library) to compute md5.
-import hashlib
+import hashlib # Import the hashlib module to compute md5.
+import arvados # Import the Arvados sdk module
-# Import the Arvados sdk module
-import arvados
+# Automatically parallelize this job by running one task per file.
+# This means that if the input consists of many files, each file will
+# be processed in parallel on different nodes enabling the job to
+# be completed quicker.
+arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
+ input_as_path=True)
-# Get information about the task from the environment
-this_task = arvados.current_task()
-
-# Get the "input" field from "script_parameters" on the job creation object
-this_job_input = arvados.getjobparam('input')
-
-# Create the object access to the collection referred to in the input
-collection = arvados.CollectionReader(this_job_input)
-
-# Create an object to write a new collection as output
-out = arvados.CollectionWriter()
-
-# Set the name of output file within the collection
-out.set_current_file_name("md5sum.txt")
+# Create the message digest object that will compute the md5 hash
+digestor = hashlib.new('md5')
-# Get an iterator over the files listed in the collection
-all_files = collection.all_files()
-
-# Iterate over each file
-for input_file in all_files:
- # Create the object that will actually compute the md5 hash
- digestor = hashlib.new('md5')
+# Get the input file for the task
+input_file = arvados.get_task_param_mount('input')
+# Open the input file for reading
+with open(input_file) as f:
while True:
- # read a 1 megabyte block from the file
- buf = input_file.read(2**20)
-
- # break when there is no more data left
- if len(buf) == 0:
+ buf = f.read(2**20) # read a 1 megabyte block from the file
+ if len(buf) == 0: # break when there is no more data left
break
+ digestor.update(buf) # update the md5 hash object
- # update the md5 hash object
- digestor.update(buf)
-
- # Get the final hash code
- hexdigest = digestor.hexdigest()
+# Get object representing the current task
+this_task = arvados.current_task()
- # Get the file name from the StreamFileReader object
- file_name = input_file.name()
+# Write a new collection as output
+out = arvados.CollectionWriter()
- # The "stream name" is the subdirectory inside the collection in which
- # the file is located; '.' is the root of the collection.
- if input_file.stream_name() != '.':
- file_name = os.join(input_file.stream_name(), file_name)
+# Set output file within the collection
+out.set_current_file_name("md5sum.txt")
- # Write an output line with the md5 value and file name.
- out.write("%s %s\n" % (hexdigest, file_name))
+# Write an output line with the md5 value and input
+out.write("%s %s\n" % (digestor.hexdigest(), this_task['parameters']['input']))
-# Commit the output to keep. This returns a Keep id.
+# Commit the output to keep. This returns a Keep id.
output_id = out.finish()
# Set the output for this task to the Keep id
-this_task.set_output(output_id)
+this_task.set_output(output_id)
# Done!
--- /dev/null
+{% comment %}
+Renders "Previous" / "Next" links for the current page by walking, in order,
+the pages listed under site.navbar[page.navsection].
+  prev - the most recently visited page that is not the current page ("" until one is seen)
+  nx   - set to 1 when the current page is visited, so the following item becomes "Next"
+  n    - set to 1 once a block of links has been emitted
+{% endcomment %}
+{% assign n = 0 %}
+{% assign prev = "" %}
+{% assign nx = 0 %}
+{% for section in site.navbar[page.navsection] %}
+  {% for entry in section %}
+    {% for item in entry[1] %}
+      {% assign p = site.pages[item] %}
+      {% if nx == 1 %}
+        <hr>
+        {% if prev != "" %}
+          <a href="{{ site.baseurl }}{{ prev.url }}" class="pull-left">Previous: {{ prev.title }}</a>
+        {% endif %}
+        <a href="{{ site.baseurl }}{{ p.url }}" class="pull-right">Next: {{ p.title }}</a>
+        {% assign nx = 0 %}
+        {% assign n = 1 %}
+      {% endif %}
+      {% if p.url == page.url %}
+        {% assign nx = 1 %}
+      {% else %}
+        {% assign prev = p %}
+      {% endif %}
+    {% endfor %}
+  {% endfor %}
+{% endfor %}
+{% comment %}
+No links were emitted inside the loop (e.g. the current page is the last entry):
+show only the "Previous" link. Liquid's boolean operator is "and", not "&&".
+{% endcomment %}
+{% if n == 0 and prev != "" %}
+  <hr>
+  <a href="{{ site.baseurl }}{{ prev.url }}" class="pull-left">Previous: {{ prev.title }}</a>
+  {% assign n = 1 %}
+{% endif %}
\ No newline at end of file
<link href="{{ site.baseurl }}/css/bootstrap.css" rel="stylesheet">
<link href="{{ site.baseurl }}/css/nav-list.css" rel="stylesheet">
<link href="{{ site.baseurl }}/css/badges.css" rel="stylesheet">
+ <link href="{{ site.baseurl }}/css/code.css" rel="stylesheet">
<style>
html {
height:100%;
text-align: center;
margin-bottom: 1em;
}
- .userinput {
- color: #d14;
- }
:target {
padding-top: 61px;
margin-top: -61px;
}
</style>
- <link href="{{ site.baseurl }}/css/bootstrap-responsive.min.css" rel="stylesheet">
+
<!-- HTML5 shim, for IE6-8 support of HTML5 elements -->
<!--[if lt IE 9]>
<script src="../assets/js/html5shiv.js"></script>
<div class="row">
{% include 'navbar_left' %}
<div class="col-sm-9">
+ <h1>{{ page.title }}</h1>
{{ content }}
+ {% include 'webring' %}
</div>
</div>
title: Cheat Sheet
...
-h1. Cheat Sheet
+
h3. CLI setup
...
-h1. Authentication
+
Every API request (except the authentication API itself) includes an @access_token@ parameter.
...
-h1. API Reference
+
h2. Concepts
...
-h1. REST Methods
+
(using Group as an example)
...
-h1. api_client_authorizations
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. api_clients
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. authorized_keys
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. collections
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. groups
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. humans
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. job_tasks
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. jobs
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. keep_disks
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. links
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. logs
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. nodes
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. pipeline_instances
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. pipeline_templates
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. repositories
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. specimens
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. traits
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. users
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. virtual_machines
+
Required arguments are displayed in %{background:#ccffcc}green%.
...
-h1. Permission model
+
Each API transaction (read, write, create, etc.) is done on behalf of a person.
...
-h1. Resources
+
This page describes the common attributes of Arvados resources.
...
-h1. ApiClient
+
An **ApiClient** represents a client program that has issued a request to the API server.
...
-h1. ApiClientAuthorization
+
An **ApiClientAuthorization** represents the API authorization token that has been issued to each "ApiClient":ApiClient.html known to this Arvados instance.
...
-h1. AuthorizedKey
+
An **AuthorizedKey** represents...
...
-h1. Collection
+
This resource concerns metadata, usage accounting, and integrity checks for data stored on the cloud. Reading and writing the data _per se_ is achieved by the "Keep":/user/tutorials/tutorial-keep.html storage system.
...
-h1. Commit
+
A **Commit** represents...
...
-h1. CommitAncestor
+
A **CommitAncestor** represents...
...
-h1. Group
+
A **Group** represents...
...
-h1. Human
+
A **Human** represents...
...
-h1. Job
+
Applications submit compute jobs when:
* Provenance is important, i.e., it is worth recording how the output was produced; or
...
-h1. JobTask
+
A Job Task is a well defined independently-computable portion of a "Job":Job.html.
...
-h1. KeepDisk
+
A **KeepDisk** represents...
...
-h1. Link
+
**Links** describe relationships between Arvados objects, and from objects to primitives.
...
-h1. Log
+
**Log** objects record events that occur in an Arvados cluster. Both user-written pipelines and the Arvados system itself may generate Log events.
...
-h1. Node
+
A **Node** represents...
...
-h1. PipelineInstance
+
A **PipelineInstance** represents...
...
-h1. PipelineTemplate
+
A **PipelineTemplate** represents...
...
-h1. Repository
+
A **Repository** represents...
...
-h1. Specimen
+
A **Specimen** represents...
...
-h1. Trait
+
A **Trait** represents...
...
-h1. User
+
A **User** represents...
...
-h1. VirtualMachine
+
A **VirtualMachine** represents...
--- /dev/null
+/* Box styling for tables with class "code": monospace text on a light
+   grey background with a thin rounded border. */
+table.code {
+  font-family: Menlo,Monaco,Consolas,"Courier New",monospace;
+  display: block;
+  padding: 9.5px;
+  margin: 0 0 10px;
+  font-size: 13px;
+  line-height: 1.42857;
+  color: #333;
+  word-break: break-all;
+  word-wrap: break-word;
+  background-color: #f5f5f5;
+  border: 1px solid #ccc;
+  border-radius: 4px;
+}
+
+/* Keep whitespace and line breaks inside every cell exactly as written. */
+table.code tr td {
+  white-space: pre;
+}
+
+/* The second column uses the same highlight color as .userinput. */
+table.code tr td:nth-child(2) {
+  color: #d14;
+  padding-left: 0.5em;
+}
+
+/* Highlight color for text the reader is expected to type. */
+.userinput {
+  color: #d14;
+}
text-shadow: 0 -1px 0 rgba(0,0,0,.2);
background-color: rgb(66, 139, 202);
}
+
+.spaced-out li {
+ padding-bottom: 1em;
+}
\ No newline at end of file
"script":"file-select",
"script_parameters":{
"names":[
- "human_g1k_v37.fasta.gz", "human_g1k_v37.fasta.fai.gz", "human_g1k_v37.dict.gz"
+ "human_g1k_v37.fasta.gz",
+ "human_g1k_v37.fasta.fai.gz",
+ "human_g1k_v37.dict.gz"
],
"input":"d237a90bae3870b3b033aea1e99de4a9+10820+K@qr1hi"
},
- "script_version":"82a471c92036198aaf02ca0467ea48d49dbe822d"
+ "script_version":"e820bd1c6890f93ea1a84ffd5730bbf0e3d8e153"
},
"bwa-index":{
- "script_version":"82a471c92036198aaf02ca0467ea48d49dbe822d",
+ "script_version":"e820bd1c6890f93ea1a84ffd5730bbf0e3d8e153",
"script":"bwa-index",
"script_parameters":{
"input":{
"output_of":"extract-reference"
},
"bwa_tbz":{
- "optional":false
+ "value":"8b6e2c4916133e1d859c9e812861ce13+70",
+ "required":true
}
}
},
"bwa-aln":{
- "script_version":"82a471c92036198aaf02ca0467ea48d49dbe822d",
+ "script_version":"e820bd1c6890f93ea1a84ffd5730bbf0e3d8e153",
"script":"bwa-aln",
"script_parameters":{
"input":{
- "optional":"false"
+ "dataclass":"Collection",
+ "required":"true"
},
"reference_index":{
"output_of":"bwa-index"
},
"samtools_tgz":{
- "optional":false
+ "value":"c777e23cf13e5d5906abfdc08d84bfdb+74",
+ "required":true
},
"bwa_tbz":{
- "optional":false
+ "value":"8b6e2c4916133e1d859c9e812861ce13+70",
+ "required":true
}
},
"runtime_constraints":{
}
},
"picard-gatk2-prep":{
- "script_version":"82a471c92036198aaf02ca0467ea48d49dbe822d",
+ "script_version":"e820bd1c6890f93ea1a84ffd5730bbf0e3d8e153",
"script":"picard-gatk2-prep",
"script_parameters":{
"input":{
- "output_of":"bwa-aln"
+ "output_of":"bwa-aln"
},
"reference":{
"output_of":"extract-reference"
},
"picard_zip":{
- "optional":false
+ "value":"687f74675c6a0e925dec619cc2bec25f+77",
+ "required":true
}
},
"runtime_constraints":{
}
},
"GATK2-realign":{
- "script_version":"82a471c92036198aaf02ca0467ea48d49dbe822d",
+ "script_version":"e820bd1c6890f93ea1a84ffd5730bbf0e3d8e153",
"script":"GATK2-realign",
"script_parameters":{
"input":{
- "output_of":"picard-gatk2-prep"
+ "output_of":"picard-gatk2-prep"
},
"gatk_bundle":{
- "optional":false
+ "value":"d237a90bae3870b3b033aea1e99de4a9+10820+K@qr1hi",
+ "required":true
},
"picard_zip":{
- "optional":false
+ "value":"687f74675c6a0e925dec619cc2bec25f+77",
+ "required":true
},
"gatk_tbz":{
- "optional":false
+ "value":"7e0a277d6d2353678a11f56bab3b13f2+87",
+ "required":true
},
"regions":{
- "optional":true
+ "value":"13b53dbe1ec032dfc495fd974aa5dd4a+87/S02972011_Covered_sort_merged.bed"
},
"region_padding":{
- "optional":true
+ "value":10
}
},
"runtime_constraints":{
}
},
"GATK2-bqsr":{
- "script_version":"82a471c92036198aaf02ca0467ea48d49dbe822d",
+ "script_version":"e820bd1c6890f93ea1a84ffd5730bbf0e3d8e153",
"script":"GATK2-bqsr",
"script_parameters":{
"input":{
- "output_of":"GATK2-realign"
+ "output_of":"GATK2-realign"
},
"gatk_bundle":{
- "optional":false
+ "value":"d237a90bae3870b3b033aea1e99de4a9+10820+K@qr1hi",
+ "required":true
},
"picard_zip":{
- "optional":false
+ "value":"687f74675c6a0e925dec619cc2bec25f+77",
+ "required":true
},
"gatk_tbz":{
- "optional":false
+ "value":"7e0a277d6d2353678a11f56bab3b13f2+87",
+ "required":true
}
}
},
"GATK2-merge-call":{
- "script_version":"82a471c92036198aaf02ca0467ea48d49dbe822d",
+ "script_version":"e820bd1c6890f93ea1a84ffd5730bbf0e3d8e153",
"script":"GATK2-merge-call",
"script_parameters":{
"input":{
- "output_of":"GATK2-bqsr"
+ "output_of":"GATK2-bqsr"
},
"gatk_bundle":{
- "optional":false
+ "value":"d237a90bae3870b3b033aea1e99de4a9+10820+K@qr1hi",
+ "required":true
},
"picard_zip":{
- "optional":false
+ "value":"687f74675c6a0e925dec619cc2bec25f+77",
+ "required":true
},
"gatk_tbz":{
- "optional":false
+ "value":"7e0a277d6d2353678a11f56bab3b13f2+87",
+ "required":true
},
"regions":{
- "optional":true
+ "value":"13b53dbe1ec032dfc495fd974aa5dd4a+87/S02972011_Covered_sort_merged.bed"
},
"region_padding":{
- "optional":true
+ "value":10
},
"GATK2_UnifiedGenotyper_args":{
- "default":["-stand_call_conf","30.0","-stand_emit_conf","30.0","-dcov","200"]
+ "default":[
+ "-stand_call_conf",
+ "30.0",
+ "-stand_emit_conf",
+ "30.0",
+ "-dcov",
+ "200"
+ ]
}
}
}
</div>
<div class="col-sm-7" style="border-left: solid; border-width: 1px">
<p>
- <a href="{{ site.baseurl }}/user/">User Guide</a> — How to manage data and do analysis with Arvados.
+ <a href="{{ site.baseurl }}/user/index.html">User Guide</a> — How to manage data and do analysis with Arvados.
</p>
<p>
- <a href="{{ site.baseurl }}/sdk/">SDK Reference</a> — Details about the accessing Arvados from various programming languages.
+ <a href="{{ site.baseurl }}/sdk/index.html">SDK Reference</a> &mdash; Details about accessing Arvados from various programming languages.
</p>
<p>
- <a href="{{ site.baseurl }}/api/">API Reference</a> — Details about the the Arvados REST API.
+ <a href="{{ site.baseurl }}/api/index.html">API Reference</a> &mdash; Details about the Arvados REST API.
</p>
<p>
- <a href="{{ site.baseurl }}/admin/">Admin Guide</a> — How to administer an Arvados system.
+ <a href="{{ site.baseurl }}/admin/index.html">Admin Guide</a> — How to administer an Arvados system.
</p>
<p>
- <a href="{{ site.baseurl }}/install/">Install Guide</a> — How to install Arvados on a cloud platform.
+ <a href="{{ site.baseurl }}/install/index.html">Install Guide</a> — How to install Arvados on a cloud platform.
</p>
</div>
</div>
...
-h1. Install client libraries
+
h3. Python
...
-h1. Create standard objects
+
h3. "All users" group
...
-h1. Crunch setup
+
The dispatcher normally runs on the same host/VM as the API server.
---
layout: default
navsection: sdk
-title: "SDK Reference"
-
+title: "Arvados SDK Reference"
...
-h1. Arvados SDK Reference
-
This section documents how to access the Arvados API and Keep using various programming languages.
* "Python SDK":python/sdk-python.html
...
-h1. Crunch utility libraries
-
Several utility libraries are included with Arvados. They are intended to make it quicker and easier to write your own crunch scripts.
* "Python SDK extras":#pythonsdk
...
-h1. Python SDK
-
The Python SDK provides a generic set of wrappers so you can make API calls easily. It performs some validation before connecting to the API server: for example, it refuses to do an API call if a required parameter is missing.
The library also includes some conveniences for use in Crunch scripts; see "Crunch utility libraries":crunch-utility-libraries.html for details.
---
layout: default
navsection: userguide
-navmenu: Examples
title: "Crunch examples"
-
...
-h1. Crunch examples
-
Several crunch scripts are included with Arvados in the "/crunch_scripts directory":https://arvados.org/projects/arvados/repository/revisions/master/show/crunch_scripts. They are intended to provide examples and starting points for writing your own scripts.
h4. bwa-aln
---
layout: default
navsection: userguide
-navmenu: Getting Started
title: "Checking your environment"
-
...
-h1. Checking your environment
-
First you should "log into an Arvados VM instance":{{site.baseurl}}/user/getting_started/ssh-access.html#login if you have not already done so.
If @arv user current@ is able to access the API server, it will print out information about your account. Check that you are able to access the Arvados API server using the following command:
bc. ARVADOS_API_HOST and ARVADOS_API_TOKEN need to be defined as environment variables
Then follow the instructions for "getting an API token,":{{site.baseurl}}/user/reference/api-tokens.html and try @arv user current@ again.
-
-Once you are able to access the API server, you are ready proceed to the first tutorial: "Storing and retrieving data using Arvados Keep.":{{site.baseurl}}/user/tutorials/tutorial-keep.html
---
layout: default
navsection: userguide
-navmenu: Getting Started
title: Arvados Community and Getting Help
-
...
-h1. Arvados Community and Getting Help
-
h2. On the web
---
layout: default
navsection: userguide
-navmenu: Getting Started
title: Accessing an Arvados VM over ssh
-
...
-h1. Accessing an Arvados Virtual Machine over ssh
-
Arvados requires a public @ssh@ key in order to securely log in to an Arvados VM instance, or to access an Arvados @git@ repository.
This document is divided up into three sections.
h3. From the workbench dashboard
-If you have no @ssh@ keys registered, there should be a notification asking you to provide your @ssh@ public key. On the Workbench dashboard (in this guide, this is "https://workbench.{{ site.arvados_api_host }}/":https://workbench.{{ site.arvados_api_host }}/ ), look for the envelope icon <span class="glyphicon glyphicon-envelope"></span> <span class="badge badge-alert">1</span> in upper right corner (the number indicates there are new notifications). Click on this icon and a dropdown menu should appear with a message asking you to add your public key. Paste your public key into the text area provided and click on the check button to submit the key. You are now ready to "log into an Arvados VM":#login.
+If you have no @ssh@ keys registered, there should be a notification asking you to provide your @ssh@ public key. On the Workbench dashboard (in this guide, this is "https://{{ site.arvados_workbench_host }}/":https://{{ site.arvados_workbench_host }}/ ), look for the envelope icon <span class="glyphicon glyphicon-envelope"></span> <span class="badge badge-alert">1</span> in the upper right corner (the number indicates there are new notifications). Click on this icon and a dropdown menu should appear with a message asking you to add your public key. Paste your public key into the text area provided and click on the check button to submit the key. You are now ready to "log into an Arvados VM":#login.
h3. Alternate way to add ssh keys
<notextile>
<pre><code class="userinput">Host *.arvados
ProxyCommand ssh -a -x -p2222 turnout@switchyard.{{ site.arvados_api_host }} $SSH_PROXY_FLAGS %h
+ User <b>you</b>
ForwardAgent yes
ForwardX11 no
</code></pre>
This will recognize any host ending in ".arvados" and automatically apply the proxy, user and forwarding settings from the configuration file, allowing you to log in with a much simpler command:
-notextile. <pre><code>$ <span class="userinput">ssh <b>you@shell</b>.arvados</span></code></pre>
+notextile. <pre><code>$ <span class="userinput">ssh <b>shell</b>.arvados</span></code></pre>
h2(#windowsvm). Logging in using PuTTY (Windows)
---
layout: default
navsection: userguide
-navmenu: Getting Started
title: Accessing Arvados Workbench
-
...
-h1. Accessing Arvados Workbench
Access the Arvados beta test instance available using this link:
-"https://workbench.{{ site.arvados_api_host }}/":https://workbench.{{ site.arvados_api_host }}/
+"https://{{ site.arvados_workbench_host }}/":https://{{ site.arvados_workbench_host }}/
If you are accessing Arvados for the first time, you will be asked to log in using a Google account. Arvados uses only your name and email address from Google services for identification, and will never access any personal information. Once you are logged in, the Workbench page may indicate your account status is *New / inactive*. If this is the case, contact the administrator of the Arvados instance to activate your account.
Once your account is active, logging in to the Workbench will present you with a system status dashboard. This gives a summary of data, configuration, and activity in the Arvados instance.
-
-Next, we will "configure your account for ssh access to an Arvados virtual machine (VM).":ssh-access.html
layout: default
navsection: userguide
title: Welcome to Arvados!
-
...
-h1. Welcome to Arvados!
-
-This guide is intended to introduce new users to the Arvados system. It covers initial configuration required to use the system and then presents several tutorials on using Arvados to do data processing.
+This guide is intended to introduce new users to the Arvados system. It covers initial configuration required to access the system and then presents several tutorials on using Arvados to do data processing.
This user guide introduces how to use the major components of Arvados. These are:
# Programming in @python@
# Revision control using @git@
-The examples in this guide uses the public Arvados instance located at "https://workbench.{{ site.arvados_api_host }}/":https://workbench.{{ site.arvados_api_host }}/ . You must have an account in order to use this service. If you would like to request an account, please send an email to "arvados@curoverse.com":mailto:arvados@curoverse.com .
+We also recommend you read the "Arvados Platform Overview":https://arvados.org/projects/arvados/wiki#Platform-Overview for an introduction and background information about Arvados.
+
+The examples in this guide use the Arvados instance located at "https://{{ site.arvados_workbench_host }}/":https://{{ site.arvados_workbench_host }}/ . If you are using a different Arvados instance, replace @{{ site.arvados_workbench_host }}@ with your private instance in all of the examples in this guide.
-If you are using a different Arvados instance replace @{{ site.arvados_api_host }}@ with your private instance in all of the examples in this guide.
+The Arvados public beta instance is located at "https://workbench.qr1hi.arvadosapi.com/":https://workbench.qr1hi.arvadosapi.com/ . You must have an account in order to use this service. If you would like to request an account, please send an email to "arvados@curoverse.com":mailto:arvados@curoverse.com .
h2. Typographic conventions
<notextile>
<ul>
-<li>Code blocks which are set aside from the text indicate user input to the system. Commands that should be entered into a Unix shell are indicated by the directory where you should enter the command ('~' indicates your home directory) followed by '$', followed by the highlighted <span class="userinput">command to enter</span> (do not enter the '$'), and possibly followed by example command output in black. For example, the following block indicates that you should type "ls foo" while in your home directory and the expected output will be "foo".
-
+<li>Code blocks which are set aside from the text indicate user input to the system. Commands that should be entered into a Unix shell are indicated by the directory where you should enter the command ('~' indicates your home directory) followed by '$', followed by the highlighted <span class="userinput">command to enter</span> (do not enter the '$'), and possibly followed by example command output in black. For example, the following block indicates that you should type "ls foo.*" while in your home directory and the expected output will be "foo.input" and "foo.output".
<pre><code>~$ <span class="userinput">ls foo.*</span>
foo.input foo.output
-</code></pre></li>
+</code></pre>
+</li>
<li>Code blocks inline with text emphasize specific <code>programs</code>, <code>files</code>, or <code>options</code> that are being discussed.</li>
<li>Bold text emphasizes <b>specific items</b> to look for when discussing Arvados Workbench pages.</li>
</ul>
</notextile>
-Now begin by "accessing the Arvados workbench.":getting_started/workbench.html
---
layout: default
navsection: userguide
-navmenu: Reference
title: "Getting an API token"
-
...
-h1. Reference: Getting an API token
-
The Arvados API token is a secret key that enables the @arv@ command line client to access Arvados with the proper permissions.
-Access the Arvados workbench using this link: "https://workbench.{{ site.arvados_api_host }}/":https://workbench.{{ site.arvados_api_host }}/
+Access the Arvados workbench using this link: "https://{{ site.arvados_workbench_host }}/":https://{{ site.arvados_workbench_host }}/
(Replace @{{ site.arvados_workbench_host }}@ with the hostname of your local Arvados instance if necessary.)
---
layout: default
navsection: userguide
-navmenu: Reference
title: "Command line interface"
...
-h1. Reference: Command Line Interface
-
*First, you should be "logged into an Arvados VM instance":{{site.baseurl}}/user/getting_started/ssh-access.html#login, and have a "working environment.":{{site.baseurl}}/user/getting_started/check-environment.html*
h3. Usage
--- /dev/null
+---
+layout: default
+navsection: userguide
+title: "How Keep works"
+...
+
+In Keep, information is stored in *data blocks*. Data blocks are normally between 1 byte and 64 megabytes in size. If a file exceeds the maximum size of a single data block, the file will be split across multiple data blocks until the entire file can be stored. These data blocks may be stored and replicated across multiple disks, servers, or clusters. Each data block has its own identifier for the contents of that specific data block.
+
+In order to reassemble the file, Keep stores a *collection* data block which lists in sequence the data blocks that make up the original file. A collection data block may store the information for multiple files, including a directory structure.
+
+In this example we will use @c1bad4b39ca5a924e481008009d94e32+210@ which we added to Keep in "the first Keep tutorial":{{ site.baseurl }}/user/tutorials/tutorial-keep.html. First let us examine the contents of this collection using @arv keep get@:
+
+<notextile>
+<pre><code>~$ <span class="userinput">arv keep get c1bad4b39ca5a924e481008009d94e32+210</span>
+. 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2
+</code></pre>
+</notextile>
+
+The command @arv keep get@ fetches the contents of the locator @c1bad4b39ca5a924e481008009d94e32+210@. This is a locator for a collection data block, so it fetches the contents of the collection. In this example, this collection consists of a single file @var-GS000016015-ASM.tsv.bz2@ which is 227212247 bytes long, and is stored using four sequential data blocks, <code>204e43b8a1185621ca55a94839582e6f+67108864</code>, <code>b9677abbac956bd3e86b1deb28dfac03+67108864</code>, <code>fc15aff2a762b13f521baf042140acec+67108864</code>, <code>323d2a3ce20370c4ca1d3462a344f8fd+25885655</code>.
+
+Let's use @arv keep get@ to download the first datablock:
+
+notextile. <pre><code>~$ <span class="userinput">cd /scratch/<b>you</b></span>
+/scratch/<b>you</b>$ <span class="userinput">arv keep get 204e43b8a1185621ca55a94839582e6f+67108864 > block1</span></code></pre>
+
+{% include 'notebox_begin' %}
+
+When you run this command, you may get this API warning:
+
+notextile. <pre><code>WARNING:root:API lookup failed for collection 204e43b8a1185621ca55a94839582e6f+67108864 (<class 'apiclient.errors.HttpError'>: <HttpError 404 when requesting https://qr1hi.arvadosapi.com/arvados/v1/collections/204e43b8a1185621ca55a94839582e6f%2B67108864?alt=json returned "Not Found">)</code></pre>
+
+This happens because @arv keep get@ tries to find a collection with this identifier. When that fails, it emits this warning, then looks for a datablock instead, which succeeds.
+
+{% include 'notebox_end' %}
+
+Let's look at the size and compute the md5 hash of @block1@:
+
+<notextile>
+<pre><code>/scratch/<b>you</b>$ <span class="userinput">ls -l block1</span>
+-rw-r--r-- 1 you group 67108864 Dec 9 20:14 block1
+/scratch/<b>you</b>$ <span class="userinput">md5sum block1</span>
+204e43b8a1185621ca55a94839582e6f block1
+</code></pre>
+</notextile>
+
+Notice that the block identifier <code>204e43b8a1185621ca55a94839582e6f+67108864</code> consists of:
+* the md5 hash @204e43b8a1185621ca55a94839582e6f@ which matches the md5 hash of @block1@
+* a size hint @67108864@ which matches the size of @block1@
--- /dev/null
+---
+layout: default
+navsection: userguide
+title: "Running a pipeline on the command line"
+...
+
+In "Writing a pipeline":{{ site.baseurl }}/user/tutorials/tutorial-firstscript.html, we learned how to create a pipeline template on the command-line. Let's create one that doesn't require any user input to start:
+
+<notextile>
+<pre><code>~$ <span class="userinput">cat >the_pipeline <<EOF
+{
+ "name":"Filter md5 hash values",
+ "components":{
+ "do_hash":{
+ "script":"hash.py",
+ "script_parameters":{
+ "input": "887cd41e9c613463eab2f0d885c6dd96+83"
+ },
+ "script_version":"<b>you</b>:master"
+ },
+ "filter":{
+ "script":"0-filter.py",
+ "script_parameters":{
+ "input":{
+ "output_of":"do_hash"
+ }
+ },
+ "script_version":"<b>you</b>:master"
+ }
+ }
+}
+EOF</span>
+~$ <span class="userinput">arv pipeline_template create --pipeline-template "$(cat the_pipeline)"</span></code></pre>
+</notextile>
+
+You can run this pipeline from the command line using @arv pipeline run@, filling in the UUID that you received from @arv pipeline_template create@:
+
+<notextile>
+<pre><code>~$ <span class="userinput">arv pipeline run --template qr1hi-p5p6p-xxxxxxxxxxxxxxx</span>
+2013-12-16 14:08:40 +0000 -- pipeline_instance qr1hi-d1hrv-vxzkp38nlde9yyr
+do_hash qr1hi-8i9sb-hoyc2u964ecv1s6 queued 2013-12-16T14:08:40Z
+filter - -
+
+2013-12-16 14:08:51 +0000 -- pipeline_instance qr1hi-d1hrv-vxzkp38nlde9yyr
+do_hash qr1hi-8i9sb-hoyc2u964ecv1s6 8e1b6acdd3f2f1da722538127c5c6202+56
+filter qr1hi-8i9sb-w5k40fztqgg9i2x queued 2013-12-16T14:08:50Z
+
+2013-12-16 14:09:01 +0000 -- pipeline_instance qr1hi-d1hrv-vxzkp38nlde9yyr
+do_hash qr1hi-8i9sb-hoyc2u964ecv1s6 8e1b6acdd3f2f1da722538127c5c6202+56
+filter qr1hi-8i9sb-w5k40fztqgg9i2x 735ac35adf430126cf836547731f3af6+56
+</code></pre>
+</notextile>
+
+This instantiates your pipeline and displays a live feed of its status. The new pipeline instance will also show up on the Workbench %(rarr)→% Compute %(rarr)→% Pipeline instances page.
+
+Arvados adds each pipeline component to the job queue as its dependencies are satisfied (or immediately if it has no dependencies) and finishes when all components are completed or failed and there is no more work left to do.
+
+The Keep locators of the outputs of the @"do_hash"@ and @"filter"@ components are available from the output log shown above. The output is also available on the Workbench by navigating to %(rarr)→% Compute %(rarr)→% Pipeline instances %(rarr)→% pipeline uuid under the *id* column %(rarr)→% components.
+
+<notextile>
+<pre><code>~$ <span class="userinput">arv keep get 8e1b6acdd3f2f1da722538127c5c6202+56/md5sum.txt</span>
+0f1d6bcf55c34bed7f92a805d2d89bbf alice.txt
+504938460ef369cd275e4ef58994cffe bob.txt
+8f3b36aff310e06f3c5b9e95678ff77a carol.txt
+~$ <span class="userinput">arv keep get 735ac35adf430126cf836547731f3af6+56/0-filter.txt</span>
+0f1d6bcf55c34bed7f92a805d2d89bbf alice.txt
+</code></pre>
+</notextile>
+
+Indeed, the filter has picked out just the "alice" file as having a hash that starts with 0.
+
+h3. Running a pipeline with different parameters
+
+Notice that the pipeline template explicitly specifies the Keep locator for the input:
+
+<notextile>
+<pre><code>...
+ "do_hash":{
+ "script_parameters":{
+ "input": "887cd41e9c613463eab2f0d885c6dd96+83"
+ },
+ }
+...
+</code></pre>
+</notextile>
+
+You can specify values for pipeline component script_parameters like this:
+
+<notextile>
+<pre><code>~$ <span class="userinput">arv pipeline run --template qr1hi-p5p6p-xxxxxxxxxxxxxxx do_hash::input=c1bad4b39ca5a924e481008009d94e32+210</span>
+2013-12-17 20:31:24 +0000 -- pipeline_instance qr1hi-d1hrv-tlkq20687akys8e
+do_hash qr1hi-8i9sb-rffhuay4jryl2n2 queued 2013-12-17T20:31:24Z
+filter - -
+
+2013-12-17 20:31:34 +0000 -- pipeline_instance qr1hi-d1hrv-tlkq20687akys8e
+do_hash qr1hi-8i9sb-rffhuay4jryl2n2 {:done=>1, :running=>1, :failed=>0, :todo=>0}
+filter - -
+
+2013-12-17 20:31:55 +0000 -- pipeline_instance qr1hi-d1hrv-tlkq20687akys8e
+do_hash qr1hi-8i9sb-rffhuay4jryl2n2 880b55fb4470b148a447ff38cacdd952+54
+filter qr1hi-8i9sb-j347g1sqovdh0op queued 2013-12-17T20:31:55Z
+
+2013-12-17 20:32:05 +0000 -- pipeline_instance qr1hi-d1hrv-tlkq20687akys8e
+do_hash qr1hi-8i9sb-rffhuay4jryl2n2 880b55fb4470b148a447ff38cacdd952+54
+filter qr1hi-8i9sb-j347g1sqovdh0op 490cd451c8108824b8a17e3723e1f236+19
+</code></pre>
+</notextile>
+
+Now check the output:
+
+<notextile>
+<pre><code>~$ <span class="userinput">arv keep get 880b55fb4470b148a447ff38cacdd952+54/md5sum.txt</span>
+44b8ae3fde7a8a88d2f7ebd237625b4f var-GS000016015-ASM.tsv.bz2
+~$ <span class="userinput">arv keep get 490cd451c8108824b8a17e3723e1f236+19/0-filter.txt</span>
+~$
+</code></pre>
+</notextile>
+
+Since none of the files in the collection has a hash that starts with 0, the output of the filter component is empty.
---
layout: default
navsection: userguide
-navmenu: Tutorials
title: "Using GATK with Arvados"
-
...
-h1. Using GATK with Arvados
-
This tutorial demonstrates how to use the Genome Analysis Toolkit (GATK) with Arvados. In this example we will install GATK and then create a VariantFiltration job to assign pass/fail scores to variants in a VCF file.
*This tutorial assumes that you are "logged into an Arvados VM instance":{{site.baseurl}}/user/getting_started/ssh-access.html#login, and have a "working environment.":{{site.baseurl}}/user/getting_started/check-environment.html*
h2. Installing GATK
-Download the GATK binary tarball[1] -- e.g., @GenomeAnalysisTK-2.6-4.tar.bz2@ -- and "copy it to your Arvados VM":tutorial-keep.html.
+Download the GATK binary tarball[1] -- e.g., @GenomeAnalysisTK-2.6-4.tar.bz2@ -- and "copy it to your Arvados VM":{{site.baseurl}}/user/tutorials/tutorial-keep.html.
<notextile>
<pre><code>~$ <span class="userinput">arv keep put GenomeAnalysisTK-2.6-4.tar.bz2</span>
---
layout: default
navsection: userguide
-navmenu: Tutorials
title: "Debugging a Crunch script"
-
...
-h1. Debugging a Crunch script
-
To test changes to a script by running a job, the change must be pushed into @git@, the job queued asynchronously, and the actual execution may be run on any compute server. As a result, debugging a script can be difficult and time consuming. This tutorial demonstrates using @arv-crunch-job@ to run your job in your local VM. This avoids the job queue and allows you to execute the script from your uncommitted git tree.
*This tutorial assumes that you are "logged into an Arvados VM instance":{{site.baseurl}}/user/getting_started/ssh-access.html#login, and have a "working environment.":{{site.baseurl}}/user/getting_started/check-environment.html*
-This tutorial uses _you_ to denote your username. Replace _you_ with your user name in all the following examples.
+This tutorial uses *@you@* to denote your username. Replace *@you@* with your user name in all the following examples.
h2. Create a new script
<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">cat >~/the_job <<EOF
{
"script":"hello-world.py",
- "script_version":"/home/you/you",
+ "script_version":"/home/<b>you</b>/<b>you</b>",
"script_parameters":{}
}
EOF</span>
2013-12-12_21:36:42 qr1hi-8i9sb-okzukfzkpbrnhst 29827 node localhost - 1 slots
2013-12-12_21:36:42 qr1hi-8i9sb-okzukfzkpbrnhst 29827 start
2013-12-12_21:36:42 qr1hi-8i9sb-okzukfzkpbrnhst 29827 script hello-world.py
-2013-12-12_21:36:42 qr1hi-8i9sb-okzukfzkpbrnhst 29827 script_version /home/you/you
+2013-12-12_21:36:42 qr1hi-8i9sb-okzukfzkpbrnhst 29827 script_version /home/<b>you</b>/<b>you</b>
2013-12-12_21:36:42 qr1hi-8i9sb-okzukfzkpbrnhst 29827 script_parameters {}
2013-12-12_21:36:42 qr1hi-8i9sb-okzukfzkpbrnhst 29827 runtime_constraints {"max_tasks_per_node":0}
2013-12-12_21:36:42 qr1hi-8i9sb-okzukfzkpbrnhst 29827 start level 0
~/<b>you</b>/crunch_scripts$ <span class="userinput">cat >~/the_job <<EOF
{
"script":"hello-world-fixed.py",
- "script_version":"/home/you/you",
+ "script_version":"/home/<b>you</b>/<b>you</b>",
"script_parameters":{}
}
EOF</span>
2013-12-12_21:56:59 qr1hi-8i9sb-79260ykfew5trzl 31578 node localhost - 1 slots
2013-12-12_21:57:00 qr1hi-8i9sb-79260ykfew5trzl 31578 start
2013-12-12_21:57:00 qr1hi-8i9sb-79260ykfew5trzl 31578 script hello-world-fixed.py
-2013-12-12_21:57:00 qr1hi-8i9sb-79260ykfew5trzl 31578 script_version /home/you/you
+2013-12-12_21:57:00 qr1hi-8i9sb-79260ykfew5trzl 31578 script_version /home/<b>you</b>/<b>you</b>
2013-12-12_21:57:00 qr1hi-8i9sb-79260ykfew5trzl 31578 script_parameters {}
2013-12-12_21:57:00 qr1hi-8i9sb-79260ykfew5trzl 31578 runtime_constraints {"max_tasks_per_node":0}
2013-12-12_21:57:00 qr1hi-8i9sb-79260ykfew5trzl 31578 start level 0
notextile. <pre><code>~$ <span class="userinput">export KEEP_LOCAL_STORE=/tmp</span></code></pre>
-Next, "parallel tasks.":tutorial-parallel.html
---
layout: default
navsection: userguide
-navmenu: Tutorials
-title: "Running a Crunch job"
-
+title: "Running a Crunch job on the command line"
...
-h1. Running a crunch job
-
-This tutorial introduces the concepts and use of the Crunch job system using the @arv@ command line tool and Arvados Workbench.
+This tutorial introduces how to run individual Crunch jobs using the @arv@ command line tool.
*This tutorial assumes that you are "logged into an Arvados VM instance":{{site.baseurl}}/user/getting_started/ssh-access.html#login, and have a "working environment.":{{site.baseurl}}/user/getting_started/check-environment.html*
-In "retrieving data using Keep,":tutorial-keep.html we downloaded a file from Keep and did some computation with it (specifically, computing the md5 hash of the complete file). While a straightforward way to accomplish a computational task, there are several obvious drawbacks to this approach:
-* Large files require significant time to download.
-* Very large files may exceed the scratch space of the local disk.
-* We are only able to use the local CPU to process the file.
+You will create a job to run the "hash" crunch script. The "hash" script computes the md5 hash of each file in a collection.
-The Arvados "Crunch" framework is designed to support processing very large data batches (gigabytes to terabytes) efficiently, and provides the following benefits:
-* Increase concurrency by running tasks asynchronously, using many CPUs and network interfaces at once (especially beneficial for CPU-bound and I/O-bound tasks respectively).
-* Track inputs, outputs, and settings so you can verify that the inputs, settings, and sequence of programs you used to arrive at an output is really what you think it was.
-* Ensure that your programs and workflows are repeatable with different versions of your code, OS updates, etc.
-* Interrupt and resume long-running jobs consisting of many short tasks.
-* Maintain timing statistics automatically, so they're there when you want them.
+h2. Jobs
-For your first job, you will run the "hash" crunch script using the Arvados system. The "hash" script computes the md5 hash of each file in a collection.
+Crunch pipelines consist of one or more jobs. A "job" is a single run of a specific version of a crunch script with a specific input. You can also run jobs individually.
-Crunch jobs are described using JSON objects. For example:
+A request to run a crunch job is described using a JSON object. For example:
<notextile>
<pre><code>~$ <span class="userinput">cat >the_job <<EOF
* @<<EOF@ tells the shell to direct the following lines into the standard input for @cat@ up until it sees the line @EOF@
* @>the_job@ redirects standard output to a file called @the_job@
* @"script"@ specifies the name of the script to run. The script is searched for in the "crunch_scripts/" subdirectory of the @git@ checkout specified by @"script_version"@.
-* @"script_version"@ specifies the version of the script that you wish to run. This can be in the form of an explicit @git@ revision hash, or in the form "repository:branch" (in which case it will take the HEAD of the specified branch). Arvados logs the script version that was used in the run, enabling you to go back and re-run any past job with the guarantee that the exact same code will be used as was used in the previous run. You can access a list of available @git@ repositories on the Arvados workbench under _Compute %(rarr)→% Code repositories_.
+* @"script_version"@ specifies the version of the script that you wish to run. This can be in the form of an explicit @git@ revision hash, or in the form "repository:branch" (in which case it will take the HEAD of the specified branch). Arvados logs the script version that was used in the run, enabling you to go back and re-run any past job with the guarantee that the exact same code will be used as was used in the previous run. You can access a list of available @git@ repositories on the Arvados workbench under "Compute %(rarr)→% Code repositories":http://{{site.arvados_workbench_host}}/repositories .
* @"script_parameters"@ are provided to the script. In this case, the input is the locator for the collection that we inspected in the previous section.
Use @arv job create@ to actually submit the job. It should print out a JSON object which describes the newly created job:
h2. Monitor job progress
-Go to the Workbench dashboard. Your job should be at the top of the "Recent jobs" table. This table refreshes automatically. When the job has completed successfully, it will show <span class="label label-success">finished</span> in the *Status* column.
+Go to the "Workbench dashboard":http://{{site.arvados_workbench_host}}. Your job should be at the top of the "Recent jobs" table. This table refreshes automatically. When the job has completed successfully, it will show <span class="label label-success">finished</span> in the *Status* column.
On the command line, you can access log messages while the job runs using @arv job log_tail_follow@:
h2. Inspect the job output
-On the workbench dashboard, look for the *Output* column of the *Recent jobs* table. Click on the link under *Output* for your job to go to the files page with the job output. The files page lists all the files that were output by the job. Click on the link under the *files* column to view a file, or click on the download icon <span class="glyphicon glyphicon-download-alt"></span> to download the output file.
+On the "Workbench dashboard":http://{{site.arvados_workbench_host}}, look for the *Output* column of the *Recent jobs* table. Click on the link under *Output* for your job to go to the files page with the job output. The files page lists all the files that were output by the job. Click on the link under the *files* column to view a file, or click on the download icon <span class="glyphicon glyphicon-download-alt"></span> to download the output file.
On the command line, you can use @arv job get@ to access a JSON object describing the output:
"cancelled_by_user_uuid":null,
"started_at":"2013-12-16T20:44:36Z",
"finished_at":"2013-12-16T20:44:53Z",
- "output":"880b55fb4470b148a447ff38cacdd952+54",
+ "output":"dd755dbc8d49a67f4fe7dc843e4f10a6+54",
"success":true,
"running":false,
"is_locked_by_uuid":"qr1hi-tpzed-9zdpkpni2yddge6",
</code></pre>
</notextile>
-* @"output"@ is the unique identifier for this specific job's output. This is a Keep collection. Because the output of Arvados jobs should be deterministic, the known expected output is <code>880b55fb4470b148a447ff38cacdd952+54</code>.
+* @"output"@ is the unique identifier for this specific job's output. This is a Keep collection. Because the output of Arvados jobs should be deterministic, the known expected output is <code>dd755dbc8d49a67f4fe7dc843e4f10a6+54</code>.
Now you can list the files in the collection:
<notextile>
-<pre><code>~$ <span class="userinput">arv keep ls 880b55fb4470b148a447ff38cacdd952+54</span>
+<pre><code>~$ <span class="userinput">arv keep ls dd755dbc8d49a67f4fe7dc843e4f10a6+54</span>
md5sum.txt
</code></pre>
</notextile>
This collection consists of the @md5sum.txt@ file. Use @arv keep get@ to show the contents of the @md5sum.txt@ file:
<notextile>
-<pre><code>~$ <span class="userinput">arv keep get 880b55fb4470b148a447ff38cacdd952+54/md5sum.txt</span>
-44b8ae3fde7a8a88d2f7ebd237625b4f var-GS000016015-ASM.tsv.bz2
+<pre><code>~$ <span class="userinput">arv keep get dd755dbc8d49a67f4fe7dc843e4f10a6+54/md5sum.txt</span>
+44b8ae3fde7a8a88d2f7ebd237625b4f ./var-GS000016015-ASM.tsv.bz2
</code></pre>
</notextile>
2013-12-16_20:44:39 qr1hi-8i9sb-1pm1t02dezhupss 7575 status: 1 done, 1 running, 0 todo
2013-12-16_20:44:52 qr1hi-8i9sb-1pm1t02dezhupss 7575 1 child 7716 on compute13.1 exit 0 signal 0 success=true
2013-12-16_20:44:52 qr1hi-8i9sb-1pm1t02dezhupss 7575 1 success in 13 seconds
-2013-12-16_20:44:52 qr1hi-8i9sb-1pm1t02dezhupss 7575 1 output 880b55fb4470b148a447ff38cacdd952+54
+2013-12-16_20:44:52 qr1hi-8i9sb-1pm1t02dezhupss 7575 1 output dd755dbc8d49a67f4fe7dc843e4f10a6+54
2013-12-16_20:44:52 qr1hi-8i9sb-1pm1t02dezhupss 7575 wait for last 0 children to finish
2013-12-16_20:44:52 qr1hi-8i9sb-1pm1t02dezhupss 7575 status: 2 done, 0 running, 0 todo
2013-12-16_20:44:52 qr1hi-8i9sb-1pm1t02dezhupss 7575 release job allocation
2013-12-16_20:44:52 qr1hi-8i9sb-1pm1t02dezhupss 7575 Freeze not implemented
2013-12-16_20:44:52 qr1hi-8i9sb-1pm1t02dezhupss 7575 collate
-2013-12-16_20:44:53 qr1hi-8i9sb-1pm1t02dezhupss 7575 output 880b55fb4470b148a447ff38cacdd952+54
+2013-12-16_20:44:53 qr1hi-8i9sb-1pm1t02dezhupss 7575 output dd755dbc8d49a67f4fe7dc843e4f10a6+54+K@qr1hi
2013-12-16_20:44:53 qr1hi-8i9sb-1pm1t02dezhupss 7575 finish
</code></pre>
</notextile>
-
-This concludes the first tutorial. In the next tutorial, we will "write a script to compute the hash.":tutorial-firstscript.html
---
layout: default
navsection: userguide
-navmenu: Tutorials
title: "Parallel Crunch tasks"
-
...
-h1. Parallel Crunch tasks
-
-In the tutorial "writing a crunch script,":tutorial-firstscript.html our script used a "for" loop to compute the md5 hashes for each file in sequence. This approach, while simple, is not able to take advantage of the compute cluster with multiple nodes and cores to speed up computation by running tasks in parallel. This tutorial will demonstrate how to create parallel Crunch tasks.
+In the previous tutorials, we used @arvados.job_setup.one_task_per_input_file()@ to automatically parallelize our jobs by creating a separate task per file. For some types of jobs, you may need to split the work up differently, for example creating tasks to process different segments of a single large file. This tutorial will demonstrate how to create Crunch tasks directly.
Start by entering the @crunch_scripts@ directory of your git repository:
Add the following code to compute the md5 hash of each file in a
-<pre><code class="userinput">{% include 'parallel_hash_script_py' %}</code></pre>
+<notextile> {% code 'parallel_hash_script_py' as python %} </notextile>
Make the file executable:
</code></pre>
</notextile>
-You should now be able to run your new script using Crunch, with "script" referring to our new "parallel-hash.py" script. We will use a different input from our previous examples. We will use @887cd41e9c613463eab2f0d885c6dd96+83@ which consists of three files, "alice.txt", "bob.txt" and "carol.txt" (the example collection used previously in "fetching data from Arvados using Keep":tutorial-keep.html).
+You should now be able to run your new script using Crunch, with "script" referring to our new "parallel-hash.py" script. We will use a different input from our previous examples. We will use @887cd41e9c613463eab2f0d885c6dd96+83@ which consists of three files, "alice.txt", "bob.txt" and "carol.txt" (the example collection used previously in "fetching data from Arvados using Keep":{{site.baseurl}}/user/tutorials/tutorial-keep.html#dir).
<notextile>
<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">cat >~/the_job <<EOF
{
"script": "parallel-hash.py",
- "script_version": "you:master",
+ "script_version": "<b>you</b>:master",
"script_parameters":
{
"input": "887cd41e9c613463eab2f0d885c6dd96+83"
Because the job ran in parallel, each instance of parallel-hash creates a separate @md5sum.txt@ as output. Arvados automatically collates these files into a single collection, which is the output of the job:
<notextile>
-<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">arv keep get e2ccd204bca37c77c0ba59fc470cd0f7+162</span>
+<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">arv keep ls e2ccd204bca37c77c0ba59fc470cd0f7+162</span>
md5sum.txt
md5sum.txt
md5sum.txt
</code></pre>
</notextile>
-h2. The one job per file pattern
-
-This example demonstrates how to schedule a new task per file. Because this is a common pattern, the Crunch Python API contains a convenience function to "queue a task for each input file":{{site.baseurl}}/sdk/python/crunch-utility-libraries.html#one_task_per_input which reduces the amount of boilerplate code required to handle parallel jobs.
-
-Next, "Constructing a Crunch pipeline":tutorial-new-pipeline.html
---
layout: default
navsection: userguide
-navmenu: Tutorials
title: "Querying the Metadata Database"
-
...
-h1. Querying the Metadata Database
-
This tutorial introduces the Arvados Metadata Database. The Metadata Database stores information about files in Keep. This example will use the Python SDK to find public WGS (Whole Genome Sequencing) data for people who have reported a certain medical condition.
*This tutorial assumes that you are "logged into an Arvados VM instance":{{site.baseurl}}/user/getting_started/ssh-access.html#login, and have a "working environment.":{{site.baseurl}}/user/getting_started/check-environment.html*
--- /dev/null
+---
+layout: default
+navsection: userguide
+title: Introduction to Crunch
+...
+
+In "getting data from Keep,":tutorial-keep.html#arv-get we downloaded a file from Keep and did some computation with it (specifically, computing the md5 hash of the complete file). While a straightforward way to accomplish a computational task, there are several obvious drawbacks to this approach:
+* Large files require significant time to download.
+* Very large files may exceed the scratch space of the local disk.
+* We are only able to use the local CPU to process the file.
+
+The Arvados "Crunch" framework is designed to support processing very large data batches (gigabytes to terabytes) efficiently, and provides the following benefits:
+* Increase concurrency by running tasks asynchronously, using many CPUs and network interfaces at once (especially beneficial for CPU-bound and I/O-bound tasks respectively).
+* Track inputs, outputs, and settings so you can verify that the inputs, settings, and sequence of programs you used to arrive at an output is really what you think it was.
+* Ensure that your programs and workflows are repeatable with different versions of your code, OS updates, etc.
+* Interrupt and resume long-running jobs consisting of many short tasks.
+* Maintain timing statistics automatically, so they're there when you want them.
---
layout: default
navsection: userguide
-navmenu: Tutorials
-title: "Running external programs"
-
+title: "Using Crunch to run external programs"
...
-h1. Running external programs
-
This tutorial demonstrates how to use Crunch to run an external program by writing a wrapper using the Python SDK.
*This tutorial assumes that you are "logged into an Arvados VM instance":{{site.baseurl}}/user/getting_started/ssh-access.html#login, and have a "working environment.":{{site.baseurl}}/user/getting_started/check-environment.html*
Add the following code to use the @md5sum@ program to compute the hash of each file in a collection:
-<pre><code class="userinput">{% include 'run_md5sum_py' %}</code></pre>
+<notextile> {% code 'run_md5sum_py' as python %} </notextile>
Make the file executable:
You should now be able to run your new script using Crunch, with "script" referring to our new "run-md5sum.py" script.
<notextile>
-<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">cat >~/the_job <<EOF
-{
- "script": "run-md5sum.py",
- "script_version": "you:master",
- "script_parameters":
- {
- "input": "c1bad4b39ca5a924e481008009d94e32+210"
- }
-}
-EOF</span>
-~/<b>you</b>/crunch_scripts$ <span class="userinput">arv job create --job "$(cat the_job)"</span>
+<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">cat >~/the_pipeline <<EOF
{
- ...
- "uuid":"qr1hi-xxxxx-xxxxxxxxxxxxxxx"
- ...
-}
-~/<b>you</b>/crunch_scripts$ <span class="userinput">arv job get --uuid qr1hi-xxxxx-xxxxxxxxxxxxxxx</span>
-{
- ...
- "output":"4d164b1658c261b9afc6b479130016a3+54",
- ...
+ "name":"Run external md5sum program",
+ "components":{
+ "do_hash":{
+ "script":"run-md5sum.py",
+ "script_parameters":{
+ "input":{
+ "required": true,
+ "dataclass": "Collection"
+ }
+ },
+ "script_version":"<b>you</b>:master"
+ }
+ }
}
+EOF
+</span>~/<b>you</b>/crunch_scripts$ <span class="userinput">arv pipeline_template create --pipeline-template "$(cat ~/the_pipeline)"</span>
</code></pre>
</notextile>
+
+Your new pipeline template will appear on the "Workbench %(rarr)→% Compute %(rarr)→% Pipeline templates":http://{{ site.arvados_workbench_host }}/pipeline_templates page. You can run the "pipeline using workbench":tutorial-pipeline-workbench.html
layout: default
navsection: userguide
navmenu: Tutorials
-title: "Writing a Crunch script"
-
+title: "Writing a pipeline"
...
-h1. Writing a Crunch script
-
In this tutorial, we will write the "hash" script demonstrated in the first tutorial.
*This tutorial assumes that you are "logged into an Arvados VM instance":{{site.baseurl}}/user/getting_started/ssh-access.html#login, and have a "working environment.":{{site.baseurl}}/user/getting_started/check-environment.html*
~$ <span class="userinput">git config --global user.email <b>you</b>@example.com</span></code></pre>
</notextile>
-On the Arvados Workbench, navigate to _Compute %(rarr)→% Code repositories._ You should see two repositories, one named "arvados" (under the *name* column) and a second with your user name. Next to *name* is the column *push_url*. Copy the *push_url* cell associated with your repository. This should look like <notextile><code>git@git.{{ site.arvados_api_host }}:<b>you</b>.git</code></notextile>.
+On the Arvados Workbench, navigate to "Compute %(rarr)→% Code repositories":http://{{site.arvados_workbench_host}}/repositories . You should see a repository with your user name listed in the *name* column. Next to *name* is the column *push_url*. Copy the *push_url* value associated with your repository. This should look like <notextile><code>git@git.{{ site.arvados_api_host }}:<b>you</b>.git</code></notextile>.
Next, on the Arvados virtual machine, clone your git repository:
Add the following code to compute the md5 hash of each file in a collection:
-<pre><code class="userinput">{% include 'tutorial_hash_script_py' %}</code></pre>
+<notextile> {% code 'tutorial_hash_script_py' as python %} </notextile>
Make the file executable:
notextile. <pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">chmod +x hash.py</span></code></pre>
{% include 'notebox_begin' %}
-The below steps describe how to execute the script after committing changes to git. To test the script locally, please see the "debugging a crunch script":tutorial-job-debug.html page.
+The steps below describe how to execute the script after committing changes to git. To run a script locally for testing, please see "debugging a crunch script":{{site.baseurl}}/user/topics/tutorial-job-debug.html .
{% include 'notebox_end' %}
* [new branch] master -> master</code></pre>
</notextile>
-You should now be able to run your script using Crunch, similar to how we did it in the "first tutorial.":tutorial-job1.html The field @"script_version"@ should be @you:master@ to tell Crunch to run the script at the head of the "master" git branch, which you just uploaded.
+h2. Create a pipeline template
+
+Next, create a file that contains the pipeline definition:
<notextile>
-<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">cat >~/the_job <<EOF
-{
- "script": "hash.py",
- "script_version": "you:master",
- "script_parameters":
- {
- "input": "c1bad4b39ca5a924e481008009d94e32+210"
- }
-}
-EOF</span>
-~/<b>you</b>/crunch_scripts$ <span class="userinput">arv job create --job "$(cat ~/the_job)"</span>
-{
- ...
- "uuid":"qr1hi-xxxxx-xxxxxxxxxxxxxxx"
- ...
-}
-~/<b>you</b>/crunch_scripts$ <span class="userinput">arv job get --uuid qr1hi-xxxxx-xxxxxxxxxxxxxxx</span>
+<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">cd ~</span>
+~$ <span class="userinput">cat >the_pipeline <<EOF
{
- ...
- "output":"880b55fb4470b148a447ff38cacdd952+54",
- ...
+ "name":"My first pipeline",
+ "components":{
+ "do_hash":{
+ "script":"hash.py",
+ "script_parameters":{
+ "input":{
+ "required": true,
+ "dataclass": "Collection"
+ }
+ },
+ "script_version":"<b>you</b>:master"
+ }
+ }
}
-~/<b>you</b>/crunch_scripts$ <span class="userinput">arv keep get 880b55fb4470b148a447ff38cacdd952+54/md5sum.txt</span>
-44b8ae3fde7a8a88d2f7ebd237625b4f var-GS000016015-ASM.tsv.bz2
+EOF
+</span></code></pre>
+</notextile>
+
+* @cat@ is a standard Unix utility that simply copies standard input to standard output
+* @<<EOF@ tells the shell to direct the following lines into the standard input for @cat@ up until it sees the line @EOF@
+* @>the_pipeline@ redirects standard output to a file called @the_pipeline@
+* @"name"@ is a human-readable name for the pipeline
+* @"components"@ is a set of scripts that make up the pipeline
+* The component is listed with a human-readable name (@"do_hash"@ in this example)
+* @"script"@ specifies the name of the script to run. The script is searched for in the "crunch_scripts/" subdirectory of the @git@ checkout specified by @"script_version"@.
+* @"script_version"@ specifies the version of the script that you wish to run. This can be in the form of an explicit @git@ revision hash, or in the form "repository:branch" (in which case it will take the HEAD of the specified branch). Arvados logs the script version that was used in the run, enabling you to go back and re-run any past job with the guarantee that the exact same code will be used as was used in the previous run. You can access a list of available @git@ repositories on the Arvados workbench under "Compute %(rarr)→% Code repositories":http://{{site.arvados_workbench_host}}/repositories .
+* @"script_parameters"@ describes the parameters for the script. In this example, there is one parameter called @input@ which is @required@ and is a @Collection@.
+
+Now, use @arv pipeline_template create@ to tell Arvados about your pipeline template:
+
+<notextile>
+<pre><code>~$ <span class="userinput">arv pipeline_template create --pipeline-template "$(cat the_pipeline)"</span>
</code></pre>
</notextile>
-Next, "debugging a crunch script.":tutorial-job-debug.html
+Your new pipeline template will appear on the "Workbench %(rarr)→% Compute %(rarr)→% Pipeline templates":http://{{ site.arvados_workbench_host }}/pipeline_templates page. You can run the "pipeline using workbench":tutorial-pipeline-workbench.html
---
layout: default
navsection: userguide
-navmenu: Tutorials
-title: "Storing and Retrieving data using Arvados Keep"
-
+title: "Storing and Retrieving data using Keep"
...
-h1. Storing and Retrieving data using Arvados Keep
-
This tutorial introduces you to the Arvados file storage system.
The output value @c1bad4b39ca5a924e481008009d94e32+210@ from @arv keep put@ is the Keep locator. This enables you to access the file you just uploaded, and is explained in the next section.
-h2. Putting a directory
+h2(#dir). Putting a directory
You can also use @arv keep put@ to add an entire directory:
</code></pre>
</notextile>
+The locator @887cd41e9c613463eab2f0d885c6dd96+83@ represents a collection with multiple files.
+
h1. Getting Data from Keep
-In Keep, information is stored in *data blocks*. Data blocks are normally between 1 byte and 64 megabytes in size. If a file exceeds the maximum size of a single data block, the file will be split across multiple data blocks until the entire file can be stored. These data blocks may be stored and replicated across multiple disks, servers, or clusters. Each data block has its own identifier for the contents of that specific data block.
+h2. Using Workbench
-In order to reassemble the file, Keep stores a *collection* data block which lists in sequence the data blocks that make up the original file. A collection data block may store the information for multiple files, including a directory structure.
+You may access collections through the "Collections section of Arvados Workbench":https://{{ site.arvados_workbench_host }}/collections located at "https://{{ site.arvados_workbench_host }}/collections":https://{{ site.arvados_workbench_host }}/collections . You can also access individual collections and individual files within a collection. Some examples:
-In this example we will use @c1bad4b39ca5a924e481008009d94e32+210@ which we added to keep in the previous section. First let us examine the contents of this collection using @arv keep get@:
+* "https://{{ site.arvados_workbench_host }}/collections/c1bad4b39ca5a924e481008009d94e32+210":https://{{ site.arvados_workbench_host }}/collections/c1bad4b39ca5a924e481008009d94e32+210
+* "https://{{ site.arvados_workbench_host }}/collections/887cd41e9c613463eab2f0d885c6dd96+83/alice.txt":https://{{ site.arvados_workbench_host }}/collections/887cd41e9c613463eab2f0d885c6dd96+83/alice.txt
+
+h2(#arv-get). Using arv-get
+
+You can view the contents of a collection using @arv keep ls@:
<notextile>
-<pre><code>/scratch/<b>you</b>$ <span class="userinput">arv keep get c1bad4b39ca5a924e481008009d94e32+210</span>
-. 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2
+<pre><code>/scratch/<b>you</b>$ <span class="userinput">arv keep ls c1bad4b39ca5a924e481008009d94e32+210</span>
+var-GS000016015-ASM.tsv.bz2
+</code></pre>
+
+<pre><code>/scratch/<b>you</b>$ <span class="userinput">arv keep ls 887cd41e9c613463eab2f0d885c6dd96+83</span>
+alice.txt
+bob.txt
+carol.txt
</code></pre>
</notextile>
-The command @arv keep get@ fetches the contents of the locator @c1bad4b39ca5a924e481008009d94e32+210@. This is a locator for a collection data block, so it fetches the contents of the collection. In this example, this collection consists of a single file @var-GS000016015-ASM.tsv.bz2@ which is 227212247 bytes long, and is stored using four sequential data blocks, <code>204e43b8a1185621ca55a94839582e6f+67108864</code>, <code>b9677abbac956bd3e86b1deb28dfac03+67108864</code>, <code>fc15aff2a762b13f521baf042140acec+67108864</code>, <code>323d2a3ce20370c4ca1d3462a344f8fd+25885655</code>.
+Use @-s@ to print file sizes rounded up to the nearest kilobyte:
-Notice that the block identifer <code>204e43b8a1185621ca55a94839582e6f+67108864</code> consists of:
-* the md5 hash @204e43b8a1185621ca55a94839582e6f@ which matches the md5 hash of @block1@
-* a size hint @67108864@ which matches the size of @block1@
+<notextile>
+<pre><code>/scratch/<b>you</b>$ <span class="userinput">arv keep ls -s c1bad4b39ca5a924e481008009d94e32+210</span>
+221887 var-GS000016015-ASM.tsv.bz2
+</code></pre>
+</notextile>
-Next, let's use @arv keep get@ to download and reassemble @var-GS000016015-ASM.tsv.bz2@ using the following command:
+Use @arv keep get@ to download the contents of a collection and place it in the directory specified in the second argument (in this example, @.@ for the current directory):
<notextile>
-<pre><code>/scratch/<b>you</b>$ <span class="userinput">arv keep get c1bad4b39ca5a924e481008009d94e32+210/var-GS000016015-ASM.tsv.bz2 .</span>
+<pre><code>/scratch/<b>you</b>$ <span class="userinput">arv keep get c1bad4b39ca5a924e481008009d94e32+210/ .</span>
</code></pre>
+</notextile>
-This downloads the file <code>var-GS000016015-ASM.tsv.bz2</code> described by collection <code>c1bad4b39ca5a924e481008009d94e32+210</code> from Keep and places it into the local directory. Now that we have the file, we can compute the md5 hash of the complete file:
+You can also download individual files:
+
+<notextile>
+<pre><code>/scratch/<b>you</b>$ <span class="userinput">arv keep get 887cd41e9c613463eab2f0d885c6dd96+83/alice.txt .</span>
+</code></pre>
+</notextile>
+
+With a local copy of the file, we can do some computation, for example computing the md5 hash of the complete file:
<notextile>
<pre><code>/scratch/<b>you</b>$ <span class="userinput">md5sum var-GS000016015-ASM.tsv.bz2</span>
</code></pre>
</notextile>
-h2. Accessing Collections
+h2. Using arv-mount
-There are a couple of other ways to access a collection. You may view the contents of a collection using @arv keep ls@:
+Use @arv-mount@ to take advantage of the "File System in User Space / FUSE":http://fuse.sourceforge.net/ feature of the Linux kernel to mount a Keep collection as if it were a regular directory tree.
<notextile>
-<pre><code>/scratch/<b>you</b>$ <span class="userinput">arv keep ls c1bad4b39ca5a924e481008009d94e32+210</span>
-./var-GS000016015-ASM.tsv.bz2
-/scratch/<b>you</b>$ <span class="userinput">arv keep ls -s c1bad4b39ca5a924e481008009d94e32+210</span>
- 221886 ./var-GS000016015-ASM.tsv.bz2
+<pre><code>/scratch/<b>you</b>$ <span class="userinput">mkdir mnt</span>
+/scratch/<b>you</b>$ <span class="userinput">arv-mount --collection c1bad4b39ca5a924e481008009d94e32+210 mnt &</span>
+/scratch/<b>you</b>$ <span class="userinput">cd mnt</span>
+/scratch/<b>you</b>/mnt$ <span class="userinput">ls</span>
+var-GS000016015-ASM.tsv.bz2
+/scratch/<b>you</b>/mnt$ <span class="userinput">md5sum var-GS000016015-ASM.tsv.bz2</span>
+44b8ae3fde7a8a88d2f7ebd237625b4f var-GS000016015-ASM.tsv.bz2
+/scratch/<b>you</b>/mnt$ <span class="userinput">cd ..</span>
+/scratch/<b>you</b>$ <span class="userinput">fusermount -u mnt</span>
</code></pre>
</notextile>
-* @-s@ prints file sizes in kilobytes
+You can also mount the entire Keep namespace in "magic directory" mode:
-You may also access through the Arvados Workbench using a URI similar to this, where the last part of the path is the Keep locator:
+<notextile>
+<pre><code>/scratch/<b>you</b>$ <span class="userinput">mkdir mnt</span>
+/scratch/<b>you</b>$ <span class="userinput">arv-mount mnt &</span>
+/scratch/<b>you</b>$ <span class="userinput">cd mnt/c1bad4b39ca5a924e481008009d94e32+210</span>
+/scratch/<b>you</b>/mnt/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">ls</span>
+var-GS000016015-ASM.tsv.bz2
+/scratch/<b>you</b>/mnt/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">md5sum var-GS000016015-ASM.tsv.bz2</span>
+44b8ae3fde7a8a88d2f7ebd237625b4f var-GS000016015-ASM.tsv.bz2
+/scratch/<b>you</b>/mnt/c1bad4b39ca5a924e481008009d94e32+210$ <span class="userinput">cd ../..</span>
+/scratch/<b>you</b>$ <span class="userinput">fusermount -u mnt</span>
+</code></pre>
+</notextile>
-"https://workbench.{{ site.arvados_api_host }}/collections/c1bad4b39ca5a924e481008009d94e32+210":https://workbench.{{ site.arvados_api_host }}/collections/c1bad4b39ca5a924e481008009d94e32+210
+Using @arv-mount@ has several significant benefits:
-You are now ready to proceed to the next tutorial, "running a crunch job.":tutorial-job1.html
+* You can browse, open and read Keep entries as if they are regular files.
+* It is easy for existing tools to access files in Keep.
+* Data is downloaded on demand; it is not necessary to download an entire file or collection to start processing.
---
layout: default
navsection: userguide
-navmenu: Tutorials
-title: "Constructing a Crunch pipeline"
-
+title: "Writing a multi-step pipeline"
...
-h1. Constructing a Crunch pipeline
-
A pipeline in Arvados is a collection of crunch scripts, in which the output from one script may be used as the input to another script.
*This tutorial assumes that you are "logged into an Arvados VM instance":{{site.baseurl}}/user/getting_started/ssh-access.html#login, and have a "working environment.":{{site.baseurl}}/user/getting_started/check-environment.html*
+This tutorial uses *@you@* to denote your username. Replace *@you@* with your user name in all the following examples.
+
h2. Create a new script
-Our second script will filter the output of @parallel_hash.py@ and only include hashes that start with 0. Create a new script in @crunch_scripts/@ called @0-filter.py@:
+Our second script will filter the output of @hash.py@ and only include hashes that start with 0. Create a new script in <notextile><code>~/<b>you</b>/crunch_scripts/</code></notextile> called @0-filter.py@:
-<pre><code class="userinput">{% include '0_filter_py' %}</code></pre>
+<notextile> {% code '0_filter_py' as python %} </notextile>
Now add it to git:
<notextile>
-<pre><code>$ <span class="userinput">git add 0-filter.py</span>
-$ <span class="userinput">git commit -m"zero filter"</span>
-$ <span class="userinput">git push origin master</span>
+<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">chmod +x 0-filter.py</span>
+~/<b>you</b>/crunch_scripts$ <span class="userinput">git add 0-filter.py</span>
+~/<b>you</b>/crunch_scripts$ <span class="userinput">git commit -m"zero filter"</span>
+~/<b>you</b>/crunch_scripts$ <span class="userinput">git push origin master</span>
</code></pre>
</notextile>
Next, create a file that contains the pipeline definition:
<notextile>
-<pre><code>$ <span class="userinput">cat >the_pipeline <<EOF
+<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">cat >~/the_pipeline <<EOF
{
- "name":"my_first_pipeline",
+ "name":"Filter md5 hash values",
"components":{
"do_hash":{
- "script":"parallel-hash.py",
+ "script":"hash.py",
"script_parameters":{
- "input": "887cd41e9c613463eab2f0d885c6dd96+83"
+ "input":{
+ "required": true,
+ "dataclass": "Collection"
+ }
},
- "script_version":"you:master"
+ "script_version":"<b>you</b>:master"
},
"filter":{
"script":"0-filter.py",
"output_of":"do_hash"
}
},
- "script_version":"you:master"
+ "script_version":"<b>you</b>:master"
}
}
}
EOF
-</code></pre>
+</span></code></pre>
</notextile>
-* @"name"@ is a human-readable name for the pipeline
-* @"components"@ is a set of scripts that make up the pipeline
-* Each component is listed with a human-readable name (@"do_hash"@ and @"filter"@ in this example)
-* Each item in @"components"@ is a single Arvados job, and uses the same format that we saw previously with @arv job create@
-* @"output_of"@ indicates that the @"input"@ of @"filter"@ is the @"output"@ of the @"do_hash"@ component. This is a _dependency_. Arvados uses the dependencies between jobs to automatically determine the correct order to run the jobs.
+* @"output_of"@ indicates that the @input@ of the @filter@ component is connected to the @output@ of @do_hash@. This is a _dependency_. Arvados uses the dependencies between jobs to automatically determine the correct order to run the jobs.
Now, use @arv pipeline_template create@ to tell Arvados about your pipeline template:
<notextile>
-<pre><code>$ <span class="userinput">arv pipeline_template create --pipeline-template "$(cat the_pipeline)"</span>
-qr1hi-p5p6p-xxxxxxxxxxxxxxx
+<pre><code>~/<b>you</b>/crunch_scripts$ <span class="userinput">arv pipeline_template create --pipeline-template "$(cat ~/the_pipeline)"</span>
</code></pre>
</notextile>
-Your new pipeline template will appear on the Workbench %(rarr)→% Compute %(rarr)→% Pipeline templates page.
-
-h3. Running a pipeline
-
-Run the pipeline using @arv pipeline run@, using the UUID that you received from @arv pipeline create@:
-
-<notextile>
-<pre><code>$ <span class="userinput">arv pipeline run --template qr1hi-p5p6p-xxxxxxxxxxxxxxx</span>
-2013-12-16 14:08:40 +0000 -- pipeline_instance qr1hi-d1hrv-vxzkp38nlde9yyr
-do_hash qr1hi-8i9sb-hoyc2u964ecv1s6 queued 2013-12-16T14:08:40Z
-filter - -
-2013-12-16 14:08:51 +0000 -- pipeline_instance qr1hi-d1hrv-vxzkp38nlde9yyr
-do_hash qr1hi-8i9sb-hoyc2u964ecv1s6 e2ccd204bca37c77c0ba59fc470cd0f7+162
-filter qr1hi-8i9sb-w5k40fztqgg9i2x queued 2013-12-16T14:08:50Z
-2013-12-16 14:09:01 +0000 -- pipeline_instance qr1hi-d1hrv-vxzkp38nlde9yyr
-do_hash qr1hi-8i9sb-hoyc2u964ecv1s6 e2ccd204bca37c77c0ba59fc470cd0f7+162
-filter qr1hi-8i9sb-w5k40fztqgg9i2x 735ac35adf430126cf836547731f3af6+56
-</code></pre>
-</notextile>
-
-This instantiates your pipeline and displays a live feed of its status. The new pipeline instance will also show up on the Workbench %(rarr)→% Compute %(rarr)→% Pipeline instances page.
-
-Arvados adds each pipeline component to the job queue as its dependencies are satisfied (or immediately if it has no dependencies) and finishes when all components are completed or failed and there is no more work left to do.
-
-The Keep locators of the output of each of @"do_hash"@ and @"filter"@ component are available from the output log shown above. The output is also available on the Workbench by navigating to %(rarr)→% Compute %(rarr)→% Pipeline instances %(rarr)→% pipeline uuid under the *id* column %(rarr)→% components.
-
-<notextile>
-<pre><code>$ <span class="userinput">arv keep get e2ccd204bca37c77c0ba59fc470cd0f7+162/md5sum.txt</span>
-0f1d6bcf55c34bed7f92a805d2d89bbf alice.txt
-504938460ef369cd275e4ef58994cffe bob.txt
-8f3b36aff310e06f3c5b9e95678ff77a carol.txt
-$ <span class="userinput">arv keep get 735ac35adf430126cf836547731f3af6+56</span>
-0f1d6bcf55c34bed7f92a805d2d89bbf alice.txt
-</code></pre>
-</notextile>
-
-Indeed, the filter has picked out just the "alice" file as having a hash that starts with 0.
-
-h3. Running a pipeline with different parameters
-
-Notice that the pipeline definition explicitly specifies the Keep locator for the input:
-
-<notextile>
-<pre><code>...
- "do_hash":{
- "script_parameters":{
- "input": "887cd41e9c613463eab2f0d885c6dd96+83"
- },
- }
-...
-</code></pre>
-</notextile>
-
-What if we want to run the pipeline on a different input block? One option is to define a new pipeline template, but would potentially result in clutter with many pipeline templates defined for one-off jobs. Instead, you can override values in the input of the component like this:
-
-<notextile>
-<pre><code>$ <span class="userinput">arv pipeline run --template qr1hi-d1hrv-vxzkp38nlde9yyr do_hash::input=33a9f3842b01ea3fdf27cc582f5ea2af+242</span>
-2013-12-17 20:31:24 +0000 -- pipeline_instance qr1hi-d1hrv-tlkq20687akys8e
-do_hash qr1hi-8i9sb-rffhuay4jryl2n2 queued 2013-12-17T20:31:24Z
-filter - -
-2013-12-17 20:31:34 +0000 -- pipeline_instance qr1hi-d1hrv-tlkq20687akys8e
-do_hash qr1hi-8i9sb-rffhuay4jryl2n2 {:done=>1, :running=>1, :failed=>0, :todo=>0}
-filter - -
-2013-12-17 20:31:44 +0000 -- pipeline_instance qr1hi-d1hrv-tlkq20687akys8e
-do_hash qr1hi-8i9sb-rffhuay4jryl2n2 {:done=>1, :running=>1, :failed=>0, :todo=>0}
-filter - -
-2013-12-17 20:31:55 +0000 -- pipeline_instance qr1hi-d1hrv-tlkq20687akys8e
-do_hash qr1hi-8i9sb-rffhuay4jryl2n2 880b55fb4470b148a447ff38cacdd952+54
-filter qr1hi-8i9sb-j347g1sqovdh0op queued 2013-12-17T20:31:55Z
-2013-12-17 20:32:05 +0000 -- pipeline_instance qr1hi-d1hrv-tlkq20687akys8e
-do_hash qr1hi-8i9sb-rffhuay4jryl2n2 880b55fb4470b148a447ff38cacdd952+54
-filter qr1hi-8i9sb-j347g1sqovdh0op fb728f0ffe152058fa64b9aeed344cb5+54
-</code></pre>
-</notextile>
-
-Now check the output:
-
-<notextile>
-<pre><code>$ <span class="userinput">arv keep ls -s fb728f0ffe152058fa64b9aeed344cb5+54</span>
-0 0-filter.txt
-</code></pre>
-</notextile>
+Your new pipeline template will appear on the "Workbench %(rarr)→% Compute %(rarr)→% Pipeline templates":http://{{ site.arvados_workbench_host }}/pipeline_templates page.
-Here the filter script output is empty, so none of the files in the collection have hash code that start with 0.
--- /dev/null
+---
+layout: default
+navsection: userguide
+title: "Running a pipeline using Workbench"
+...
+
+notextile. <div class="spaced-out">
+
+# Go to "Collections":http://{{ site.arvados_workbench_host }}/collections .
+# On the collections page, go to the search box <span class="glyphicon glyphicon-search"></span> and search for "tutorial".
+# This should yield a collection with the contents "var-GS000016015-ASM.tsv.bz2"
+# Click on the check box to the left of "var-GS000016015-ASM.tsv.bz2". This puts the collection in your persistent selection list. Click on the paperclip <span class="glyphicon glyphicon-paperclip"></span> in the upper right to get a dropdown menu listing your current selections.
+# Go to "Pipeline templates":http://{{ site.arvados_workbench_host }}/pipeline_templates .
+# Look for a pipeline named "Tutorial pipeline".
+# Click on the play button <span class="glyphicon glyphicon-play"></span> to the left of "Tutorial pipeline". This will take you to a new page to configure the pipeline.
+# Under *parameter* look for "input". Set the value of "input" by clicking on "none" to get an editing popup. At the top of the selection list in the editing popup will be the collection that you selected in step 4.
+# You can now click on "Run pipeline" in the upper right to start the pipeline.
+# This will reload the page with the pipeline queued to run.
+# The page refreshes automatically every 15 seconds. You should see the pipeline running, and then finish successfully.
+# Once it is finished, click on the link under the *output* column. This will take you to the collection page for the output of this pipeline.
+# Click on "md5sum.txt" to see the actual file that is the output of this pipeline.
+# On the collection page, click on the "Provenance graph" tab to see a graphical representation of the data elements and pipelines that were involved in generating this file.
+
+notextile. </div>
+
require 'zenweb'
+require 'liquid'
module ZenwebLiquid
VERSION = '0.0.1'
##
# Render a page's liquid and return the intermediate result
def liquid template, content, page, binding = TOPLEVEL_BINDING
- require 'liquid'
Liquid::Template.file_system = Liquid::LocalFileSystem.new(File.join(File.dirname(Rake.application().rakefile), "_includes"))
unless defined? @liquid_template
@liquid_template = Liquid::Template.parse(template)
@liquid_template.render(vars)
end
end
+
+ class LiquidCode < Liquid::Include
+ Syntax = /(#{Liquid::QuotedFragment}+)(\s+(?:as)\s+(#{Liquid::QuotedFragment}+))?/o
+
+ def initialize(tag_name, markup, tokens)
+ Liquid::Tag.instance_method(:initialize).bind(self).call(tag_name, markup, tokens)
+
+ if markup =~ Syntax
+ @template_name = $1
+ @language = $3
+ @attributes = {}
+ else
+ raise SyntaxError.new("Error in tag 'code' - Valid syntax: include '[code_file]' as '[language]'")
+ end
+ end
+
+ def render(context)
+ require 'coderay'
+
+ partial = load_cached_partial(context)
+ html = ''
+
+ context.stack do
+ html = CodeRay.scan(partial.root.nodelist.join, @language).div
+ end
+
+ html
+ end
+
+ Liquid::Template.register_tag('code', LiquidCode)
+ end
end
if p.is_a? Hash and p[:output_of] == cname.to_s
debuglog "parameter #{c2name}::#{pname} == #{c[:job][:output]}"
c2[:script_parameters][pname] = c[:job][:output]
+ moretodo = true
end
end
end
elsif c[:job][:running] ||
(!c[:job][:started_at] && !c[:job][:cancelled_at])
- moretodo ||= true
+ moretodo = true
elsif c[:job][:cancelled_at]
debuglog "component #{cname} job #{c[:job][:uuid]} cancelled."
end
gem 'google-api-client', '~> 0.6.3'
gem 'trollop'
-gem 'arvados-cli', '>= 0.1.20140310170846'
+gem 'arvados-cli', '>= 0.1.20140311162926'
addressable (2.3.5)
andand (1.3.3)
arel (3.0.2)
- arvados-cli (0.1.20140310170846)
+ arvados-cli (0.1.20140311162926)
activesupport (~> 3.2, >= 3.2.13)
andand (~> 1.3, >= 1.3.3)
curb (~> 0.8)
DEPENDENCIES
acts_as_api
andand
- arvados-cli (>= 0.1.20140310170846)
+ arvados-cli (>= 0.1.20140311162926)
coffee-rails (~> 3.2.0)
google-api-client (~> 0.6.3)
jquery-rails
APP_SECRET = rand(2**512).to_s(36) # CHANGE ME!
# Update your custom Omniauth provider URL here
-CUSTOM_PROVIDER_URL = 'http://auth.clinicalfuture.com'
+CUSTOM_PROVIDER_URL = 'http://auth.curoverse.com'
Rails.application.config.middleware.use OmniAuth::Builder do
provider :josh_id, APP_ID, APP_SECRET, CUSTOM_PROVIDER_URL