X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/da51b9328abab2df757ed13eadc7c3557315094b..b54a5ea817d3d2087eaa07dcf98ec8a82af56d06:/doc/sdk/python/crunch-utility-libraries.html.textile.liquid diff --git a/doc/sdk/python/crunch-utility-libraries.html.textile.liquid b/doc/sdk/python/crunch-utility-libraries.html.textile.liquid new file mode 100644 index 0000000000..a897e9487d --- /dev/null +++ b/doc/sdk/python/crunch-utility-libraries.html.textile.liquid @@ -0,0 +1,225 @@ +--- +layout: default +navsection: sdk +navmenu: Python +title: "Crunch utility libraries" + +... + +h1. Crunch utility libraries + +Several utility libraries are included with Arvados. They are intended to make it quicker and easier to write your own crunch scripts. + +* "Python SDK extras":#pythonsdk +* "Toolkit wrappers":#toolkit_wrappers + +h2(#pythonsdk). Python SDK extras + +The Python SDK adds some convenience features that are particularly useful in crunch scripts, in addition to the standard set of API calls. + +In a crunch job, the environment variables @ARVADOS_API_HOST@ and @ARVADOS_API_TOKEN@ will be set up so the job has the privileges of the user who submitted the job. + +
+import arvados + +my_user = arvados.api().users().current().execute() +my_uuid = my_user['uuid'] ++ +h3. Get the current job and task parameters + +@arvados.current_job()@ and @arvados.current_task()@ are convenient ways to retrieve the current Job and Task, using the @JOB_UUID@ and @TASK_UUID@ environment variables provided to each crunch task process. + +
+this_job = arvados.current_job() +this_task = arvados.current_task() +this_job_input = this_job['script_parameters']['input'] +this_task_input = this_task['parameters']['input'] ++ +h3(#one_task_per_input). Queue a task for each input file + +A common pattern for a crunch job is to run one task to scan the input, and one task per input file to do the work. + +The @one_task_per_input_file()@ function implements this pattern. Pseudocode: + +
+if this is the job's first (default) task: + for each file in the 'input' collection: + queue a new task, with parameters['input'] = file + exit +else: + return ++ +Usage: + +
+import arvados +arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True) + +# Now do the work on a single file +my_input = this_task['parameters']['input'] ++ +h3. Set the current task's output and success flag + +Each task in a crunch job must make an API call to record its output and set its @success@ attribute to True. The object returned by @current_task()@ has a @set_output()@ method to make the process more succinct. + +
+arvados.current_task().set_output(my_output_locator) ++ +h3. arvados_ipc.py + +Manage child processes and FIFOs (pipes). + + +This module makes it easier to check the exit status of every child process you start, and close the unused end of each FIFO at the appropriate time. + +
+from arvados_ipc import * + +children = {} +pipes = {} + +pipe_setup(pipes, 'hellopipe') +if 0 == named_fork(children, 'child_a'): + pipe_closeallbut(pipes, ('hellopipe', 'w')) + os.write(pipes['hellopipe', 'w'], "Hello, parent.") + os._exit(0) + +pipe_closeallbut(pipes, ('hellopipe', 'r')) +with os.fdopen(pipes['hellopipe', 'r'], 'rb') as f: + message = f.read() + sys.stderr.write("Child says: " + message + "\n") + +if not waitpid_and_check_children(children): + raise Exception("Child process exited non-zero.") ++ +The "crunch scripts" bundled with Arvados contain more examples of using the @arvados_ipc@ module. + +h2(#toolkit_wrappers). Toolkit wrappers + +The following *arvados_∗.py* modules provide "extract, build, run" helpers to make it easy to incorporate common analysis tools in your crunch scripts. + +h3. arvados_bwa.py + +Build and run the "bwa":http://bio-bwa.sourceforge.net/bwa.shtml program. + +The module retrieves the bwa source code from Keep, using the job's @bwa_tbz@ parameter. + +
+import arvados_bwa +arvados_bwa.run('aln', [ref_basename, '-'], + stdin=open(fastq_filename,'rb'), + stdout=open(aln_filename,'wb')) ++ +On qr1hi.arvadosapi.com, the source distribution @bwa-0.7.5a.tar.bz2@ is available in the collection @8b6e2c4916133e1d859c9e812861ce13+70@. + +
+{ + "script_parameters":{ + "bwa_tbz":"8b6e2c4916133e1d859c9e812861ce13+70", + ... + }, + ... +} ++ +h3. arvados_gatk2.py + +Extract and run the "Genome Analysis Toolkit":http://www.broadinstitute.org/gatk/ programs. + +The module retrieves the binary distribution tarball from Keep, using the job's @gatk_tbz@ parameter. + +
+arvados_gatk2.run( + args=[ + '-nct', 8, + '-T', 'BaseRecalibrator', + '-R', ref_fasta_files[0], + '-I', input_bam_files[0], + '-o', recal_file, + ]) ++ +On qr1hi.arvadosapi.com, the binary distribution @GenomeAnalysisTK-2.6-4.tar.bz2@ is available in the collection @5790482512cf6d5d6dfd50b7fd61e1d1+86@. + +The GATK data bundle is available in the collection @d237a90bae3870b3b033aea1e99de4a9+10820@. + +
+{ + "script_parameters":{ + "gatk_tbz":"7e0a277d6d2353678a11f56bab3b13f2+87", + "gatk_bundle":"d237a90bae3870b3b033aea1e99de4a9+10820", + ... + }, + ... +} ++ +h3. arvados_samtools.py + +Build and run the "samtools":http://samtools.sourceforge.net/samtools.shtml program. + + +The module retrieves the samtools source code from Keep, using the job's @samtools_tgz@ parameter. + +
+import arvados_samtools +arvados_samtools.run('view', ['-S', '-b', '-'], + stdin=open(sam_filename,'rb'), + stdout=open(bam_filename,'wb')) ++ +On qr1hi.arvadosapi.com, the source distribution @samtools-0.1.19.tar.gz@ is available in the collection @c777e23cf13e5d5906abfdc08d84bfdb+74@. + +
+{ + "script_parameters":{ + "samtools_tgz":"c777e23cf13e5d5906abfdc08d84bfdb+74", + ... + }, + ... +} ++ + +h3. arvados_picard.py + +Extract and run the "picard":http://picard.sourceforge.net/command-line-overview.shtml program. + + +The module retrieves the picard binary distribution from Keep, using the job's @picard_zip@ parameter. + +
+import arvados_picard +arvados_picard.run( + 'FixMateInformation', + params={ + 'i': input_bam_path, + 'o': '/dev/stdout', + 'quiet': 'true', + 'so': 'coordinate', + 'validation_stringency': 'LENIENT', + 'compression_level': 0 + }, + stdout=open('out.bam','wb')) ++ +On qr1hi.arvadosapi.com, the binary distribution @picard-tools-1.82.zip@ is available in the collection @687f74675c6a0e925dec619cc2bec25f+77@. + +
+{ + "script_parameters":{ + "picard_zip":"687f74675c6a0e925dec619cc2bec25f+77", + ... + }, + ... +} ++ +