Merge branch '11917-dont-clear-cache'
[arvados.git] / crunch_scripts / arvados_bwa.py
index b34d72fd92759b371e968a3ec3f2b9b25b7c3c47..aefc1f064b25d179c3a731470cc609b01ae48f95 100644 (file)
@@ -1,3 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
 import arvados
 import re
 import os
@@ -7,30 +11,47 @@ import subprocess
 
 bwa_install_path = None
 
-def setup():
+def install_path():
+    """
+    Extract the bwa source tree, build the bwa binary, and return the
+    path to the source tree.
+    """
     global bwa_install_path
     if bwa_install_path:
         return bwa_install_path
-    bwa_path = arvados.util.tarball_extract(
+
+    bwa_install_path = arvados.util.tarball_extract(
         tarball = arvados.current_job()['script_parameters']['bwa_tbz'],
         path = 'bwa')
 
     # build "bwa" binary
-    lockfile = open(os.path.split(bwa_path)[0] + '.bwa-make.lock',
+    lockfile = open(os.path.split(bwa_install_path)[0] + '.bwa-make.lock',
                     'w')
     fcntl.flock(lockfile, fcntl.LOCK_EX)
-    arvados.util.run_command(['make', '-j16'], cwd=bwa_path)
+    arvados.util.run_command(['make', '-j16'], cwd=bwa_install_path)
     lockfile.close()
 
-    bwa_install_path = bwa_path
-    return bwa_path
+    return bwa_install_path
 
 def bwa_binary():
-    global bwa_install_path
-    return os.path.join(bwa_install_path, 'bwa')
+    """
+    Return the path to the bwa executable.
+    """
+    return os.path.join(install_path(), 'bwa')
 
 def run(command, command_args, **kwargs):
-    global bwa_install_path
+    """
+    Build and run the bwa binary.
+
+    command is the bwa module, e.g., "index" or "aln".
+
+    command_args is a list of additional command line arguments, e.g.,
+    ['-a', 'bwtsw', 'ref.fasta']
+
+    It is assumed that we are running in a Crunch job environment, and
+    the job's "bwa_tbz" parameter is a collection containing the bwa
+    source tree in a .tbz file.
+    """
     execargs = [bwa_binary(),
                 command]
     execargs += command_args
@@ -43,6 +64,22 @@ def run(command, command_args, **kwargs):
         stdout=kwargs.get('stdout', sys.stderr))
 
 def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
+    """
+    Queue one task for each pair of fastq files in this job's input
+    collection.
+
+    Each new task will have two parameters, named "input_1" and
+    "input_2", each being a manifest containing a single fastq file.
+
+    A matching pair of files in the input collection is assumed to
+    have names "x_1.y" and "x_2.y".
+
+    Files in the input collection that are not part of a matched pair
+    are silently ignored.
+
+    if_sequence and and_end_task arguments have the same significance
+    as in arvados.job_setup.one_task_per_input_file().
+    """
     if if_sequence != arvados.current_task()['sequence']:
         return
     job_input = arvados.current_job()['script_parameters']['input']
@@ -76,5 +113,3 @@ def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
                                    body={'success':True}
                                    ).execute()
         exit(0)
-
-setup()