#!/usr/bin/env python

import arvados
import os
import re
import sys
import pyrtg

# Task script: extract the 'fastq' directory of the job's input collection,
# convert every paired-end FASTQ pair (<name>_1.* / <name>_2.*) found there
# to an RTG SDF with `rtg format`, split the resulting SDFs with
# `rtg sdfsplit`, and store the split output as this task's output.

this_job = arvados.current_job()
this_task = arvados.current_task()
fastq_path = arvados.util.collection_extract(
    collection=this_job['script_parameters']['input'],
    path='fastq')

# Materialize the listing as a sorted list rather than a filter() result:
# it is both iterated and used for membership tests below (a one-shot
# iterator would be consumed by the first `in` check on Python 3), and
# sorting makes the numbering of the temporary SDF directories -- and hence
# the order handed to sdfsplit -- independent of os.listdir() ordering.
fastq_files = sorted(f for f in os.listdir(fastq_path) if f != '.locator')
fastq_names = frozenset(fastq_files)

tmp_dir_base = os.path.join(this_task.tmpdir, 'tmp')
out_dir = os.path.join(this_task.tmpdir, 'reads')

# Start from a clean slate: remove anything left at these paths, then
# recreate only the temp base (out_dir is passed to sdfsplit uncreated).
arvados.util.run_command(['rm', '-rf', tmp_dir_base], stderr=sys.stderr)
arvados.util.run_command(['rm', '-rf', out_dir], stderr=sys.stderr)
os.mkdir(tmp_dir_base)

# Matches left-arm file names ending in _1.fq, _1.fastq, _1.fq.gz or
# _1.fastq.gz. The dots are escaped so they only match a literal '.';
# group 1 captures the extension so it can be re-attached to the '_2' name.
left_arm_re = re.compile(r'_1(\.f(?:ast)?q(?:\.gz)?)$')

# convert fastq to sdf
tmp_dirs = []
for leftarm in fastq_files:
    if not left_arm_re.search(leftarm):
        continue
    rightarm = left_arm_re.sub(r'_2\1', leftarm)
    if rightarm not in fastq_names:
        # Left arm without a matching right arm: skipped, as before.
        continue
    # One numbered SDF directory per pair: <tmp_dir_base>/00000000, ...
    sdf_dir = '%s/%08d' % (tmp_dir_base, len(tmp_dirs))
    tmp_dirs.append(sdf_dir)
    pyrtg.run_rtg('format', sdf_dir,
                  ['-f', 'fastq',
                   '-q', 'sanger',
                   '-l', os.path.join(fastq_path, leftarm),
                   '-r', os.path.join(fastq_path, rightarm)])

# split sdf
# NOTE(review): '-n 1500000' is presumably the number of reads per output
# SDF -- confirm against the RTG sdfsplit documentation.
pyrtg.run_rtg('sdfsplit', out_dir,
              ['-n', '1500000'] + tmp_dirs)

# store output
out = arvados.CollectionWriter()
out.write_directory_tree(out_dir, max_manifest_depth=1)
this_task.set_output(out.finish())