4363: Accept manifest_text in a non-UTF-8 encoding that is equal to its UTF-8 encoding.
[arvados.git] / crunch_scripts / split-fastq.py
index 8382eb13f95caeebd1440e9e37eda0afa9863824..17aabf2930393a48d3539483d5198cab5af35631 100755 (executable)
@@ -88,42 +88,42 @@ def splitfastq(p):
             if count % 10000 == 0:
                 print >>sys.stderr, "Record %s at %s" % (count, p[i]["end"])
 
-prog = re.compile(r'(.*?)(_12)?\.fastq(\.gz)?$')
+prog = re.compile(r'(.*?)(_[12])?\.fastq(\.gz)?$')
 
 # Look for fastq files
 for s in inp.all_streams():
     for f in s.all_files():
         name_pieces = prog.match(f.name())
-        if name_pieces != None:
+        if name_pieces is not None:
             if s.name() != ".":
                 # The downstream tool (run-command) only iterates over the top
                 # level of directories so if there are fastq files in
                 # directories in the input, the choice is either to forget
                 # there are directories (which might lead to name conflicts) or
                 # just fail.
-                print >>sys.stderr, "fastq must be at the root of the collection")
+                print >>sys.stderr, "fastq must be at the root of the collection"
                 sys.exit(1)
 
-            if name_pieces.group(2) != None:
+            p = None
+            if name_pieces.group(2) is not None:
                 if name_pieces.group(2) == "_1":
                     p = [{}, {}]
                     p[0]["reader"] = s.files()[name_pieces.group(0)]
-                    if name_pieces.group(2) != None:
-                        p[1]["reader"] = s.files()[name_pieces.group(1) + "_2.fastq" + name_pieces.group(2)]
-                    else:
-                        p[1]["reader"] = s.files()[name_pieces.group(1) + "_2.fastq"]
+                    p[1]["reader"] = s.files()[name_pieces.group(1) + "_2.fastq" + (name_pieces.group(3) if name_pieces.group(3) else '')]
             else:
                 p = [{}]
                 p[0]["reader"] = s.files()[name_pieces.group(0)]
 
-            if chunking:
-                splitfastq(p)
-            else:
-                for i in xrange(0, len(p)):
-                    m = p[i]["reader"].as_manifest()[1:]
-                    manifest_list.append(["./_" + str(piece), m[:-1]])
-                piece += 1
-
-manifest_text = "\n".join(" ".join(m) for m in manifest_list)
+            if p is not None:
+                if chunking:
+                    splitfastq(p)
+                else:
+                    for i in xrange(0, len(p)):
+                        m = p[i]["reader"].as_manifest().split()
+                        m[0] = "./_" + str(piece)
+                        manifest_list.append(m)
+                    piece += 1
+
+manifest_text = "\n".join(" ".join(m) for m in manifest_list) + "\n"
 
 arvados.current_task().set_output(manifest_text)