X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/0bd1c28bed9a0756c61037947d5a9dccd5066f00..b4091adb7ac1a85de6ae1f18895e9d8f9da5d441:/doc/_includes/_concurrent_hash_script_py.liquid

diff --git a/doc/_includes/_concurrent_hash_script_py.liquid b/doc/_includes/_concurrent_hash_script_py.liquid
index a914e0482a..2c55298841 100644
--- a/doc/_includes/_concurrent_hash_script_py.liquid
+++ b/doc/_includes/_concurrent_hash_script_py.liquid
@@ -1,6 +1,12 @@
 #!/usr/bin/env python
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
 
 import hashlib
+import os
 import arvados
 
 # Jobs consist of one or more tasks.  A task is a single invocation of
@@ -11,7 +17,7 @@ this_task = arvados.current_task()
 
 # Tasks have a sequence number for ordering.  All tasks
 # with the current sequence number must finish successfully
-# before tasks in the next sequence are started. 
+# before tasks in the next sequence are started.
 # The first task has sequence number 0
 if this_task['sequence'] == 0:
     # Get the "input" field from "script_parameters" on the task object
@@ -21,7 +27,7 @@ if this_task['sequence'] == 0:
     cr = arvados.CollectionReader(job_input)
 
     # Loop over each stream in the collection (a stream is a subset of
-    # files that logically represents a directory
+    # files that logically represents a directory)
     for s in cr.all_streams():
 
         # Loop over each file in the stream
@@ -62,29 +68,21 @@ else:
 
     collection = arvados.CollectionReader(this_task_input)
 
-    out = arvados.CollectionWriter()
-    out.set_current_file_name("md5sum.txt")
-
     # There should only be one file in the collection, so get the
-    # first one.  collection.all_files() returns an iterator so we
-    # need to make it into a list for indexed access.
-    input_file = list(collection.all_files())[0]
+    # first one from the all_files() iterator.
+    input_file = next(collection.all_files())
+    output_path = os.path.normpath(os.path.join(input_file.stream_name(),
+                                                input_file.name))
 
     # Everything after this is the same as the first tutorial.
     digestor = hashlib.new('md5')
-
-    while True:
-        buf = input_file.read(2**20)
-        if len(buf) == 0:
-            break
+    for buf in input_file.readall():
         digestor.update(buf)
 
-    hexdigest = digestor.hexdigest()
-    file_name = input_file.name()
-    if input_file.stream_name() != '.':
-        file_name = os.join(input_file.stream_name(), file_name)
-    out.write("%s %s\n" % (hexdigest, file_name))
-    output_id = out.finish()
-    this_task.set_output(output_id)
+    out = arvados.CollectionWriter()
+    with out.open('md5sum.txt') as out_file:
+        out_file.write("{} {}\n".format(digestor.hexdigest(), output_path))
+
+    this_task.set_output(out.finish())
 
 # Done!