#!/usr/bin/env python
-# Import the hashlib module (part of the Python standard library) to compute md5.
-import hashlib
-
-# Import the Arvados sdk module
-import arvados
-
-# Get information about the task from the environment
+import hashlib # Import the hashlib module to compute MD5.
+import os # Import the os module for basic path manipulation.
+import arvados # Import the Arvados sdk module
+
+# Automatically parallelize this job by running one task per file.
+# This means that if the input consists of many files, each file will
+# be processed in parallel on different nodes, enabling the job to
+# complete more quickly.
+arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
+ input_as_path=True)
+
+# Get object representing the current task
this_task = arvados.current_task()
-# Get the "input" field from "script_parameters" on the job creation object
-this_job_input = arvados.getjobparam('input')
-
-# Create the object access to the collection referred to in the input
-collection = arvados.CollectionReader(this_job_input)
-
-# Create an object to write a new collection as output
-out = arvados.CollectionWriter()
-
-# Set the name of output file within the collection
-out.set_current_file_name("md5sum.txt")
+# Create the message digest object that will compute the MD5 hash
+digestor = hashlib.new('md5')
-# Get an iterator over the files listed in the collection
-all_files = collection.all_files()
+# Get the input file for the task
+input_id, input_path = this_task['parameters']['input'].split('/', 1)
-# Iterate over each file
-for input_file in all_files:
- # Create the object that will actually compute the md5 hash
- digestor = hashlib.new('md5')
+# Open the input collection
+input_collection = arvados.CollectionReader(input_id)
- while True:
- # read a 1 megabyte block from the file
- buf = input_file.read(2**20)
+# Open the input file for reading
+with input_collection.open(input_path) as input_file:
+ for buf in input_file.readall(): # Iterate the file's data blocks
+ digestor.update(buf) # Update the MD5 hash object
- # break when there is no more data left
- if len(buf) == 0:
- break
-
- # update the md5 hash object
- digestor.update(buf)
-
- # Get the final hash code
- hexdigest = digestor.hexdigest()
-
- # Get the file name from the StreamFileReader object
- file_name = input_file.name()
-
- # The "stream name" is the subdirectory inside the collection in which
- # the file is located; '.' is the root of the collection.
- if input_file.stream_name() != '.':
- file_name = os.join(input_file.stream_name(), file_name)
+# Write a new collection as output
+out = arvados.CollectionWriter()
- # Write an output line with the md5 value and file name.
- out.write("%s %s\n" % (hexdigest, file_name))
+# Write an output file with one line: the MD5 value and input path
+with out.open('md5sum.txt') as out_file:
+ out_file.write("{} {}/{}\n".format(digestor.hexdigest(), input_id,
+ os.path.normpath(input_path)))
-# Commit the output to keep. This returns a Keep id.
-output_id = out.finish()
+# Commit the output to Keep.
+output_locator = out.finish()
-# Set the output for this task to the Keep id
-this_task.set_output(output_id)
+# Use the resulting locator as the output for this task.
+this_task.set_output(output_locator)
# Done!