#!/usr/bin/env python

import hashlib  # Import the hashlib module to compute MD5.
import arvados  # Import the Arvados SDK module.

# Automatically parallelize this job by running one task per file.
# This means that if the input consists of many files, each file will
# be processed in parallel on different nodes, enabling the job to
# complete more quickly.
arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
                                          input_as_path=True)

# Create the message digest object that will compute the MD5 hash.
digestor = hashlib.new('md5')

# Get the input file for the task.
input_file = arvados.get_task_param_mount('input')

# Open the input file for reading in binary mode, so that hashlib
# receives bytes rather than text.
with open(input_file, 'rb') as f:
    while True:
        buf = f.read(2**20)  # Read a 1 MiB block from the file.
        if len(buf) == 0:    # Stop when there is no more data left.
            break
        digestor.update(buf)  # Update the MD5 hash object.

# Get the object representing the current task.
this_task = arvados.current_task()

# Write a new collection as output.
out = arvados.CollectionWriter()

# Set the output file name within the collection.
out.set_current_file_name("md5sum.txt")

# Write an output line with the MD5 value and the input parameter.
out.write("%s %s\n" % (digestor.hexdigest(), this_task['parameters']['input']))

# Commit the output to Keep. This returns a Keep locator.
output_id = out.finish()

# Set the output of this task to the Keep locator.
this_task.set_output(output_id)

# Done!
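
# --- Usage note (not part of the job) --------------------------------
# The chunked-read hashing pattern above can be sanity-checked locally,
# without an Arvados cluster. A minimal sketch using only the Python
# standard library (the helper name "md5_of_file" and the example path
# are illustrations, not part of the Arvados SDK):
#
#   import hashlib
#
#   def md5_of_file(path, chunk_size=2**20):
#       digestor = hashlib.new('md5')
#       with open(path, 'rb') as f:
#           # iter() with a sentinel yields blocks until read()
#           # returns the empty bytes object b'' at end of file.
#           for buf in iter(lambda: f.read(chunk_size), b''):
#               digestor.update(buf)
#       return digestor.hexdigest()
#
#   print(md5_of_file('/etc/hostname'))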