#!/usr/bin/env python

# Import the hashlib module (part of the Python standard library) to compute md5.
import hashlib

# Import the os module (also standard library) for path manipulation.
import os

# Import the Arvados SDK module.
import arvados

# Get information about the task from the environment.
this_task = arvados.current_task()

# Get the "input" field from "script_parameters" on the job creation object.
this_job_input = arvados.getjobparam('input')

# Create the object that provides access to the collection referred to in the input.
collection = arvados.CollectionReader(this_job_input)

# Create an object to write a new collection as output.
out = arvados.CollectionWriter()

# Set the name of the output file within the collection.
out.set_current_file_name("md5sum.txt")

# Get an iterator over the files listed in the collection.
all_files = collection.all_files()

# Iterate over each file.
for input_file in all_files:
    # Create the object that will actually compute the md5 hash.
    digestor = hashlib.new('md5')

    while True:
        # Read a 1 megabyte block from the file.
        buf = input_file.read(2**20)

        # Break when there is no more data left.
        if len(buf) == 0:
            break

        # Update the md5 hash object.
        digestor.update(buf)

    # Get the final hash code.
    hexdigest = digestor.hexdigest()

    # Get the file name from the StreamFileReader object.
    file_name = input_file.name()

    # The "stream name" is the subdirectory inside the collection in which
    # the file is located; '.' is the root of the collection.
    if input_file.stream_name() != '.':
        file_name = os.path.join(input_file.stream_name(), file_name)

    # Write an output line with the md5 value and file name.
    out.write("%s %s\n" % (hexdigest, file_name))

# Commit the output to Keep. This returns a Keep id.
output_id = out.finish()

# Set the output for this task to the Keep id.
this_task.set_output(output_id)

# Done!