#!/usr/bin/env python

import hashlib  # Import the hashlib module to compute MD5.
import os       # Import the os module for basic path manipulation.
import arvados  # Import the Arvados SDK module.

# Automatically parallelize this job by running one task per file.
# This means that if the input consists of many files, each file will
# be processed in parallel on different nodes, enabling the job to
# complete more quickly.
arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
                                          input_as_path=True)

# Get the object representing the current task.
this_task = arvados.current_task()

# Create the message digest object that will compute the MD5 hash.
digestor = hashlib.new('md5')

# Get the input file for the task: the 'input' parameter has the form
# "<collection locator>/<path within collection>".
input_id, input_path = this_task['parameters']['input'].split('/', 1)

# Open the input collection.
input_collection = arvados.CollectionReader(input_id)

# Open the input file for reading.
with input_collection.open(input_path) as input_file:
    for buf in input_file.readall():  # Iterate over the file's data blocks.
        digestor.update(buf)          # Update the MD5 hash object.

# Write a new collection as output.
out = arvados.CollectionWriter()

# Write an output file with one line: the MD5 value and the input path.
with out.open('md5sum.txt') as out_file:
    out_file.write("{} {}/{}\n".format(digestor.hexdigest(), input_id,
                                       os.path.normpath(input_path)))

# Commit the output to Keep.
output_locator = out.finish()

# Use the resulting locator as the output for this task.
this_task.set_output(output_locator)

# Done!
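
# For reference, the block-by-block MD5 streaming pattern used above can
# be sanity-checked locally without Arvados. A minimal sketch, assuming a
# plain file on disk ('example.txt' is a hypothetical path, not part of
# this task), kept commented out so it does not run inside Crunch:
#
#   import hashlib
#   digestor = hashlib.new('md5')
#   with open('example.txt', 'rb') as f:
#       for buf in iter(lambda: f.read(2 ** 20), b''):  # Read 1 MiB chunks.
#           digestor.update(buf)
#   print(digestor.hexdigest())  # Should match the digest reported by md5sum.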