#!/usr/bin/env python

import hashlib  # Import the hashlib module to compute MD5.
import arvados  # Import the Arvados SDK module.

# Automatically parallelize this job by running one task per file.
# This means that if the input consists of many files, each file will
# be processed in parallel on different nodes, enabling the job to
# complete more quickly.
arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
                                          input_as_path=True)

# Create the message digest object that will compute the MD5 hash.
digestor = hashlib.new('md5')

# Get the input file for the task.
input_file = arvados.get_task_param_mount('input')

# Open the input file for reading in binary mode, so that hashlib
# receives bytes rather than text.
with open(input_file, 'rb') as f:
    while True:
        buf = f.read(2**20)  # Read a 1 MiB block from the file.
        if len(buf) == 0:    # Stop when there is no more data left.
            break
        digestor.update(buf)  # Update the MD5 hash object.

# Get the object representing the current task.
this_task = arvados.current_task()

# Write a new collection as output.
out = arvados.CollectionWriter()

# Set the output file name within the collection.
out.set_current_file_name("md5sum.txt")

# Write an output line with the MD5 value and the input parameter.
out.write("%s %s\n" % (digestor.hexdigest(), this_task['parameters']['input']))

# Commit the output to Keep. This returns a Keep locator.
output_id = out.finish()

# Set the output of this task to the Keep locator.
this_task.set_output(output_id)

# Done!
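
# --- Usage note (not part of the job) --------------------------------
# The chunked-read hashing pattern above can be sanity-checked locally,
# without an Arvados cluster. A minimal sketch using only the Python
# standard library (the helper name "md5_of_file" and the example path
# are illustrations, not part of the Arvados SDK):
#
#   import hashlib
#
#   def md5_of_file(path, chunk_size=2**20):
#       digestor = hashlib.new('md5')
#       with open(path, 'rb') as f:
#           # iter() with a sentinel yields blocks until read()
#           # returns the empty bytes object b'' at end of file.
#           for buf in iter(lambda: f.read(chunk_size), b''):
#               digestor.update(buf)
#       return digestor.hexdigest()
#
#   print(md5_of_file('/etc/hostname'))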