"""Crunch script: compute the MD5 digest of each input file.

The job is split into one task per input file. Each task hashes its own
file and stores a one-line ``md5sum.txt`` (digest followed by the task's
``input`` parameter) as its output collection.
"""

import hashlib  # MD5 computation

import arvados  # Arvados SDK

# Size of each read from the input file: 1 MiB keeps memory use flat
# regardless of how large the input is.
BLOCK_SIZE = 2**20

# Automatically parallelize this job by running one task per file.
# If the input consists of many files, each file is processed by its own
# task, so the job can complete more quickly.
# NOTE(review): input_as_path=True is assumed to be the argument lost
# where this call was cut off; get_task_param_mount('input') below appears
# to rely on it -- confirm against the Arvados SDK documentation.
arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
                                          input_as_path=True)

# Create the message digest object that will compute the MD5 hash.
digestor = hashlib.new('md5')

# Get the input file for the task.
input_file = arvados.get_task_param_mount('input')

# Hash the file block by block. Binary mode guarantees that the digest is
# computed over the raw bytes (no newline translation or text decoding).
with open(input_file, 'rb') as f:
    # iter() with a sentinel keeps calling f.read() until it returns the
    # empty bytes object, i.e. until end of file.
    for buf in iter(lambda: f.read(BLOCK_SIZE), b''):
        digestor.update(buf)

# Get the object representing the current task.
this_task = arvados.current_task()

# Write a new collection as output.
out = arvados.CollectionWriter()

# Set the output file name within the collection.
out.set_current_file_name("md5sum.txt")

# Write an output line with the MD5 value and the task's input parameter.
out.write("%s %s\n" % (digestor.hexdigest(), this_task['parameters']['input']))

# Commit the output to Keep. This returns a Keep id.
output_id = out.finish()

# Set the output for this task to the Keep id.
this_task.set_output(output_id)