# Crunch task script: compute the md5 hash of every file in the input
# collection and write one "HASH NAME" line per file to md5sum.txt in a
# new output collection, then register that collection as the task output.

# Import the hashlib module (part of the Python standard library) to compute md5.
import hashlib

# Import os so os.path.join can build sub-directory file names.
import os

# Import the Arvados sdk module
import arvados

# Get information about the task from the environment
this_task = arvados.current_task()

# Get the "input" field from "script_parameters" on the job creation object
this_job_input = arvados.getjobparam('input')

# Create the object access to the collection referred to in the input
collection = arvados.CollectionReader(this_job_input)

# Create an object to write a new collection as output
out = arvados.CollectionWriter()

# Set the name of output file within the collection
out.set_current_file_name("md5sum.txt")

# Get an iterator over the files listed in the collection
all_files = collection.all_files()

# Iterate over each file
for input_file in all_files:
    # Create the object that will actually compute the md5 hash
    digestor = hashlib.new('md5')

    while True:
        # read a 1 megabyte block from the file
        buf = input_file.read(2**20)

        # break when there is no more data left
        if len(buf) == 0:
            break

        # update the md5 hash object
        digestor.update(buf)

    # Get the final hash code
    hexdigest = digestor.hexdigest()

    # Get the file name from the StreamFileReader object
    file_name = input_file.name()

    # The "stream name" is the subdirectory inside the collection in which
    # the file is located; '.' is the root of the collection.
    if input_file.stream_name() != '.':
        # FIX: os.join does not exist (AttributeError); os.path.join is correct.
        file_name = os.path.join(input_file.stream_name(), file_name)

    # Write an output line with the md5 value and file name.
    out.write("%s %s\n" % (hexdigest, file_name))

# Commit the output to keep. This returns a Keep id.
output_id = out.finish()

# Set the output for this task to the Keep id
this_task.set_output(output_id)