# Crunch task script: compute the md5 hash of every file in the input
# collection and write one "HASH NAME" line per file to md5sum.txt in a
# new output collection, then register that collection as the task output.

# Import the hashlib module (part of the Python standard library) to compute md5.
import hashlib

# Import os so os.path.join can build sub-directory file names.
import os

# Import the Arvados sdk module
import arvados

# Get information about the task from the environment
this_task = arvados.current_task()

# Get the "input" field from "script_parameters" on the job creation object
this_job_input = arvados.getjobparam('input')

# Create the object access to the collection referred to in the input
collection = arvados.CollectionReader(this_job_input)

# Create an object to write a new collection as output
out = arvados.CollectionWriter()

# Set the name of output file within the collection
out.set_current_file_name("md5sum.txt")

# Get an iterator over the files listed in the collection
all_files = collection.all_files()

# Iterate over each file
for input_file in all_files:
    # Create the object that will actually compute the md5 hash
    digestor = hashlib.new('md5')

    while True:
        # read a 1 megabyte block from the file
        buf = input_file.read(2**20)

        # break when there is no more data left
        if len(buf) == 0:
            break

        # update the md5 hash object
        digestor.update(buf)

    # Get the final hash code
    hexdigest = digestor.hexdigest()

    # Get the file name from the StreamFileReader object
    file_name = input_file.name()

    # The "stream name" is the subdirectory inside the collection in which
    # the file is located; '.' is the root of the collection.
    if input_file.stream_name() != '.':
        # FIX: os.join does not exist (AttributeError); os.path.join is correct.
        file_name = os.path.join(input_file.stream_name(), file_name)

    # Write an output line with the md5 value and file name.
    out.write("%s %s\n" % (hexdigest, file_name))

# Commit the output to keep. This returns a Keep id.
output_id = out.finish()

# Set the output for this task to the Keep id
this_task.set_output(output_id)