import hashlib  # Compute the MD5 digest of the input file.
import os       # Basic path manipulation (normpath on the output line).
import arvados  # Arvados SDK: task scheduling, Keep collections.

# Automatically parallelize this job by running one task per input file.
# If the input consists of many files, each file is processed in parallel
# on a different node, so the whole job completes sooner.
# input_as_path=True makes each task's 'input' parameter the string
# "<collection locator>/<path within collection>", which is what the
# split('/', 1) below relies on.
arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
                                          input_as_path=True)

# Get the object representing the current task.
this_task = arvados.current_task()

# Create the message digest object that will compute the MD5 hash.
digestor = hashlib.new('md5')

# Get the input file for the task: split the "locator/path" parameter
# into the collection locator and the file path inside it.
input_id, input_path = this_task['parameters']['input'].split('/', 1)

# Open the input collection.
input_collection = arvados.CollectionReader(input_id)

# Open the input file for reading and hash it block by block, so large
# files never need to be held in memory all at once.
with input_collection.open(input_path) as input_file:
    for buf in input_file.readall():  # Iterate the file's data blocks
        digestor.update(buf)          # Fold each block into the MD5 hash

# Write a new collection as output.
out = arvados.CollectionWriter()

# Write an output file with one line: the MD5 value and the input path.
with out.open('md5sum.txt') as out_file:
    out_file.write("{} {}/{}\n".format(digestor.hexdigest(), input_id,
                                       os.path.normpath(input_path)))

# Commit the output to Keep.
output_locator = out.finish()

# Use the resulting locator as the output for this task.
this_task.set_output(output_locator)