doc/_includes/_tutorial_hash_script_py.liquid

   1 #!/usr/bin/env python
   2
   3 import hashlib      # Import the hashlib module to compute md5.
   4 import arvados      # Import the Arvados sdk module
   5
   6 # Automatically parallelize this job by running one task per file.
   7 # This means that if the input consists of many files, each file will
   8 # be processed in parallel on different nodes enabling the job to
   9 # be completed quicker.
  10 arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
  11                                           input_as_path=True)
  12
  13 # Create the message digest object that will compute the md5 hash
  14 digestor = hashlib.new('md5')
  15
  16 # Get the input file for the task
  17 input_file = arvados.get_task_param_mount('input')
  18
  19 # Open the input file for reading
  20 with open(input_file) as f:
  21     while True:
  22         buf = f.read(2**20)      # read a 1 megabyte block from the file
  23         if len(buf) == 0:        # break when there is no more data left
  24             break
  25         digestor.update(buf)     # update the md5 hash object
  26
  27 # Get object representing the current task
  28 this_task = arvados.current_task()
  29
  30  # Write a new collection as output
  31 out = arvados.CollectionWriter()
  32
  33  # Set output file within the collection
  34 out.set_current_file_name("md5sum.txt")
  35
  36 # Write an output line with the md5 value and input
  37 out.write("%s %s\n" % (digestor.hexdigest(), this_task['parameters']['input']))
  38
  39  # Commit the output to keep.  This returns a Keep id.
  40 output_id = out.finish()
  41
  42 # Set the output for this task to the Keep id
  43 this_task.set_output(output_id)
  44
  45 # Done!