doc/_includes/_tutorial_hash_script_py.liquid

   1 #!/usr/bin/env python
   2 {% comment %}
   3 Copyright (C) The Arvados Authors. All rights reserved.
   4
   5 SPDX-License-Identifier: CC-BY-SA-3.0
   6 {% endcomment %}
   7
   8 import hashlib      # Import the hashlib module to compute MD5.
   9 import os           # Import the os module for basic path manipulation
  10 import arvados      # Import the Arvados sdk module
  11
  12 # Automatically parallelize this job by running one task per file.
  13 # This means that if the input consists of many files, each file will
  14 # be processed in parallel on different nodes enabling the job to
  15 # be completed quicker.
  16 arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
  17                                           input_as_path=True)
  18
  19 # Get object representing the current task
  20 this_task = arvados.current_task()
  21
  22 # Create the message digest object that will compute the MD5 hash
  23 digestor = hashlib.new('md5')
  24
  25 # Get the input file for the task
  26 input_id, input_path = this_task['parameters']['input'].split('/', 1)
  27
  28 # Open the input collection
  29 input_collection = arvados.CollectionReader(input_id)
  30
  31 # Open the input file for reading
  32 with input_collection.open(input_path) as input_file:
  33     for buf in input_file.readall():  # Iterate the file's data blocks
  34         digestor.update(buf)          # Update the MD5 hash object
  35
  36 # Write a new collection as output
  37 out = arvados.CollectionWriter()
  38
  39 # Write an output file with one line: the MD5 value and input path
  40 with out.open('md5sum.txt') as out_file:
  41     out_file.write("{} {}/{}\n".format(digestor.hexdigest(), input_id,
  42                                        os.path.normpath(input_path)))
  43
  44 # Commit the output to Keep.
  45 output_locator = out.finish()
  46
  47 # Use the resulting locator as the output for this task.
  48 this_task.set_output(output_locator)
  49
  50 # Done!