X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/1c5176d87df0dbd25db6ff1fb2ab82ae17472145..0eb72b526bf8bbb011551ecf019f604e17a534f1:/doc/_includes/_tutorial_hash_script_py.liquid diff --git a/doc/_includes/_tutorial_hash_script_py.liquid b/doc/_includes/_tutorial_hash_script_py.liquid index b9c7f31532..9eacb763dc 100644 --- a/doc/_includes/_tutorial_hash_script_py.liquid +++ b/doc/_includes/_tutorial_hash_script_py.liquid @@ -1,45 +1,50 @@ #!/usr/bin/env python +{% comment %} +Copyright (C) The Arvados Authors. All rights reserved. + +SPDX-License-Identifier: CC-BY-SA-3.0 +{% endcomment %} import hashlib # Import the hashlib module to compute MD5. +import os # Import the os module for basic path manipulation import arvados # Import the Arvados sdk module # Automatically parallelize this job by running one task per file. # This means that if the input consists of many files, each file will -# be processed in parallel on different nodes enabling the job to +# be processed in parallel on different nodes enabling the job to # be completed quicker. -arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, +arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, input_as_path=True) +# Get object representing the current task +this_task = arvados.current_task() + # Create the message digest object that will compute the MD5 hash digestor = hashlib.new('md5') # Get the input file for the task -input_file = arvados.get_task_param_mount('input') +input_id, input_path = this_task['parameters']['input'].split('/', 1) -# Open the input file for reading -with open(input_file) as f: - while True: - buf = f.read(2**20) # read a 1 megabyte block from the file - if len(buf) == 0: # break when there is no more data left - break - digestor.update(buf) # update the MD5 hash object +# Open the input collection +input_collection = arvados.CollectionReader(input_id) -# Get object representing the current task -this_task = arvados.current_task() +# Open the input file for reading +with input_collection.open(input_path) as input_file: + for buf in input_file.readall(): # Iterate the file's data blocks + digestor.update(buf) # Update the MD5 hash object - # Write a new collection as output +# Write a new collection as output out = arvados.CollectionWriter() - # Set output file within the collection -out.set_current_file_name("md5sum.txt") - -# Write an output line with the MD5 value and input -out.write("%s %s\n" % (digestor.hexdigest(), this_task['parameters']['input'])) +# Write an output file with one line: the MD5 value and input path +with out.open('md5sum.txt') as out_file: + out_file.write("{} {}/{}\n".format(digestor.hexdigest(), input_id, + os.path.normpath(input_path))) - # Commit the output to keep. This returns a Keep id. -output_id = out.finish() +# Commit the output to Keep. +output_locator = out.finish() -# Set the output for this task to the Keep id -this_task.set_output(output_id) +# Use the resulting locator as the output for this task. +this_task.set_output(output_locator) # Done!