X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/b54a5ea817d3d2087eaa07dcf98ec8a82af56d06..9cc572d6a44262e21251372e28b549cfc09e681a:/doc/_includes/_tutorial_hash_script_py.liquid diff --git a/doc/_includes/_tutorial_hash_script_py.liquid b/doc/_includes/_tutorial_hash_script_py.liquid index f9b2ec0948..9eacb763dc 100644 --- a/doc/_includes/_tutorial_hash_script_py.liquid +++ b/doc/_includes/_tutorial_hash_script_py.liquid @@ -1,63 +1,50 @@ #!/usr/bin/env python +{% comment %} +Copyright (C) The Arvados Authors. All rights reserved. -# Import the hashlib module (part of the Python standard library) to compute md5. -import hashlib +SPDX-License-Identifier: CC-BY-SA-3.0 +{% endcomment %} -# Import the Arvados sdk module -import arvados +import hashlib # Import the hashlib module to compute MD5. +import os # Import the os module for basic path manipulation +import arvados # Import the Arvados sdk module -# Get information about the task from the environment -this_task = arvados.current_task() - -# Get the "input" field from "script_parameters" on the job creation object -this_job_input = arvados.getjobparam('input') - -# Create the object access to the collection referred to in the input -collection = arvados.CollectionReader(this_job_input) - -# Create an object to write a new collection as output -out = arvados.CollectionWriter() +# Automatically parallelize this job by running one task per file. +# This means that if the input consists of many files, each file will +# be processed in parallel on different nodes enabling the job to +# be completed quicker. +arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, + input_as_path=True) -# Set the name of output file within the collection -out.set_current_file_name("md5sum.txt") - -# Get an iterator over the files listed in the collection -all_files = collection.all_files() - -# Iterate over each file -for input_file in all_files: - # Create the object that will actually compute the md5 hash - digestor = hashlib.new('md5') - - while True: - # read a 1 megabyte block from the file - buf = input_file.read(2**20) +# Get object representing the current task +this_task = arvados.current_task() - # break when there is no more data left - if len(buf) == 0: - break +# Create the message digest object that will compute the MD5 hash +digestor = hashlib.new('md5') - # update the md5 hash object - digestor.update(buf) +# Get the input file for the task +input_id, input_path = this_task['parameters']['input'].split('/', 1) - # Get the final hash code - hexdigest = digestor.hexdigest() +# Open the input collection +input_collection = arvados.CollectionReader(input_id) - # Get the file name from the StreamFileReader object - file_name = input_file.name() +# Open the input file for reading +with input_collection.open(input_path) as input_file: + for buf in input_file.readall(): # Iterate the file's data blocks + digestor.update(buf) # Update the MD5 hash object - # The "stream name" is the subdirectory inside the collection in which - # the file is located; '.' is the root of the collection. - if input_file.stream_name() != '.': - file_name = os.join(input_file.stream_name(), file_name) +# Write a new collection as output +out = arvados.CollectionWriter() - # Write an output line with the md5 value and file name. - out.write("%s %s\n" % (hexdigest, file_name)) +# Write an output file with one line: the MD5 value and input path +with out.open('md5sum.txt') as out_file: + out_file.write("{} {}/{}\n".format(digestor.hexdigest(), input_id, + os.path.normpath(input_path))) -# Commit the output to keep. This returns a Keep id. -output_id = out.finish() +# Commit the output to Keep. +output_locator = out.finish() -# Set the output for this task to the Keep id -this_task.set_output(output_id) +# Use the resulting locator as the output for this task. +this_task.set_output(output_locator) # Done!