X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/18258f6a3762ba7d83b05260b3c22f71423c0373..6a5064852f67f11915dd0c956128a6363f1d83c8:/doc/_includes/_tutorial_hash_script_py.liquid

diff --git a/doc/_includes/_tutorial_hash_script_py.liquid b/doc/_includes/_tutorial_hash_script_py.liquid
index f9b2ec0948..b9c7f31532 100644
--- a/doc/_includes/_tutorial_hash_script_py.liquid
+++ b/doc/_includes/_tutorial_hash_script_py.liquid
@@ -1,63 +1,45 @@
 #!/usr/bin/env python
 
-# Import the hashlib module (part of the Python standard library) to compute md5.
-import hashlib
+import hashlib      # Import the hashlib module to compute MD5.
+import arvados      # Import the Arvados sdk module
 
-# Import the Arvados sdk module
-import arvados
+# Automatically parallelize this job by running one task per file.
+# This means that if the input consists of many files, each file will
+# be processed in parallel on different nodes enabling the job to
+# be completed quicker.
+arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True,
+                                          input_as_path=True)
 
-# Get information about the task from the environment
-this_task = arvados.current_task()
-
-# Get the "input" field from "script_parameters" on the job creation object
-this_job_input = arvados.getjobparam('input')
-
-# Create the object access to the collection referred to in the input
-collection = arvados.CollectionReader(this_job_input)
-
-# Create an object to write a new collection as output
-out = arvados.CollectionWriter()
-
-# Set the name of output file within the collection
-out.set_current_file_name("md5sum.txt")
+# Create the message digest object that will compute the MD5 hash
+digestor = hashlib.new('md5')
 
-# Get an iterator over the files listed in the collection
-all_files = collection.all_files()
-
-# Iterate over each file
-for input_file in all_files:
-    # Create the object that will actually compute the md5 hash
-    digestor = hashlib.new('md5')
+# Get the input file for the task
+input_file = arvados.get_task_param_mount('input')
 
+# Open the input file for reading
+with open(input_file) as f:
     while True:
-        # read a 1 megabyte block from the file
-        buf = input_file.read(2**20)
-
-        # break when there is no more data left
-        if len(buf) == 0:
+        buf = f.read(2**20)       # read a 1 megabyte block from the file
+        if len(buf) == 0:         # break when there is no more data left
             break
+        digestor.update(buf)      # update the MD5 hash object
-        # update the md5 hash object
-        digestor.update(buf)
-
-    # Get the final hash code
-    hexdigest = digestor.hexdigest()
+# Get object representing the current task
+this_task = arvados.current_task()
 
-    # Get the file name from the StreamFileReader object
-    file_name = input_file.name()
+# Write a new collection as output
+out = arvados.CollectionWriter()
 
-    # The "stream name" is the subdirectory inside the collection in which
-    # the file is located; '.' is the root of the collection.
-    if input_file.stream_name() != '.':
-        file_name = os.join(input_file.stream_name(), file_name)
+# Set output file within the collection
+out.set_current_file_name("md5sum.txt")
 
-    # Write an output line with the md5 value and file name.
-    out.write("%s %s\n" % (hexdigest, file_name))
+# Write an output line with the MD5 value and input
+out.write("%s %s\n" % (digestor.hexdigest(), this_task['parameters']['input']))
 
 # Commit the output to keep. This returns a Keep id.
 output_id = out.finish()
 
 # Set the output for this task to the Keep id
-this_task.set_output(output_id)
+this_task.set_output(output_id)  # Done!
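
Note: the chunked MD5 pattern that the new version of the script relies on can be exercised on its own, outside of Arvados, using only the Python standard library. The following is a minimal sketch for illustration, not part of the tutorial; the file name "example.dat" is a placeholder, not an Arvados task mount.

#!/usr/bin/env python
import hashlib

input_file = "example.dat"    # placeholder path, not an Arvados mount

# Create the message digest object that will compute the MD5 hash
digestor = hashlib.new('md5')

# Read the file in 1 megabyte blocks so arbitrarily large files can be
# hashed without loading them into memory all at once.
with open(input_file, "rb") as f:
    while True:
        buf = f.read(2**20)
        if len(buf) == 0:
            break
        digestor.update(buf)

# Print the same "hash filename" line that the script writes to md5sum.txt
print("%s %s" % (digestor.hexdigest(), input_file))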