13924: Add index for PDH lookups.
[arvados.git] / crunch_scripts / hash
1 #!/usr/bin/env python                                                                                                                                                                            
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: Apache-2.0
5
6 import arvados
7 import hashlib
8 import os
9
# Crunch script: compute a hex digest of this task's input file and store it
# as the task output, in a one-line file inside a new collection.
#
# NOTE(review): presumably this call queues one task per input file and
# ends the initial task -- confirm against the arvados.job_setup docs.
arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, input_as_path=True)

this_job = arvados.current_job()
this_task = arvados.current_task()

# The hash algorithm comes from the job's optional 'algorithm' script
# parameter and defaults to md5.  hashlib.new() raises ValueError for an
# algorithm name it does not know.
script_parameters = this_job['script_parameters']
alg = script_parameters['algorithm'] if 'algorithm' in script_parameters else 'md5'
digestor = hashlib.new(alg)

input_file = arvados.get_task_param_mount('input')

# Open in binary mode: a digest must be computed over the file's raw bytes.
# Text mode would apply newline/encoding translation on some platforms and
# hands str (not bytes) to hashlib under Python 3, which update() rejects.
with open(input_file, 'rb') as f:
    # Read in 1 MiB (2**20 byte) chunks until read() returns an empty
    # result, so the whole file is never held in memory at once.
    for buf in iter(lambda: f.read(2**20), b''):
        digestor.update(buf)

hexdigest = digestor.hexdigest()

# Drop the first '/'-separated component of the task's 'input' parameter and
# keep the remainder as the name reported next to the digest.
# NOTE(review): the dropped component is presumably the collection
# locator -- verify against the code that creates the tasks.
file_name = '/'.join(this_task['parameters']['input'].split('/')[1:])

# The output file is always called "md5sum.txt", whichever algorithm was
# selected; the name is left unchanged so existing consumers keep working.
out = arvados.CollectionWriter()
out.set_current_file_name("md5sum.txt")
out.write("%s %s\n" % (hexdigest, file_name))
this_task.set_output(out.finish())