12018: Enhanced readability of ProcessFile() function.
[arvados.git] / crunch_scripts / pgp-survey-import
1 #!/usr/bin/env python
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: Apache-2.0
5
6 import arvados
7 import string
8 import json
9 import UserDict
10 import sys
11
# Look up the job and task this script is running as; the job's 'input'
# script parameter identifies the collection to import.
this_job = arvados.current_job()
this_task = arvados.current_task()
script_parameters = this_job['script_parameters']
this_job_input = script_parameters['input']

# Everything the import touches is streamed into one output file as the
# elements of a JSON array.  The opening bracket is written now; `separator`
# is what gets written before each element, and is empty until the first
# element has been emitted.
out = arvados.CollectionWriter()
out.set_current_file_name("arvados_objects.json")
out.write("[\n")
separator = ""

# Trait records seen so far, keyed by trait name.
traits = dict()

# Progress bookkeeping: bytes consumed so far and the last ratio reported.
done_bytes = done_ratio = 0
23 done_ratio = 0
# Import each tab-separated file found in the job's input collection.
#
# Line 0 of every file is the heading row: the columns from the fourth onward
# are trait names.  Each of those traits is looked up (or created) and written
# to the output.  Every later line describes one human: the first column is
# the identifier used to find (or create) the human record, and the columns
# from the fourth onward are that human's values for the matching heading.
api = arvados.api('v1')
for input_file in arvados.CollectionReader(this_job_input).all_files():
    # The ratio is measured against this file's size, so the counters must
    # restart for every file; otherwise a second input file would report
    # progress above 100%.
    done_bytes = 0
    done_ratio = 0
    file_size = input_file.size()
    for line_number, line in enumerate(input_file.readlines()):

        done_bytes += len(line)
        new_done_ratio = 1.0 * done_bytes / file_size
        # Report once early (third line) and then after every further 5%.
        if line_number == 2 or new_done_ratio - done_ratio > 0.05:
            # Print the ratio just computed, not the previously reported one.
            sys.stderr.write("progress: %d%% after %d lines\n"
                             % (int(new_done_ratio * 100), line_number + 1))
            done_ratio = new_done_ratio

        words = line.strip().split("\t")

        if line_number == 0:
            headings = words
            # Fetch all already-known traits named in the heading row with a
            # single request.
            for t in api.traits().list(
                    where={'name': words},
                    limit=1000
                    ).execute()['items']:
                traits[t['name']] = t
            for trait_name in words[3:]:
                # find or create trait
                if trait_name not in traits:
                    traits_match = api.traits().list(
                        where={'name': trait_name}
                        ).execute()['items']
                    if traits_match:
                        traits[trait_name] = traits_match[0]
                    else:
                        # NOTE(review): this call passes the new record as
                        # trait= while every other create/update call in
                        # this script uses body= -- confirm which keyword
                        # the API client expects.
                        traits[trait_name] = api.traits().create(
                            trait={'name': trait_name}).execute()
                out.write(separator)
                out.write(json.dumps(traits[trait_name]))
                separator = ",\n"
            continue

        # Because the line was stripped before splitting, the first field is
        # empty only when the whole line is blank.  Skip such lines instead
        # of creating a human record with an empty identifier.
        if not words[0]:
            continue

        # Find the human this identifier points at, or create a new human
        # together with its identifier link.
        huID_links_match = api.links().list(
            where={'link_class': 'identifier', 'name': words[0]}
            ).execute()['items']
        if huID_links_match:
            human_uuid = huID_links_match[0]['head_uuid']
        else:
            human = api.humans().create(
                body={}
                ).execute()
            api.links().create(
                body={
                    'link_class': 'identifier',
                    'name': words[0],
                    'head_kind': 'arvados#human',
                    'head_uuid': human['uuid']
                    }
                ).execute()
            human_uuid = human['uuid']

        # Existing survey-response links for this human, keyed by trait uuid.
        human_trait = {}
        for t in api.links().list(
                limit=10000,
                where={
                    'tail_uuid': human_uuid,
                    'tail_kind': 'arvados#human',
                    'head_kind': 'arvados#trait',
                    'link_class': 'human_trait',
                    'name': 'pgp-survey-response'
                    }
                ).execute()['items']:
            human_trait[t['head_uuid']] = t

        for i, trait_value in enumerate(words[3:], start=3):
            trait_uuid = traits[headings[i]]['uuid']
            if trait_uuid in human_trait:
                trait_link = human_trait[trait_uuid]
                if trait_link['properties']['value'] != trait_value:
                    # update database value to match survey response
                    trait_link['properties']['value'] = trait_value
                    api.links().update(
                        uuid=trait_link['uuid'],
                        body={'properties': trait_link['properties']}
                        ).execute()
            elif trait_value == '':
                # nothing in database, nothing in input
                continue
            else:
                trait_link = {
                    'tail_uuid': human_uuid,
                    'tail_kind': 'arvados#human',
                    'head_uuid': trait_uuid,
                    'head_kind': 'arvados#trait',
                    'link_class': 'human_trait',
                    'name': 'pgp-survey-response',
                    'properties': {'value': trait_value}
                    }
                api.links().create(
                    body=trait_link
                    ).execute()
            # Use the shared separator so the output stays valid JSON even
            # when no trait record was written before the first link.
            out.write(separator)
            out.write(json.dumps(trait_link))
            separator = ",\n"
117
# Close the JSON array, finish the output collection, and hand whatever
# out.finish() returns to the current task as its output.
out.write("\n]\n")
finished_output = out.finish()
this_task.set_output(finished_output)