Merge branch '8497-datamanager-batchsize-1000' of https://github.com/wtsi-hgi/arvados...
[arvados.git] / crunch_scripts / pgp-survey-import
#!/usr/bin/env python
"""Crunch script: import tab-separated survey responses as Arvados objects.

Every file in the job's ``input`` collection is read line by line:

* The first line of each file is a heading row.  Columns from index 3
  onward are trait names; each one is looked up via the ``traits`` API and
  created if no trait with that name exists.
* Every following line is one respondent.  Column 0 is matched against
  ``identifier`` links to find the corresponding human (a new human and
  identifier link are created when there is no match).  Columns from index
  3 onward are the values for the traits named in the heading row; they are
  stored as ``human_trait`` links named ``pgp-survey-response``, updating
  links whose stored value differs and creating links that do not exist.

Every trait and every trait link touched is appended to a JSON array in
``arvados_objects.json``, which becomes the task output.
"""

import arvados
import string
import json
import UserDict
import sys

this_job = arvados.current_job()
this_task = arvados.current_task()
this_job_input = this_job['script_parameters']['input']

# One API client object, reused for every request below instead of calling
# arvados.api('v1') again inside the nested loops.
api = arvados.api('v1')

out = arvados.CollectionWriter()
out.set_current_file_name("arvados_objects.json")
out.write("[\n")
separator = ""


def write_object(record):
    """Append one record to the JSON array in the output file.

    The separator is empty before the first record and ",\\n" afterwards, so
    the array stays well formed no matter which kind of record comes first.
    """
    global separator
    out.write(separator)
    out.write(json.dumps(record))
    separator = ",\n"


# Trait records keyed by trait name, shared across all input files.
traits = {}

for input_file in arvados.CollectionReader(this_job_input).all_files():
    # Progress is reported relative to the current file's size, so the
    # counters start over for each file.
    done_bytes = 0
    done_ratio = 0
    file_size = input_file.size()

    for line_number, line in enumerate(input_file.readlines()):

        done_bytes += len(line)
        new_done_ratio = 1.0 * done_bytes / file_size
        # Report once early (third line) and then after every further 5%.
        if line_number == 2 or new_done_ratio - done_ratio > 0.05:
            done_ratio = new_done_ratio
            sys.stderr.write("progress: %d%% after %d lines\n" % (int(done_ratio * 100), line_number+1))

        stripped = line.strip()
        words = stripped.split("\t")

        if line_number == 0:
            # Heading row: remember the column names and make sure a trait
            # record exists for every trait column.
            headings = words
            # Fetch as many of the named traits as possible in one request.
            for t in api.traits().list(
                where={'name':words},
                limit=1000
                ).execute()['items']:
                traits[t['name']] = t
            for trait_name in words[3:]:
                # find or create trait
                if trait_name not in traits:
                    traits_match = api.traits().list(
                        where={'name':trait_name}
                        ).execute()['items']
                    if traits_match:
                        traits[trait_name] = traits_match[0]
                    else:
                        # body= matches the other create() calls in this
                        # script (humans, links).
                        traits[trait_name] = api.traits().create(
                            body={'name':trait_name}).execute()
                write_object(traits[trait_name])
            continue

        if not stripped:
            # A blank row carries no identifier; processing it would look up
            # or create a human whose identifier is the empty string.
            continue

        # Find the human this row belongs to, creating one if necessary.
        huID_links_match = api.links().list(
            where={'link_class':'identifier','name':words[0]}
            ).execute()['items']
        if huID_links_match:
            human_uuid = huID_links_match[0]['head_uuid']
        else:
            human = api.humans().create(
                body={}
                ).execute()
            api.links().create(
                body={
                    'link_class':'identifier',
                    'name':words[0],
                    'head_kind':'arvados#human',
                    'head_uuid':human['uuid']
                    }
                ).execute()
            human_uuid = human['uuid']

        # Existing survey-response links for this human, keyed by trait uuid.
        human_trait = {}
        for t in api.links().list(
            limit=10000,
            where={
                'tail_uuid':human_uuid,
                'tail_kind':'arvados#human',
                'head_kind':'arvados#trait',
                'link_class':'human_trait',
                'name':'pgp-survey-response'
                }
            ).execute()['items']:
            human_trait[t['head_uuid']] = t

        for i, trait_value in enumerate(words[3:], start=3):
            trait_uuid = traits[headings[i]]['uuid']
            if trait_uuid in human_trait:
                trait_link = human_trait[trait_uuid]
                if trait_link['properties']['value'] != trait_value:
                    # update database value to match survey response
                    trait_link['properties']['value'] = trait_value
                    api.links().update(
                        uuid=trait_link['uuid'],
                        body={'properties':trait_link['properties']}
                        ).execute()
                write_object(trait_link)
            elif trait_value == '':
                # nothing in database, nothing in input
                pass
            else:
                trait_link = {
                    'tail_uuid':human_uuid,
                    'tail_kind':'arvados#human',
                    'head_uuid':trait_uuid,
                    'head_kind':'arvados#trait',
                    'link_class':'human_trait',
                    'name':'pgp-survey-response',
                    'properties': { 'value': trait_value }
                    }
                api.links().create(
                    body=trait_link
                    ).execute()
                # NOTE(review): the record written here is the request body,
                # not the API response, so it carries no uuid -- confirm
                # that is what consumers of arvados_objects.json expect.
                write_object(trait_link)

out.write("\n]\n")
this_task.set_output(out.finish())