add pgp-survey-import
author Tom Clegg <tom@clinicalfuture.com>
Fri, 28 Jun 2013 05:42:14 +0000 (01:42 -0400)
committer Tom Clegg <tom@clinicalfuture.com>
Fri, 28 Jun 2013 08:17:34 +0000 (04:17 -0400)
crunch_scripts/pgp-survey-import [new file with mode: 0755]

diff --git a/crunch_scripts/pgp-survey-import b/crunch_scripts/pgp-survey-import
new file mode 100755 (executable)
index 0000000..8d9b3d5
--- /dev/null
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+
+# Import tab-separated survey responses into Arvados.
+#
+# Each file in the job's "input" collection is read line by line and
+# split on tabs.  The first line of a file is its heading row: every
+# heading from column 3 onward names a trait, which is looked up and
+# created if it does not exist yet.  Each later line is one respondent:
+# column 0 is matched against 'identifier' links to find (or create) a
+# human, columns 1-2 are not used, and each value from column 3 onward
+# is stored in a 'pgp-survey-response' link (tail: human, head: trait).
+#
+# Every trait and link handled is also written as one element of a JSON
+# array in "arvados_objects.json", which becomes the task output.
+
+import arvados
+import string
+import json
+import UserDict
+import sys
+
+this_job = arvados.current_job()
+this_task = arvados.current_task()
+this_job_input = this_job['script_parameters']['input']
+
+out = arvados.CollectionWriter()
+out.set_current_file_name("arvados_objects.json")
+out.write("[\n")
+# Emitted before each JSON object: empty for the first one, ",\n" after.
+separator = ""
+
+# Trait records keyed by trait name, shared across all input files.
+traits = {}
+for input_file in arvados.CollectionReader(this_job_input).all_files():
+    # Progress is a fraction of this file's size, so restart the counters
+    # for each file instead of carrying bytes over from earlier files.
+    done_bytes = 0
+    done_ratio = 0
+    for line_number, line in enumerate(input_file.readlines()):
+
+        done_bytes += len(line)
+        new_done_ratio = 1.0 * done_bytes / input_file.size()
+        # Report once on the third line, then after each further 5%.
+        if line_number == 2 or new_done_ratio - done_ratio > 0.05:
+            # Update first so the message shows the current ratio, not
+            # the one recorded at the previous report.
+            done_ratio = new_done_ratio
+            sys.stderr.write("progress: %d%% after %d lines\n" % (int(done_ratio * 100), line_number+1))
+
+        # NOTE(review): strip() also removes leading/trailing tabs, so
+        # empty fields at either end of a row are dropped -- confirm
+        # that this is intended.
+        words = line.strip().split("\t")
+        if line_number == 0:
+            headings = words
+            # Bulk lookup of traits whose name matches any heading.
+            for t in arvados.service.traits().list(
+                where=json.dumps({'name':words}),
+                limit=1000
+                ).execute()['items']:
+                traits[t['name']] = t
+            for i, trait_name in enumerate(words[3:], start=3):
+                # find or create trait
+                if trait_name not in traits:
+                    traits_match = arvados.service.traits().list(
+                        where=json.dumps({'name':trait_name})
+                        ).execute()['items']
+                    if len(traits_match) > 0:
+                        traits[trait_name] = traits_match[0]
+                    else:
+                        traits[trait_name] = arvados.service.traits().create(
+                            trait=json.dumps({'name':trait_name})).execute()
+                out.write(separator)
+                out.write(json.dumps(traits[trait_name]))
+                separator = ",\n"
+        else:
+            # Find the human this identifier points at, or create one.
+            huID_links_match = arvados.service.links().list(
+                where=json.dumps({'link_class':'identifier','name':words[0]})
+                ).execute()['items']
+            if len(huID_links_match) > 0:
+                human_uuid = huID_links_match[0]['head_uuid']
+            else:
+                human = arvados.service.humans().create(
+                    human=json.dumps({})
+                    ).execute()
+                huID_link = arvados.service.links().create(
+                    link=json.dumps({
+                            'link_class':'identifier',
+                            'name':words[0],
+                            'head_kind':'arvados#human',
+                            'head_uuid':human['uuid']
+                            })
+                    ).execute()
+                human_uuid = human['uuid']
+            # Existing survey-response links for this human, by trait uuid.
+            human_trait = {}
+            for t in arvados.service.links().list(
+                limit=10000,
+                where=json.dumps({
+                    'tail_uuid':human_uuid,
+                    'tail_kind':'arvados#human',
+                    'head_kind':'arvados#trait',
+                    'link_class':'human_trait',
+                    'name':'pgp-survey-response'
+                    })
+                ).execute()['items']:
+                human_trait[t['head_uuid']] = t
+            for i, trait_value in enumerate(words[3:], start=3):
+                trait_uuid = traits[headings[i]]['uuid']
+                if trait_uuid in human_trait:
+                    trait_link = human_trait[trait_uuid]
+                    if trait_link['properties']['value'] != trait_value:
+                        # update database value to match survey response
+                        trait_link['properties']['value'] = trait_value
+                        arvados.service.links().update(
+                            uuid=trait_link['uuid'],
+                            link=json.dumps({'properties':trait_link['properties']})
+                            ).execute()
+                    out.write(separator)
+                    out.write(json.dumps(trait_link))
+                    separator = ",\n"
+                elif trait_value == '':
+                    # nothing in database, nothing in input
+                    pass
+                else:
+                    trait_link = {
+                        'tail_uuid':human_uuid,
+                        'tail_kind':'arvados#human',
+                        'head_uuid':trait_uuid,
+                        'head_kind':'arvados#trait',
+                        'link_class':'human_trait',
+                        'name':'pgp-survey-response',
+                        'properties': { 'value': trait_value }
+                        }
+                    arvados.service.links().create(
+                        link=json.dumps(trait_link)
+                        ).execute()
+                    out.write(separator)
+                    out.write(json.dumps(trait_link))
+                    separator = ",\n"
+
+out.write("\n]\n")
+this_task.set_output(out.finish())