#!/usr/bin/env python
#
# Provenance: crunch_scripts/pgp-survey-import (new file, mode 100755)
#   From: Tom Clegg
#   Date: Fri, 28 Jun 2013 05:42:14 +0000 (-0400)
#   Subject: add pgp-survey-import
#   X-Git-Tag: 1.1.0~3144
#   X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/36f80655c840b0d7393f82056c05858e29c123cf
"""Import tab-separated survey responses into Arvados traits and links.

Every file in the job's ``script_parameters['input']`` collection is read
line by line:

* Line 0 of each file is the heading row.  Columns from index 3 onward
  are trait names; each one is looked up (or created) through the traits
  API and written to the output.
* Every other line is a response row.  Column 0 is matched against the
  ``name`` of an ``identifier`` link to find the human it points at; if no
  such link exists, a new human and identifier link are created.  For each
  column from index 3 onward, the matching ``human_trait`` link named
  ``pgp-survey-response`` is updated or created so that its
  ``properties['value']`` equals the survey value.  Empty values with no
  existing link are skipped.

Columns 1 and 2 are not used by this script.

The objects touched are written as a JSON array to
``arvados_objects.json`` in a new collection, which becomes the task
output.  Progress messages go to stderr.
"""

import arvados
import string
import json
import UserDict
import sys

# Index of the first column that holds a trait (heading row: trait name,
# response rows: trait value).
FIRST_TRAIT_COLUMN = 3

# Name given to every human -> trait link managed by this script.
RESPONSE_LINK_NAME = 'pgp-survey-response'

this_job = arvados.current_job()
this_task = arvados.current_task()
this_job_input = this_job['script_parameters']['input']

out = arvados.CollectionWriter()
out.set_current_file_name("arvados_objects.json")
out.write("[\n")
# Written before each trait object; empty only before the very first one.
separator = ""

# Trait objects seen so far, keyed by trait name (shared across input files).
traits = {}

for input_file in arvados.CollectionReader(this_job_input).all_files():
    # Progress is measured against this file's size, so the counters must
    # start over for every file.
    file_size = input_file.size()
    done_bytes = 0
    done_ratio = 0

    for line_number, line in enumerate(input_file.readlines()):
        done_bytes += len(line)
        new_done_ratio = 1.0 * done_bytes / file_size
        # Report once early on (third line), then after every further 5%.
        if line_number == 2 or new_done_ratio - done_ratio > 0.05:
            # Update first so the percentage matches the line count printed
            # next to it.
            done_ratio = new_done_ratio
            sys.stderr.write("progress: %d%% after %d lines\n"
                             % (int(done_ratio * 100), line_number + 1))

        # NOTE(review): strip() removes leading/trailing tabs as well as the
        # newline, so trailing empty columns are dropped from the row --
        # confirm that is intended.
        words = line.strip().split("\t")

        if line_number == 0:
            # Heading row: make sure a trait exists for every trait column.
            headings = words

            # Fetch already-known traits in one request before falling back
            # to per-name lookups below.
            known_traits = arvados.service.traits().list(
                where=json.dumps({'name': words}),
                limit=1000
            ).execute()['items']
            for t in known_traits:
                traits[t['name']] = t

            for trait_name in words[FIRST_TRAIT_COLUMN:]:
                # find or create trait
                if trait_name not in traits:
                    traits_match = arvados.service.traits().list(
                        where=json.dumps({'name': trait_name})
                    ).execute()['items']
                    if traits_match:
                        traits[trait_name] = traits_match[0]
                    else:
                        traits[trait_name] = arvados.service.traits().create(
                            trait=json.dumps({'name': trait_name})
                        ).execute()
                out.write(separator)
                out.write(json.dumps(traits[trait_name]))
                separator = ",\n"
            continue

        # Response row: resolve the human identified by the first column.
        huID_links_match = arvados.service.links().list(
            where=json.dumps({'link_class': 'identifier', 'name': words[0]})
        ).execute()['items']
        if huID_links_match:
            human_uuid = huID_links_match[0]['head_uuid']
        else:
            # Unknown identifier: create the human, then the identifier link
            # pointing at it.
            human = arvados.service.humans().create(
                human=json.dumps({})
            ).execute()
            arvados.service.links().create(
                link=json.dumps({
                    'link_class': 'identifier',
                    'name': words[0],
                    'head_kind': 'arvados#human',
                    'head_uuid': human['uuid']
                })
            ).execute()
            human_uuid = human['uuid']

        # Existing survey-response links for this human, keyed by trait uuid.
        human_trait = {}
        existing_links = arvados.service.links().list(
            limit=10000,
            where=json.dumps({
                'tail_uuid': human_uuid,
                'tail_kind': 'arvados#human',
                'head_kind': 'arvados#trait',
                'link_class': 'human_trait',
                'name': RESPONSE_LINK_NAME
            })
        ).execute()['items']
        for t in existing_links:
            human_trait[t['head_uuid']] = t

        for i, trait_value in enumerate(words[FIRST_TRAIT_COLUMN:],
                                        start=FIRST_TRAIT_COLUMN):
            trait_uuid = traits[headings[i]]['uuid']

            if trait_uuid in human_trait:
                trait_link = human_trait[trait_uuid]
                if trait_link['properties']['value'] != trait_value:
                    # update database value to match survey response
                    trait_link['properties']['value'] = trait_value
                    arvados.service.links().update(
                        uuid=trait_link['uuid'],
                        link=json.dumps(
                            {'properties': trait_link['properties']})
                    ).execute()
                out.write(",\n")
                out.write(json.dumps(trait_link))
            elif trait_value == '':
                # nothing in database, nothing in input
                pass
            else:
                trait_link = {
                    'tail_uuid': human_uuid,
                    'tail_kind': 'arvados#human',
                    'head_uuid': trait_uuid,
                    'head_kind': 'arvados#trait',
                    'link_class': 'human_trait',
                    'name': RESPONSE_LINK_NAME,
                    'properties': {'value': trait_value}
                }
                arvados.service.links().create(
                    link=json.dumps(trait_link)
                ).execute()
                # NOTE(review): the request body (not the create() response)
                # is what gets written to the output -- confirm intended.
                out.write(",\n")
                out.write(json.dumps(trait_link))

out.write("\n]\n")
this_task.set_output(out.finish())