#!/usr/bin/env python
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Import tab-separated survey responses into Arvados traits and links.

The job's ``input`` script parameter names a collection.  Every file in
that collection is read as tab-separated text:

* Line 0 is the heading row.  Columns 3 and up are trait names; each one
  is looked up with the ``traits`` API and created if it is missing.
* Every other line is one response row.  Column 0 is used as the name of
  an ``identifier`` link pointing at a human record (created on demand).
  Columns 3 and up are the values for the traits named in the heading
  row, stored as ``human_trait`` links named ``pgp-survey-response``.

Every trait and trait link that is touched is also written, as one JSON
array, to ``arvados_objects.json`` in the task's output collection.
"""

import arvados
import string
import json
import UserDict
import sys

this_job = arvados.current_job()
this_task = arvados.current_task()
this_job_input = this_job['script_parameters']['input']

# Build the API client once instead of on every request inside the loops.
api = arvados.api('v1')

out = arvados.CollectionWriter()
out.set_current_file_name("arvados_objects.json")
out.write("[\n")
separator = ""


def emit(obj):
    """Append ``obj`` as the next element of the JSON array in ``out``.

    The separator starts out empty and becomes ",\\n" after the first
    element, so the array is well-formed no matter which kind of object
    happens to be written first.
    """
    global separator
    out.write(separator)
    out.write(json.dumps(obj))
    separator = ",\n"


# Trait records seen so far, keyed by trait name.
traits = {}

for input_file in arvados.CollectionReader(this_job_input).all_files():
    # Progress is measured against the current file, so the counters must
    # start over for each file; otherwise the ratio runs past 100% as soon
    # as a second file is processed.
    file_size = input_file.size()
    done_bytes = 0
    done_ratio = 0

    for line_number, line in enumerate(input_file.readlines()):
        done_bytes += len(line)
        new_done_ratio = 1.0 * done_bytes / file_size
        # Report once early on (third line) and then after every further
        # 5% of the file.
        if line_number == 2 or new_done_ratio - done_ratio > 0.05:
            done_ratio = new_done_ratio
            sys.stderr.write("progress: %d%% after %d lines\n" %
                             (int(done_ratio * 100), line_number + 1))

        # strip() removes surrounding whitespace (tabs included) before
        # the row is split into columns.
        words = line.strip().split("\t")

        if line_number == 0:
            # Heading row: remember the column names and make sure a
            # trait record exists for each trait column.
            headings = words

            # Fetch the already-known traits in a single request.
            known_traits = api.traits().list(
                where={'name': words},
                limit=1000
            ).execute()['items']
            for t in known_traits:
                traits[t['name']] = t

            for i, trait_name in enumerate(words[3:], start=3):
                # find or create trait
                if trait_name not in traits:
                    traits_match = api.traits().list(
                        where={'name': trait_name}
                    ).execute()['items']
                    if traits_match:
                        traits[trait_name] = traits_match[0]
                    else:
                        traits[trait_name] = api.traits().create(
                            trait={'name': trait_name}).execute()
                emit(traits[trait_name])
            continue

        # Response row: find the human this identifier refers to, or
        # create a new human and link the identifier to it.
        huID_links_match = api.links().list(
            where={'link_class': 'identifier', 'name': words[0]}
        ).execute()['items']
        if huID_links_match:
            human_uuid = huID_links_match[0]['head_uuid']
        else:
            human = api.humans().create(
                body={}
            ).execute()
            api.links().create(
                body={
                    'link_class': 'identifier',
                    'name': words[0],
                    'head_kind': 'arvados#human',
                    'head_uuid': human['uuid'],
                }
            ).execute()
            human_uuid = human['uuid']

        # Existing survey-response links for this human, keyed by the
        # uuid of the trait they point at.
        human_trait = {}
        existing_links = api.links().list(
            limit=10000,
            where={
                'tail_uuid': human_uuid,
                'tail_kind': 'arvados#human',
                'head_kind': 'arvados#trait',
                'link_class': 'human_trait',
                'name': 'pgp-survey-response',
            }
        ).execute()['items']
        for t in existing_links:
            human_trait[t['head_uuid']] = t

        for i, trait_value in enumerate(words[3:], start=3):
            trait_uuid = traits[headings[i]]['uuid']
            if trait_uuid in human_trait:
                trait_link = human_trait[trait_uuid]
                if trait_link['properties']['value'] != trait_value:
                    # update database value to match survey response
                    trait_link['properties']['value'] = trait_value
                    api.links().update(
                        uuid=trait_link['uuid'],
                        body={'properties': trait_link['properties']}
                    ).execute()
                # Existing links are written out whether or not they
                # needed an update.
                emit(trait_link)
            elif trait_value == '':
                # nothing in database, nothing in input
                pass
            else:
                trait_link = {
                    'tail_uuid': human_uuid,
                    'tail_kind': 'arvados#human',
                    'head_uuid': trait_uuid,
                    'head_kind': 'arvados#trait',
                    'link_class': 'human_trait',
                    'name': 'pgp-survey-response',
                    'properties': {'value': trait_value},
                }
                api.links().create(
                    body=trait_link
                ).execute()
                # The locally built dict is what gets written, not the
                # API response.
                emit(trait_link)

out.write("\n]\n")
this_task.set_output(out.finish())