From 9afff42e4ae2e4a17388db30e24f1ca3344cb1e9 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Wed, 10 Jul 2013 04:23:41 -0400 Subject: [PATCH] extend pgp data tutorial --- doc/user/tutorial-trait-search.textile | 126 ++++++++++++++++++++++++- 1 file changed, 121 insertions(+), 5 deletions(-) diff --git a/doc/user/tutorial-trait-search.textile b/doc/user/tutorial-trait-search.textile index 058a19dc64..d347927d31 100644 --- a/doc/user/tutorial-trait-search.textile +++ b/doc/user/tutorial-trait-search.textile @@ -101,17 +101,19 @@ u'1h9kt-7a9it-q3w6j9od4ibpoyl', u'1h9kt-7a9it-qz8vzkuuz97ezwv', u'1h9kt-7a9it-t1v8sjz6dm9jmjf', u'1h9kt-7a9it-qe8wrbyvuqs5jew'] -h3. Find huIDs. +h3. Find PGP IDs. -For now we don't need to look up the Human objects themselves. We just need to look up "identifier" links to find their huIDs: +For now we don't need to look up the Human objects themselves. + +As an aside, we will look up "identifier" links to find PGP-assigned participant identifiers:
 human_uuids = map(lambda l: l['tail_uuid'], trait_links)
-huid_links = arvados.service.links().list(limit=1000,where=json.dumps({
+pgpid_links = arvados.service.links().list(limit=1000,where=json.dumps({
     "link_class": "identifier",
     "head_uuid": human_uuids
   })).execute()['items']
-map(lambda l: l['name'], huid_links)
+map(lambda l: l['name'], pgpid_links)
 
↓ @@ -125,9 +127,123 @@ map(lambda l: l['name'], huid_links) u'huDF04CC', u'huE2E371'] -These huIDs let us find public profiles: +These PGP IDs let us find public profiles: * "https://my.personalgenomes.org/profile/huE2E371":https://my.personalgenomes.org/profile/huE2E371 * "https://my.personalgenomes.org/profile/huDF04CC":https://my.personalgenomes.org/profile/huDF04CC * ... +h3. Find data. + +Find Collections that were provided by these Humans. + +
+provenance_links = arvados.service.links().list(where=json.dumps({
+    "link_class": "provenance",
+    "name": "provided",
+    "tail_uuid": human_uuids
+  })).execute()['items']
+collection_uuids = map(lambda l: l['head_uuid'], provenance_links)
+
+# build map of human uuid -> PGP ID
+pgpid = {}
+for pgpid_link in pgpid_links:
+  pgpid[pgpid_link['head_uuid']] = pgpid_link['name']
+
+# build map of collection uuid -> PGP ID
+for p_link in provenance_links:
+  pgpid[p_link['head_uuid']] = pgpid[p_link['tail_uuid']]
+
+# get details (e.g., list of files) of each collection
+collections = arvados.service.collections().list(where=json.dumps({
+    "uuid": collection_uuids
+  })).execute()['items']
+
+# print PGP public profile links with file locators
+for c in collections:
+  for f in c['files']:
+    print "https://my.personalgenomes.org/profile/%s %s %s%s" % (pgpid[c['uuid']], c['uuid'], ('' if f[0] == '.' else f[0]+'/'), f[1])
+
+
+ +↓ + +
+https://my.personalgenomes.org/profile/hu43860C a58dca7609fa84c8c38a7e926a97b2fc+302+K@qr1hi var-GS00253-DNA_A01_200_37-ASM.tsv.bz2
+https://my.personalgenomes.org/profile/huB1FD55 ea30eb9e46eedf7f05ed6e348c2baf5d+291+K@qr1hi var-GS000010320-ASM.tsv.bz2
+https://my.personalgenomes.org/profile/huDF04CC 4ab0df8f22f595d1747a22c476c05873+242+K@qr1hi var-GS000010427-ASM.tsv.bz2
+https://my.personalgenomes.org/profile/hu7A2F1D 756d0ada29b376140f64e7abfe6aa0e7+242+K@qr1hi var-GS000014566-ASM.tsv.bz2
+https://my.personalgenomes.org/profile/hu553620 7ed4e425bb1c7cc18387cbd9388181df+242+K@qr1hi var-GS000015272-ASM.tsv.bz2
+https://my.personalgenomes.org/profile/huD09534 542112e210daff30dd3cfea4801a9f2f+242+K@qr1hi var-GS000016374-ASM.tsv.bz2
+https://my.personalgenomes.org/profile/hu599905 33a9f3842b01ea3fdf27cc582f5ea2af+242+K@qr1hi var-GS000016015-ASM.tsv.bz2
+https://my.personalgenomes.org/profile/hu599905 d6e2e57cd60ba5979006d0b03e45e726+81+K@qr1hi Witch_results.zip
+https://my.personalgenomes.org/profile/hu553620 ea4f2d325592a1272f989d141a917fdd+85+K@qr1hi Devenwood_results.zip
+https://my.personalgenomes.org/profile/hu7A2F1D 4580f6620bb15b25b18373766e14e4a7+85+K@qr1hi Innkeeper_results.zip
+https://my.personalgenomes.org/profile/huD09534 fee37be9440b912eb90f5e779f272416+82+K@qr1hi Hallet_results.zip
+
+ +h3. Search for a variant. + +Look for variant rs1126809 in each of the "var" files (these contain variant calls from WGS data). + +
+job = {}
+for c in collections:
+  if [] != filter(lambda f: re.search('^var-.*\.tsv\.bz2', f[1]), c['files']):
+    job[c['uuid']] = arvados.service.jobs().create(job=json.dumps({
+      'script': 'grep',
+      'script_parameters': {'input': c['uuid'], 'pattern': "rs1126809\\b"},
+      'script_version': 'e7aeb42'
+    })).execute()
+    print "%s %s" % (pgpid[c['uuid']], job[c['uuid']]['uuid'])
+
+
+ +↓ + +
+hu43860C qr1hi-8i9sb-wyqq2eji4ehiwkq
+huB1FD55 qr1hi-8i9sb-ep68uf0jkj3je7q
+huDF04CC qr1hi-8i9sb-4ts4cvx6mbtcrsk
+hu7A2F1D qr1hi-8i9sb-5lkiu9sh7vdgven
+hu553620 qr1hi-8i9sb-nu4p6hjmziic022
+huD09534 qr1hi-8i9sb-bt9389e9g3ff0m1
+hu599905 qr1hi-8i9sb-ocg0i8r75luvke3
+
+ +Check job progress: + +
+[arvados.service.jobs().get(uuid=j['uuid']).execute()['success'] for j in job.values()]
+
+ +↓ + +
+[True, True, True, True, True, True, True]
+
+ +After the jobs have completed, check output file sizes. + +
+for collection_uuid in job:
+  job_uuid = job[collection_uuid]['uuid']
+  job_output = arvados.service.jobs().get(uuid=job_uuid).execute()['output']
+  output_files = arvados.service.collections().get(uuid=job_output).execute()['files']
+  print "%s %3d %s" % (pgpid[collection_uuid], output_files[0][2], job_output)
+
+
+ +↓ + +
+hu599905  80 5644238bfb2a1925d423f2c264819cfb+75+K@qr1hi
+huD09534  80 f98f92573cf521333607910d320cc33b+75+K@qr1hi
+huB1FD55   0 c10e07d8d90b51ee7f3b0a5855dc77c3+65+K@qr1hi
+hu7A2F1D  80 922c4ce8d3dab3268edf8b9312cc63d4+75+K@qr1hi
+hu553620   0 66da988f45a7ee16b6058fcbe9859d69+65+K@qr1hi
+huDF04CC  80 bbe919451a437dde236a561d4e469ad2+75+K@qr1hi
+hu43860C   0 45797e38410de9b9ddef2f4f0ec41a93+76+K@qr1hi
+
+ +Thus, of the 7 WGS results available for PGP participants reporting non-melanoma skin cancer, 4 include the rs1126809 / TYR-R402Q variant. -- 2.30.2