11162: Support public http and https file references
[arvados.git] / sdk / cwl / arvados_cwl / http.py
1 import requests
2 import email.utils
3 import time
4 import datetime
5 import re
6 import arvados
7 import arvados.collection
8 import urlparse
9
def my_formatdate(dt):
    """Format a datetime as an HTTP date string (e.g. for a "Date" header).

    :param dt: a naive ``datetime.datetime`` expressed in UTC (the caller in
        this file passes ``datetime.datetime.utcnow()``).
    :returns: a string such as ``"Mon, 01 Jan 2018 00:00:00 GMT"``.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import calendar

    # calendar.timegm() interprets the time tuple as UTC.  time.mktime() would
    # interpret it as *local* time and shift the result by the UTC offset.
    return email.utils.formatdate(timeval=calendar.timegm(dt.timetuple()),
                                  localtime=False, usegmt=True)
12
def my_parsedate(text):
    """Parse an RFC 2822 / HTTP date string into a naive datetime.

    Only the first six fields (year through second) of the parsed time tuple
    are used; any timezone information in ``text`` is not applied.

    :param text: date string, e.g. ``"Mon, 01 Jan 2018 00:00:00 GMT"``.
    :returns: ``datetime.datetime`` built from the parsed fields.
    :raises TypeError: if ``email.utils.parsedate`` cannot parse ``text``
        (it returns None, which cannot be sliced).
    """
    time_tuple = email.utils.parsedate(text)
    year, month, day, hour, minute, second = time_tuple[:6]
    return datetime.datetime(year, month, day, hour, minute, second)
15
def fresh_cache(url, properties):
    """Decide whether the cached copy of ``url`` can be used without revalidation.

    :param url: key into ``properties``.
    :param properties: mapping of url -> dict of remembered response headers
        ("Cache-Control", "Expires", "Date", ...).
    :returns: True if the remembered headers say the content is still fresh
        (``immutable``, or an expiry time that is still in the future),
        False otherwise.
    :raises KeyError: if ``url`` is not a key of ``properties``.
    """
    pr = properties[url]
    expires = None
    max_age = None

    cache_control = pr.get("Cache-Control")
    if cache_control is not None:
        # Cache-Control is a comma-separated list of directives, so the one we
        # want may appear anywhere in the value (e.g. "public, max-age=3600").
        # re.search() is used because re.match() only matches at position 0.
        if re.search(r"\bimmutable\b", cache_control, re.IGNORECASE):
            return True

        max_age = re.search(r"\b(s-maxage|max-age)=(\d+)", cache_control, re.IGNORECASE)

    try:
        # max-age is relative to the response's Date; without a remembered
        # Date there is nothing to add it to, so fall through to Expires.
        if max_age and "Date" in pr:
            expires = my_parsedate(pr["Date"]) + datetime.timedelta(seconds=int(max_age.group(2)))

        if expires is None and "Expires" in pr:
            expires = my_parsedate(pr["Expires"])
    except (TypeError, ValueError):
        # Unparsable dates (servers commonly send "Expires: 0" or "-1") are
        # treated as already expired rather than crashing the lookup.
        return False

    if expires is None:
        return False

    return datetime.datetime.utcnow() < expires
35
def remember_headers(url, properties, headers):
    """Record the caching-related response headers for ``url``.

    Copies "Cache-Control", "ETag", "Expires" and "Date" from ``headers`` into
    ``properties[url]`` (creating that entry if needed).  Headers absent from
    ``headers`` leave any previously stored value untouched.  If the response
    has no "Date" header, the current UTC time is stored instead.

    :param url: key under which the headers are stored.
    :param properties: dict of url -> remembered headers; modified in place.
    :param headers: mapping of response header names to values.
    """
    remembered = properties.setdefault(url, {})
    remembered.update({
        field: headers[field]
        for field in ("Cache-Control", "ETag", "Expires", "Date")
        if field in headers
    })
    if "Date" not in headers:
        remembered["Date"] = my_formatdate(datetime.datetime.utcnow())
43
44
def changed(url, properties):
    """Revalidate a cached ``url`` with a HEAD request.

    Compares the ETag remembered in ``properties[url]`` with the ETag the
    server reports now, and refreshes the remembered headers in place.

    :param url: URL to check; also the key into ``properties``.
    :param properties: dict of url -> remembered headers; updated in place
        with the headers of the HEAD response.
    :returns: False if both the remembered and the current response carry an
        ETag and they are equal; True otherwise (including when either side
        has no ETag, since sameness cannot be established).
    :raises Exception: if the HEAD request does not return status 200.
    """
    # requests.head() does not follow redirects by default, while the
    # requests.get() used for the actual download does; follow them here too
    # so a redirecting URL is not reported as an error.
    req = requests.head(url, allow_redirects=True)

    # Check the status first so headers from an error response are never
    # recorded as cache metadata.
    if req.status_code != 200:
        raise Exception("Got status %s" % req.status_code)

    # Capture the previously remembered ETag BEFORE remember_headers()
    # overwrites it; comparing afterwards would compare the new ETag with
    # itself and always report "unchanged".
    previous_etag = properties.get(url, {}).get("ETag")
    remember_headers(url, properties, req.headers)

    if previous_etag is not None and "ETag" in req.headers:
        if previous_etag == req.headers["ETag"]:
            return False
    return True
57
def http_to_keep(api, project_uuid, url):
    """Return a "keep:" reference for the file at ``url``, downloading if needed.

    Existing collections whose properties mention ``url`` are reused when
    their remembered cache headers are still fresh, or when a HEAD request
    shows the ETag is unchanged (in which case the stored headers are
    updated).  Otherwise the URL is downloaded into a new collection owned by
    ``project_uuid`` and the response's cache headers are stored in that
    collection's properties.

    :param api: API client object exposing ``collections().list/update``
        (presumably an Arvados API client -- confirm against the caller).
    :param project_uuid: owner passed to ``save_new`` for a newly created
        collection.
    :param url: http(s) URL of the file.
    :returns: string of the form ``"keep:<portable_data_hash>/<name>"`` where
        ``name`` is the last path component of ``url``.
    :raises Exception: if the GET (or a revalidating HEAD) does not return
        status 200.
    """
    r = api.collections().list(filters=[["properties", "exists", url]]).execute()
    # NOTE(review): name is empty when the URL path ends in "/"; behavior of
    # c.open("") in that case is not visible here -- confirm.
    name = urlparse.urlparse(url).path.split("/")[-1]

    for item in r["items"]:
        properties = item["properties"]
        if fresh_cache(url, properties):
            # Cached copy is still fresh: use it as-is.
            return "keep:%s/%s" % (item["portable_data_hash"], name)

        if not changed(url, properties):
            # ETag didn't change, same content, just update headers
            api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
            return "keep:%s/%s" % (item["portable_data_hash"], name)

    properties = {}
    # 1 MiB per read: the original 128-byte chunks add heavy per-chunk
    # overhead on large downloads without changing the bytes written.
    chunk_size = 1024 * 1024

    req = requests.get(url, stream=True)
    try:
        if req.status_code != 200:
            raise Exception("Got status %s" % req.status_code)

        remember_headers(url, properties, req.headers)

        c = arvados.collection.Collection()

        with c.open(name, "w") as f:
            for chunk in req.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    finally:
        # With stream=True the connection stays checked out until the body is
        # fully consumed or the response is closed; always release it.
        req.close()

    c.save_new(name="Downloaded from %s" % url, owner_uuid=project_uuid)

    # Attach the remembered headers so later calls can find and revalidate
    # this collection by URL.
    api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()

    return "keep:%s/%s" % (c.portable_data_hash(), name)