11162: Smarter http downloads.
author Peter Amstutz <pamstutz@veritasgenetics.com>
Wed, 23 May 2018 19:23:44 +0000 (15:23 -0400)
committer Peter Amstutz <pamstutz@veritasgenetics.com>
Thu, 24 May 2018 19:35:42 +0000 (15:35 -0400)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz@veritasgenetics.com>

sdk/cwl/arvados_cwl/http.py

index ab59ad3a5310c753189116e25f8373fd3246a5fa..ea77786c4c1806ba6e522bc4ac0eecaaf32655f5 100644 (file)
@@ -14,7 +14,11 @@ def my_formatdate(dt):
     return email.utils.formatdate(timeval=time.mktime(now.timetuple()), localtime=False, usegmt=True)
 
 def my_parsedate(text):
-    return datetime.datetime(*email.utils.parsedate(text)[:6])
+    parsed = email.utils.parsedate(text)
+    if parsed:
+        return datetime.datetime(*parsed[:6])
+    else:
+        datetime.datetime(1970, 1, 1)
 
 def fresh_cache(url, properties):
     pr = properties[url]
@@ -53,7 +57,7 @@ def remember_headers(url, properties, headers):
 
 
 def changed(url, properties):
-    req = requests.head(url)
+    req = requests.head(url, allow_redirects=True)
     remember_headers(url, properties, req.headers)
 
     if req.status_code != 200:
@@ -67,21 +71,22 @@ def changed(url, properties):
 
 def http_to_keep(api, project_uuid, url):
     r = api.collections().list(filters=[["properties", "exists", url]]).execute()
-    name = urlparse.urlparse(url).path.split("/")[-1]
 
     for item in r["items"]:
         properties = item["properties"]
         if fresh_cache(url, properties):
             # Do nothing
-            return "keep:%s/%s" % (item["portable_data_hash"], name)
+            cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
+            return "keep:%s/%s" % (item["portable_data_hash"], cr.keys()[0])
 
         if not changed(url, properties):
             # ETag didn't change, same content, just update headers
             api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
-            return "keep:%s/%s" % (item["portable_data_hash"], name)
+            cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
+            return "keep:%s/%s" % (item["portable_data_hash"], cr.keys()[0])
 
     properties = {}
-    req = requests.get(url, stream=True)
+    req = requests.get(url, stream=True, allow_redirects=True)
 
     if req.status_code != 200:
         raise Exception("Failed to download '%s' got status %s " % (req.status_code, url))
@@ -92,6 +97,15 @@ def http_to_keep(api, project_uuid, url):
 
     c = arvados.collection.Collection()
 
+    if req.headers.get("Content-Disposition"):
+        grp = re.search(r'filename=("((\"|[^"])+)"|([^][()<>@,;:\"/?={} ]+))', req.headers["Content-Disposition"])
+        if grp.groups(2):
+            name = grp.groups(2)
+        else:
+            name = grp.groups(3)
+    else:
+        name = urlparse.urlparse(url).path.split("/")[-1]
+
     count = 0
     start = time.time()
     checkpoint = start