import urllib.parse
import logging
import calendar
+import urllib.parse
logger = logging.getLogger('arvados.cwl-runner')
remember_headers(url, properties, req.headers, now)
if req.status_code != 200:
- raise Exception("Got status %s" % req.status_code)
+ # Sometimes endpoints are misconfigured and will deny HEAD but
+ # allow GET so instead of failing here, we'll try GET If-None-Match
+ return True
pr = properties[url]
if "ETag" in pr and "ETag" in req.headers:
return True
def etag_quote(etag):
    """Return *etag* wrapped in double quotes, suitable for use in HTTP
    conditional-request headers such as If-None-Match.

    If the value already has both a leading and a trailing quote it is
    returned unchanged, so an already-quoted ETag is not double-quoted.
    """
    # The length check guards against IndexError on an empty string and
    # against treating a lone '"' character as "already quoted".
    if len(etag) >= 2 and etag[0] == '"' and etag[-1] == '"':
        # already has leading and trailing quotes, do nothing
        return etag
    # Add quotes.
    return '"' + etag + '"'
# NOTE(review): this span is a unified-diff fragment, not runnable Python.
# '+'-prefixed lines are additions, '-'-prefixed lines removals, and chunks
# of the original body are elided between hunks -- count, bps, loopnow,
# checkpoint and the collection object `c` are defined in lines not shown.
#
# Download `url` into a Keep collection and return a keep: reference,
# reusing a fresh cached copy or revalidating stale copies via ETag.
def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
# Find existing collections whose properties record this URL.
r = api.collections().list(filters=[["properties", "exists", url]]).execute()
now = utcnow()
+ etags = {}
+
for item in r["items"]:
properties = item["properties"]
# Cache hit that is still fresh: return the keep: reference directly.
if fresh_cache(url, properties, now):
cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
# Stale entries with a usable ETag become candidates for revalidation.
# (len > 2 presumably skips empty/bare-quote values -- TODO confirm.)
+ if "ETag" in properties and len(properties["ETag"]) > 2:
+ etags[properties["ETag"]] = item
+
properties = {}
- req = requests.get(url, stream=True, allow_redirects=True)
+ headers = {}
+ if etags:
# Send every known ETag so the server may answer 304 Not Modified.
+ headers['If-None-Match'] = ', '.join([etag_quote(k) for k,v in etags.items()])
+ req = requests.get(url, stream=True, allow_redirects=True, headers=headers)
- if req.status_code != 200:
# 304 now means "cached copy still valid", so it is no longer an error.
+ if req.status_code not in (200, 304):
raise Exception("Failed to download '%s' got status %s " % (url, req.status_code))
remember_headers(url, properties, req.headers, now)
# On 304, refresh the matching cached collection's properties and reuse it
# instead of re-downloading.
+ if req.status_code == 304 and "ETag" in req.headers and req.headers["ETag"] in etags:
+ item = etags[req.headers["ETag"]]
+ item["properties"].update(properties)
+ api.collections().update(uuid=item["uuid"], body={"collection":{"properties": item["properties"]}}).execute()
+ cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
+ return "keep:%s/%s" % (item["portable_data_hash"], list(cr.keys())[0])
+
if "Content-Length" in properties[url]:
cl = int(properties[url]["Content-Length"])
logger.info("Downloading %s (%s bytes)", url, cl)
# (download loop elided by the diff here; count, bps, loopnow, checkpoint
# and `c` come from the omitted lines)
logger.info("%d downloaded, %3.2f MiB/s", count, (bps / (1024*1024)))
checkpoint = loopnow
- c.save_new(name="Downloaded from %s" % url, owner_uuid=project_uuid, ensure_unique_name=True)
+ logger.info("Download complete")
+
# Percent-encode the URL so characters like '/' are safe in a collection name.
+ collectionname = "Downloaded from %s" % urllib.parse.quote(url, safe='')
+
+ # max length - space to add a timestamp used by ensure_unique_name
+ max_name_len = 254 - 28
+
# Truncate over-long names in the middle, keeping both ends visible.
+ if len(collectionname) > max_name_len:
+ over = len(collectionname) - max_name_len
+ split = int(max_name_len/2)
+ collectionname = collectionname[0:split] + "…" + collectionname[split+over:]
+
+ c.save_new(name=collectionname, owner_uuid=project_uuid, ensure_unique_name=True)
api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()