def etag_quote(etag):
    """Return *etag* in the quoted form expected in an If-None-Match header.

    A value that is already a well-formed entity-tag is returned unchanged:
    either a quoted opaque tag ('"abc"') or a weak validator ('W/"abc"').
    Anything else, including the empty string, is wrapped in double quotes.

    Args:
        etag: The ETag value as a string, with or without surrounding quotes.

    Returns:
        The entity-tag string with surrounding double quotes.
    """
    # A weak validator carries a 'W/' prefix in front of the quoted opaque
    # tag; look past the prefix when deciding whether quotes are present so
    # that 'W/"abc"' is not wrapped in a second pair of quotes.
    opaque = etag[2:] if etag.startswith('W/') else etag

    # Require at least two characters so that an empty string does not raise
    # and a lone '"' is not mistaken for an opening *and* closing quote.
    if len(opaque) >= 2 and opaque.startswith('"') and opaque.endswith('"'):
        return etag

    # Add quotes.
    return '"' + etag + '"'
+174,19 @@ def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow): logger.info("%d downloaded, %3.2f MiB/s", count, (bps / (1024*1024))) checkpoint = loopnow - c.save_new(name="Downloaded from %s" % url, owner_uuid=project_uuid, ensure_unique_name=True) + logger.info("Download complete") + + collectionname = "Downloaded from %s" % urllib.parse.quote(url, safe='') + + # max length - space to add a timestamp used by ensure_unique_name + max_name_len = 254 - 28 + + if len(collectionname) > max_name_len: + over = len(collectionname) - max_name_len + split = int(max_name_len/2) + collectionname = collectionname[0:split] + "…" + collectionname[split+over:] + + c.save_new(name=collectionname, owner_uuid=project_uuid, ensure_unique_name=True) api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()