# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

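"""Download a file over HTTP and store it in Keep, with the HTTP
response headers recorded as collection properties so that later
fetches of the same URL can reuse the cached copy."""
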
import calendar
import dataclasses
import datetime
import email.utils
import logging
import re
import time
import typing
import urllib.parse

import pycurl

import arvados
import arvados.collection
from arvados._pycurlhelper import PyCurlHelper

logger = logging.getLogger('arvados.http_import')

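# Format a datetime as an RFC 2822 date string in GMT, suitable for use
# in HTTP headers such as "Date".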
def _my_formatdate(dt):
    return email.utils.formatdate(timeval=calendar.timegm(dt.timetuple()),
                                  localtime=False, usegmt=True)

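# Parse an RFC 2822 date header into a naive UTC datetime.  Falls back
# to the epoch if the value cannot be parsed.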
def _my_parsedate(text):
    parsed = email.utils.parsedate_tz(text)
    if parsed:
        if parsed[9]:
            # parsed[9] is the offset from UTC in seconds; subtract it
            # to convert to UTC.
            return datetime.datetime(*parsed[:6]) - datetime.timedelta(seconds=parsed[9])
        else:
            # TZ is zero or missing, assume UTC.
            return datetime.datetime(*parsed[:6])
    else:
        return datetime.datetime(1970, 1, 1)

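# Decide whether the cached copy of url is still fresh, based on the
# HTTP caching headers that were stored as collection properties.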
def _fresh_cache(url, properties, now):
    pr = properties[url]
    expires = None

    logger.debug("Checking cache freshness for %s using %s", url, pr)

    if "Cache-Control" in pr:
        # Use re.search so directives are recognized anywhere in the
        # header value, e.g. "public, max-age=3600".
        if re.search(r"immutable", pr["Cache-Control"]):
            return True

        g = re.search(r"(s-maxage|max-age)=(\d+)", pr["Cache-Control"])
        if g:
            expires = _my_parsedate(pr["Date"]) + datetime.timedelta(seconds=int(g.group(2)))

    if expires is None and "Expires" in pr:
        expires = _my_parsedate(pr["Expires"])

    if expires is None:
        # Use a default cache time of 24 hours if upstream didn't set
        # any cache headers, to reduce redundant downloads.
        expires = _my_parsedate(pr["Date"]) + datetime.timedelta(hours=24)

    if not expires:
        return False

    return (now < expires)

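# Record the response headers we care about as collection properties
# for url, adding a "Date" value if the server did not send one.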
def _remember_headers(url, properties, headers, now):
    properties.setdefault(url, {})
    for h in ("Cache-Control", "Etag", "Expires", "Date", "Content-Length"):
        if h in headers:
            properties[url][h] = headers[h]
    if "Date" not in headers:
        properties[url]["Date"] = _my_formatdate(now)

@dataclasses.dataclass
class _Response:
    status_code: int
    headers: typing.Mapping[str, str]


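# Thin pycurl wrapper that issues HEAD and GET requests and streams the
# response body into a file in a new Arvados collection.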
class _Downloader(PyCurlHelper):
    # Wait up to 60 seconds for connection
    # How long it can be in "low bandwidth" state before it gives up
    # Low bandwidth threshold is 32 KiB/s
    DOWNLOADER_TIMEOUT = (60, 300, 32768)

    def __init__(self, apiclient):
        super(_Downloader, self).__init__(title_case_headers=True)
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.OPENSOCKETFUNCTION,
                         lambda *args, **kwargs: self._socket_open(*args, **kwargs))
        self.target = None
        self.apiclient = apiclient

    def head(self, url):
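        """Send a HEAD request for url and return a _Response with the
        status code and response headers."""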
        get_headers = {'Accept': 'application/octet-stream'}
        self._headers = {}

        self.curl.setopt(pycurl.URL, url.encode('utf-8'))
        self.curl.setopt(pycurl.HTTPHEADER, [
            '{}: {}'.format(k,v) for k,v in get_headers.items()])

        self.curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
        self.curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
        self.curl.setopt(pycurl.NOBODY, True)
        self.curl.setopt(pycurl.FOLLOWLOCATION, True)

        self._setcurltimeouts(self.curl, self.DOWNLOADER_TIMEOUT, True)

        try:
            self.curl.perform()
        except Exception as e:
            raise arvados.errors.HttpError(0, str(e))
        finally:
            if self._socket:
                self._socket.close()
                self._socket = None

        return _Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)

    def download(self, url, headers):
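        """Send a GET request for url, writing the response body into a
        file in a new collection, and return a _Response with the status
        code and response headers."""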
        self.count = 0
        self.start = time.time()
        self.checkpoint = self.start
        self._headers = {}
        self._first_chunk = True
        self.collection = None
        self.parsedurl = urllib.parse.urlparse(url)

        get_headers = {'Accept': 'application/octet-stream'}
        get_headers.update(headers)

        self.curl.setopt(pycurl.URL, url.encode('utf-8'))
        self.curl.setopt(pycurl.HTTPHEADER, [
            '{}: {}'.format(k,v) for k,v in get_headers.items()])

        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_write)
        self.curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)

        self.curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
        self.curl.setopt(pycurl.HTTPGET, True)
        self.curl.setopt(pycurl.FOLLOWLOCATION, True)

        self._setcurltimeouts(self.curl, self.DOWNLOADER_TIMEOUT, False)

        try:
            self.curl.perform()
        except Exception as e:
            raise arvados.errors.HttpError(0, str(e))
        finally:
            if self._socket:
                self._socket.close()
                self._socket = None

        return _Response(self.curl.getinfo(pycurl.RESPONSE_CODE), self._headers)

    def headers_received(self):
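        """Called when the first body chunk arrives: create the output
        collection, work out the file name and size from the response
        headers, and open the target file if the status was 200."""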
        self.collection = arvados.collection.Collection(api_client=self.apiclient)

        if "Content-Length" in self._headers:
            self.contentlength = int(self._headers["Content-Length"])
            logger.info("File size is %s bytes", self.contentlength)
        else:
            self.contentlength = None

        grp = None
        if self._headers.get("Content-Disposition"):
            grp = re.search(r'filename=("((\"|[^"])+)"|([^][()<>@,;:\"/?={} ]+))',
                            self._headers["Content-Disposition"])
        if grp:
            if grp.group(2):
                self.name = grp.group(2)
            else:
                self.name = grp.group(4)
        else:
            # No usable Content-Disposition filename, fall back to the
            # last path component of the URL.
            self.name = self.parsedurl.path.split("/")[-1]

        # Can't call curl.getinfo(pycurl.RESPONSE_CODE) until perform()
        # is done, but we need to know the status before that, so we
        # have to parse the status line ourselves.
        mt = re.match(r'^HTTP\/(\d(\.\d)?) ([1-5]\d\d) ([^\r\n\x00-\x08\x0b\x0c\x0e-\x1f\x7f]*)\r\n$', self._headers["x-status-line"])
        code = int(mt.group(3))

        if not self.name:
            logger.error("Cannot determine filename from URL or headers")
            return

        if code == 200:
            self.target = self.collection.open(self.name, "wb")

    def body_write(self, chunk):
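        """pycurl WRITEFUNCTION callback: write the chunk to the target
        file and log download progress roughly every 20 seconds."""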
        if self._first_chunk:
            self.headers_received()
            self._first_chunk = False

        self.count += len(chunk)

        if self.target is None:
            # "If this number is not equal to the size of the byte
            # string, this signifies an error and libcurl will abort
            # the request."
            return 0

        self.target.write(chunk)
        loopnow = time.time()
        if (loopnow - self.checkpoint) < 20:
            return

        bps = self.count / (loopnow - self.start)
        if self.contentlength is not None:
            logger.info("%2.1f%% complete, %6.2f MiB/s, %1.0f seconds left",
                        ((self.count * 100) / self.contentlength),
                        (bps / (1024.0*1024.0)),
                        ((self.contentlength-self.count) // bps))
        else:
            logger.info("%d downloaded, %6.2f MiB/s", self.count, (bps / (1024.0*1024.0)))
        self.checkpoint = loopnow


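# Send a HEAD request for url and compare the Etag with the one stored
# in the collection properties to decide whether the upstream content
# has changed since it was cached.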
def _changed(url, clean_url, properties, now, curldownloader):
    req = curldownloader.head(url)

    if req.status_code != 200:
        # Sometimes endpoints are misconfigured and will deny HEAD but
        # allow GET, so instead of failing here we'll try GET with
        # If-None-Match.
        return True

    # A previous version of this code stored "ETag"; it is now
    # normalized to "Etag", so check for both.
    etag = properties[url].get("Etag") or properties[url].get("ETag")

    if url in properties:
        del properties[url]
    _remember_headers(clean_url, properties, req.headers, now)

    if "Etag" in req.headers and etag == req.headers["Etag"]:
        # Didn't change
        return False

    return True

def _etag_quote(etag):
    # if it already has leading and trailing quotes, do nothing
    if etag[0] == '"' and etag[-1] == '"':
        return etag
    else:
        # Add quotes.
        return '"' + etag + '"'


def check_cached_url(api, project_uuid, url, etags,
                     utcnow=datetime.datetime.utcnow,
                     varying_url_params="",
                     prefer_cached_downloads=False):
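    """Check whether a collection cached from url already exists in Keep.

    Returns a 5-tuple of (portable_data_hash, file_name,
    collection_uuid, clean_url, now).  The first three elements are
    None if no usable cached copy was found; any candidate Etag values
    found along the way are added to the etags dict.
    """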

    logger.info("Checking Keep for %s", url)

    varying_params = [s.strip() for s in varying_url_params.split(",")]

    parsed = urllib.parse.urlparse(url)
    query = [q for q in urllib.parse.parse_qsl(parsed.query)
             if q[0] not in varying_params]

    clean_url = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params,
                                         urllib.parse.urlencode(query, safe="/"), parsed.fragment))

    r1 = api.collections().list(filters=[["properties", "exists", url]]).execute()

    if clean_url == url:
        items = r1["items"]
    else:
        r2 = api.collections().list(filters=[["properties", "exists", clean_url]]).execute()
        items = r1["items"] + r2["items"]

    now = utcnow()

    curldownloader = _Downloader(api)

    for item in items:
        properties = item["properties"]

        if clean_url in properties:
            cache_url = clean_url
        elif url in properties:
            cache_url = url
        else:
            raise Exception("Shouldn't happen, got an API result for %s that doesn't have the URL in properties" % item["uuid"])

        if prefer_cached_downloads or _fresh_cache(cache_url, properties, now):
            # HTTP caching rules say we should use the cache
            cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
            return (item["portable_data_hash"], next(iter(cr.keys())), item["uuid"], clean_url, now)

        if not _changed(cache_url, clean_url, properties, now, curldownloader):
            # Etag didn't change, same content, just update headers
            api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
            cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
            return (item["portable_data_hash"], next(iter(cr.keys())), item["uuid"], clean_url, now)

        for etagstr in ("Etag", "ETag"):
            if etagstr in properties[cache_url] and len(properties[cache_url][etagstr]) > 2:
                etags[properties[cache_url][etagstr]] = item

    logger.debug("Found ETag values %s", etags)

    return (None, None, None, clean_url, now)


def http_to_keep(api, project_uuid, url,
                 utcnow=datetime.datetime.utcnow, varying_url_params="",
                 prefer_cached_downloads=False):
    """Download a file over HTTP and upload it to Keep, with HTTP headers as metadata.

    Before downloading the URL, this checks whether the URL already
    exists in Keep and applies HTTP caching policy, together with the
    varying_url_params and prefer_cached_downloads flags, to decide
    whether to use the version in Keep or re-download it.
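
    Returns a 5-tuple of (portable_data_hash, file_name,
    collection_uuid, clean_url, now).

    A minimal usage sketch (the project UUID below is a placeholder):

        import arvados
        from arvados.http_to_keep import http_to_keep

        api = arvados.api('v1')
        pdh, name, uuid, clean_url, now = http_to_keep(
            api, "zzzzz-j7d0g-xxxxxxxxxxxxxxx",
            "https://example.com/data.tar.gz")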
    """

    etags = {}
    cache_result = check_cached_url(api, project_uuid, url, etags,
                                    utcnow, varying_url_params,
                                    prefer_cached_downloads)

    if cache_result[0] is not None:
        return cache_result

    clean_url = cache_result[3]
    now = cache_result[4]

    properties = {}
    headers = {}
    if etags:
        headers['If-None-Match'] = ', '.join([_etag_quote(k) for k,v in etags.items()])
    logger.debug("Sending GET request with headers %s", headers)

    logger.info("Beginning download of %s", url)

    curldownloader = _Downloader(api)

    req = curldownloader.download(url, headers)

    c = curldownloader.collection

    if req.status_code not in (200, 304):
        raise Exception("Failed to download '%s', got status %s" % (url, req.status_code))

    if curldownloader.target is not None:
        curldownloader.target.close()

    _remember_headers(clean_url, properties, req.headers, now)

    if req.status_code == 304 and "Etag" in req.headers and req.headers["Etag"] in etags:
        # Not modified; reuse the existing collection and just refresh
        # its properties.
        item = etags[req.headers["Etag"]]
        item["properties"].update(properties)
        api.collections().update(uuid=item["uuid"], body={"collection":{"properties": item["properties"]}}).execute()
        cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
        return (item["portable_data_hash"], next(iter(cr.keys())), item["uuid"], clean_url, now)

    logger.info("Download complete")

    collectionname = "Downloaded from %s" % urllib.parse.quote(clean_url, safe='')

    # Maximum name length, minus room for the timestamp that
    # ensure_unique_name may append.
    max_name_len = 254 - 28

    if len(collectionname) > max_name_len:
        over = len(collectionname) - max_name_len
        split = int(max_name_len/2)
        collectionname = collectionname[0:split] + "…" + collectionname[split+over:]

    c.save_new(name=collectionname, owner_uuid=project_uuid, ensure_unique_name=True)

    api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()

    return (c.portable_data_hash(), curldownloader.name, c.manifest_locator(), clean_url, now)