20937: Improve error handling 20937-arv-copy-http
authorPeter Amstutz <peter.amstutz@curii.com>
Tue, 10 Oct 2023 17:49:15 +0000 (13:49 -0400)
committerPeter Amstutz <peter.amstutz@curii.com>
Tue, 10 Oct 2023 17:49:15 +0000 (13:49 -0400)
Also clean up docs a bit

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

doc/user/topics/arv-copy.html.textile.liquid
sdk/python/arvados/commands/arv_copy.py
sdk/python/arvados/http_to_keep.py

index 7bb4375bae85f610688476162a2f42ab0ebdaa71..86b68f2854a4fee7eb73e0006d2dd3516521fe6e 100644 (file)
@@ -92,7 +92,7 @@ h3. How to copy a project
 We will use the uuid @jutro-j7d0g-xj19djofle3aryq@ as an example project.
 
 <notextile>
-<pre><code>~$ <span class="userinput">peteramstutz@shell:~$ arv-copy --project-uuid pirca-j7d0g-lr8sq3tx3ovn68k jutro-j7d0g-xj19djofle3aryq
+<pre><code>~$ <span class="userinput">~$ arv-copy --project-uuid pirca-j7d0g-lr8sq3tx3ovn68k jutro-j7d0g-xj19djofle3aryq
 2021-09-08 21:29:32 arvados.arv-copy[6377] INFO:
 2021-09-08 21:29:32 arvados.arv-copy[6377] INFO: Success: created copy with uuid pirca-j7d0g-ig9gvu5piznducp
 </code></pre>
@@ -107,20 +107,13 @@ h3. Importing HTTP resources to Keep
 You can also use @arv-copy@ to copy the contents of a HTTP URL into Keep.  When you do this, Arvados keeps track of the original URL the resource came from.  This allows you to refer to the resource by its original URL in Workflow inputs, but actually read from the local copy in Keep.
 
 <notextile>
-<pre><code>~$ <span class="userinput">peteramstutz@shell:~$ arv-copy --project-uuid tordo-j7d0g-lr8sq3tx3ovn68k https://example.com/index.html
+<pre><code>~$ <span class="userinput">~$ arv-copy --project-uuid tordo-j7d0g-lr8sq3tx3ovn68k https://example.com/index.html
 tordo-4zz18-dhpb6y9km2byb94
 2023-10-06 10:15:36 arvados.arv-copy[374147] INFO: Success: created copy with uuid tordo-4zz18-dhpb6y9km2byb94
 </code></pre>
 </notextile>
 
-In addition, if you provide a different cluster in @--src@, then @arv-copy@ will search the other cluster for a collection associated with that URL, and if found, copy from that collection instead of downloading from the original URL.
-
-<notextile>
-<pre><code>~$ <span class="userinput">peteramstutz@shell:~$ arv-copy --src pirca --project-uuid tordo-j7d0g-lr8sq3tx3ovn68k https://example.com/index.html
-tordo-4zz18-dhpb6y9km2byb94
-2023-10-06 10:15:36 arvados.arv-copy[374147] INFO: Success: created copy with uuid tordo-4zz18-dhpb6y9km2byb94
-</code></pre>
-</notextile>
+In addition, when importing from HTTP URLs, you may provide a different cluster than the destination in @--src@. This tells @arv-copy@ to search the other cluster for a collection associated with that URL, and if found, copy the collection from that cluster instead of downloading from the original URL.
 
 The following @arv-copy@ command line options affect the behavior of HTTP import.
 
index d9b8bd47dc490a16f315fdf5d830d75e1cfc5007..494f2e29b6b574b21df2a113f729bfc647b95210 100755 (executable)
@@ -146,21 +146,29 @@ def main():
 
     # Identify the kind of object we have been given, and begin copying.
     t = uuid_type(src_arv, args.object_uuid)
-    if t == 'Collection':
-        set_src_owner_uuid(src_arv.collections(), args.object_uuid, args)
-        result = copy_collection(args.object_uuid,
-                                 src_arv, dst_arv,
-                                 args)
-    elif t == 'Workflow':
-        set_src_owner_uuid(src_arv.workflows(), args.object_uuid, args)
-        result = copy_workflow(args.object_uuid, src_arv, dst_arv, args)
-    elif t == 'Group':
-        set_src_owner_uuid(src_arv.groups(), args.object_uuid, args)
-        result = copy_project(args.object_uuid, src_arv, dst_arv, args.project_uuid, args)
-    elif t == 'httpURL':
-        result = copy_from_http(args.object_uuid, src_arv, dst_arv, args)
-    else:
-        abort("cannot copy object {} of type {}".format(args.object_uuid, t))
+
+    try:
+        if t == 'Collection':
+            set_src_owner_uuid(src_arv.collections(), args.object_uuid, args)
+            result = copy_collection(args.object_uuid,
+                                     src_arv, dst_arv,
+                                     args)
+        elif t == 'Workflow':
+            set_src_owner_uuid(src_arv.workflows(), args.object_uuid, args)
+            result = copy_workflow(args.object_uuid, src_arv, dst_arv, args)
+        elif t == 'Group':
+            set_src_owner_uuid(src_arv.groups(), args.object_uuid, args)
+            result = copy_project(args.object_uuid, src_arv, dst_arv, args.project_uuid, args)
+        elif t == 'httpURL':
+            result = copy_from_http(args.object_uuid, src_arv, dst_arv, args)
+        else:
+            abort("cannot copy object {} of type {}".format(args.object_uuid, t))
+    except Exception as e:
+        if args.verbose:
+            logger.exception("%s", e)
+        else:
+            logger.error("%s", e)
+        exit(1)
 
     # Clean up any outstanding temp git repositories.
     for d in listvalues(local_repo_dir):
@@ -570,7 +578,7 @@ def copy_collection(obj_uuid, src, dst, args):
     dst_keep = arvados.keep.KeepClient(api_client=dst, num_retries=args.retries)
     dst_manifest = io.StringIO()
     dst_locators = {}
-    bytes_written = [0]
+    bytes_written = 0
     bytes_expected = total_collection_size(manifest)
     if args.progress:
         progress_writer = ProgressWriter(human_progress)
@@ -625,12 +633,14 @@ def copy_collection(obj_uuid, src, dst, args):
                     # Drain the 'get' queue so we end early
                     while True:
                         get_queue.get(False)
+                        get_queue.task_done()
                 except queue.Empty:
                     pass
             finally:
                 get_queue.task_done()
 
     def put_thread():
+        nonlocal bytes_written
         while True:
             item = put_queue.get()
             if item is None:
@@ -651,15 +661,16 @@ def copy_collection(obj_uuid, src, dst, args):
                 dst_locator = dst_keep.put(data, classes=(args.storage_classes or []))
                 with lock:
                     dst_locators[blockhash] = dst_locator
-                    bytes_written[0] += loc.size
+                    bytes_written += loc.size
                     if progress_writer:
-                        progress_writer.report(obj_uuid, bytes_written[0], bytes_expected)
+                        progress_writer.report(obj_uuid, bytes_written, bytes_expected)
             except e:
                 logger.error("Error putting block %s (%s bytes): %s", blockhash, loc.size, e)
                 try:
                     # Drain the 'get' queue so we end early
                     while True:
                         get_queue.get(False)
+                        get_queue.task_done()
                 except queue.Empty:
                     pass
                 transfer_error.append(e)
@@ -711,7 +722,7 @@ def copy_collection(obj_uuid, src, dst, args):
         dst_manifest.write("\n")
 
     if progress_writer:
-        progress_writer.report(obj_uuid, bytes_written[0], bytes_expected)
+        progress_writer.report(obj_uuid, bytes_written, bytes_expected)
         progress_writer.finish()
 
     # Copy the manifest and save the collection.
index b37ab5910914b67454038de18682b1bc41b5863f..1da8cf4946c652bfca3208ca632821144ccab1f5 100644 (file)
@@ -182,6 +182,10 @@ class _Downloader(PyCurlHelper):
         mt = re.match(r'^HTTP\/(\d(\.\d)?) ([1-5]\d\d) ([^\r\n\x00-\x08\x0b\x0c\x0e-\x1f\x7f]*)\r\n$', self._headers["x-status-line"])
         code = int(mt.group(3))
 
+        if not self.name:
+            logger.error("Cannot determine filename from URL or headers")
+            return
+
         if code == 200:
             self.target = self.collection.open(self.name, "wb")
 
@@ -193,7 +197,10 @@ class _Downloader(PyCurlHelper):
         self.count += len(chunk)
 
         if self.target is None:
-            return
+            # "If this number is not equal to the size of the byte
+            # string, this signifies an error and libcurl will abort
+            # the request."
+            return 0
 
         self.target.write(chunk)
         loopnow = time.time()