5787: Write files that are pointed to by symlinks that are outside of Keep.
[arvados.git] / crunch_scripts / crunchutil / vwd.py
index 4a975468f0f0e886f76d1636a2716649d023a15b..5b9edf5e8395a0343a871c17a31aae69a585ba25 100644 (file)
@@ -1,8 +1,8 @@
 import arvados
 import os
-import robust_put
 import stat
 import arvados.commands.run
+import logging
 
 # Implements "Virtual Working Directory"
 # Provides a way of emulating a shared writable directory in Keep based
@@ -34,19 +34,19 @@ def checkout(source_collection, target_dir, keepmount=None):
             os.symlink(os.path.join(root, f), os.path.join(target_dir, rel, f))
 
 def checkin(target_dir):
-    """Write files in the target_dir to Keep.
+    """Write files in `target_dir` to Keep.
 
-    Symlinks into the keep mount in the output dir are efficiently added to the
-    collection with no data copying.
+    Regular files or symlinks to files outside the keep mount are written to
+    Keep as normal files (Keep does not support symlinks).
+
+    Symlinks to files in the keep mount will result in files in the new
+    collection which reference existing Keep blocks, no data copying necessay.
 
     Returns a new Collection object, with data flushed but the collection record
     not saved to the API.
 
     """
 
-    # delete symlinks, commit directory, merge manifests and return combined
-    # collection.
-
     outputcollection = arvados.collection.Collection(num_retries=5)
 
     if target_dir[-1:] != '/':
@@ -54,30 +54,42 @@ def checkin(target_dir):
 
     collections = {}
 
+    logger = logging.getLogger("arvados")
+
     for root, dirs, files in os.walk(target_dir):
         for f in files:
-            s = os.lstat(os.path.join(root, f))
-            if stat.S_ISLNK(s.st_mode):
-                # 1. check if it is a link into a collection
-                real = os.path.split(os.path.realpath(os.path.join(root, f)))
-                (pdh, branch) = arvados.commands.run.is_in_collection(real[0], real[1])
-                if pdh is not None:
-                    # 2. load collection
-                    if pdh not in collections:
-                        collections[pdh] = arvados.collection.CollectionReader(pdh,
-                                                                               api_client=outputcollection._my_api(),
-                                                                               keep_client=outputcollection._my_keep(),
-                                                                               num_retries=5)
-                    # 3. copy arvfile to new collection
-                    outputcollection.copy(branch, os.path.join(root[len(target_dir):], f), source_collection=collections[pdh])
-
-            elif stat.S_ISREG(s.st_mode):
-                reldir = root[len(target_dir):]
-                with outputcollection.open(os.path.join(reldir, f), "wb") as writer:
-                    with open(os.path.join(root, f), "rb") as reader:
-                        dat = reader.read(64*1024)
-                        while dat:
-                            writer.write(dat)
+            try:
+                s = os.lstat(os.path.join(root, f))
+
+                writeIt = False
+
+                if stat.S_ISREG(s.st_mode):
+                    writeIt = True
+                elif stat.S_ISLNK(s.st_mode):
+                    # 1. check if it is a link into a collection
+                    real = os.path.split(os.path.realpath(os.path.join(root, f)))
+                    (pdh, branch) = arvados.commands.run.is_in_collection(real[0], real[1])
+                    if pdh is not None:
+                        # 2. load collection
+                        if pdh not in collections:
+                            collections[pdh] = arvados.collection.CollectionReader(pdh,
+                                                                                   api_client=outputcollection._my_api(),
+                                                                                   keep_client=outputcollection._my_keep(),
+                                                                                   num_retries=5)
+                        # 3. copy arvfile to new collection
+                        outputcollection.copy(branch, os.path.join(root[len(target_dir):], f), source_collection=collections[pdh])
+                    else:
+                        writeIt = True
+
+                if writeIt:
+                    reldir = root[len(target_dir):]
+                    with outputcollection.open(os.path.join(reldir, f), "wb") as writer:
+                        with open(os.path.join(root, f), "rb") as reader:
                             dat = reader.read(64*1024)
+                            while dat:
+                                writer.write(dat)
+                                dat = reader.read(64*1024)
+            except (IOError, OSError) as e:
+                logger.error(e)
 
     return outputcollection