Merge branch '6702-gce-node-create-fix' closes #6702
authorPeter Amstutz <peter.amstutz@curoverse.com>
Wed, 3 Feb 2016 20:54:18 +0000 (15:54 -0500)
committerPeter Amstutz <peter.amstutz@curoverse.com>
Wed, 3 Feb 2016 20:54:18 +0000 (15:54 -0500)
14 files changed:
apps/workbench/app/models/arvados_api_client.rb
backports/python-llfuse/fpm-info.sh
doc/install/install-compute-node.html.textile.liquid
doc/install/install-crunch-dispatch.html.textile.liquid
doc/install/install-keepproxy.html.textile.liquid
doc/install/install-shell-server.html.textile.liquid
doc/install/install-workbench-app.html.textile.liquid
services/fuse/arvados_fuse/command.py
services/fuse/fpm-info.sh [new file with mode: 0644]
services/fuse/tests/integration_test.py
services/fuse/tests/mount_test_base.py
services/fuse/tests/test_exec.py [new file with mode: 0644]
services/login-sync/bin/arvados-login-sync
tools/crunchstat-summary/MANIFEST.in

index 4d549d194728eb00a9f3a2a01fd097d84955a16e..13d4a24c69cc5f7e687c47c0e95ed715ab9f5fa2 100644 (file)
@@ -89,7 +89,10 @@ class ArvadosApiClient
           @api_client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
         else
           # Use system CA certificates
-          @api_client.ssl_config.add_trust_ca('/etc/ssl/certs')
+          ["/etc/ssl/certs/ca-certificates.crt",
+           "/etc/pki/tls/certs/ca-bundle.crt"]
+            .select { |ca_path| File.readable?(ca_path) }
+            .each { |ca_path| @api_client.ssl_config.add_trust_ca(ca_path) }
         end
         if Rails.configuration.api_response_compression
           @api_client.transparent_gzip_decompression = true
index a7d9398701b7bc4c405027c26f5133fe7b23d383..327bc5e50f5793c6cb8a81a2f73117fac424e3be 100644 (file)
@@ -11,3 +11,6 @@ esac
 
 # FIXME: Remove this line after #6885 is done.
 fpm_args+=(--iteration 2)
+
+# FIXME: Remove once support for llfuse 0.42+ is in place
+fpm_args+=(-v 0.41.1)
index ccca0cc3b85a3ad0346b11fce2de989a52fe8a5c..9a64ac76d79532d643a895a7df5d2f9971dfd2fc 100644 (file)
@@ -54,7 +54,7 @@ Install SLURM following "the same process you used to install the Crunch dispatc
 
 h2. Copy configuration files from the dispatcher (API server)
 
-The @/etc/slurm-llnl/slurm.conf@ and @/etc/munge/munge.key@ files need to be identicaly across the dispatcher and all compute nodes. Copy the files you created in the "Install the Crunch dispatcher":install-crunch-dispatch.html step to this compute node.
+The @slurm.conf@ and @/etc/munge/munge.key@ files need to be identical across the dispatcher and all compute nodes. Copy the files you created in the "Install the Crunch dispatcher":install-crunch-dispatch.html step to this compute node.
 
 h2. Configure FUSE
 
index c75e7e8de1840031bc2ed864d791d60017874278..28a694be362372bf12d477ed46e0cbecc8fad3d6 100644 (file)
@@ -65,7 +65,7 @@ On Red Hat-based systems:
 </code></pre>
 </notextile>
 
-Now we need to give SLURM a configuration file in @/etc/slurm-llnl/slurm.conf@. Here's an example:
+Now we need to give SLURM a configuration file.  On Debian-based systems, this is installed at @/etc/slurm-llnl/slurm.conf@.  On Red Hat-based systems, this is installed at @/etc/slurm/slurm.conf@.  Here's an example @slurm.conf@:
 
 <notextile>
 <pre>
@@ -130,8 +130,8 @@ Whenever you change this file, you will need to update the copy _on every comput
 Each hostname in @slurm.conf@ must also resolve correctly on all SLURM worker nodes as well as the controller itself. Furthermore, the hostnames used in the configuration file must match the hostnames reported by @hostname@ or @hostname -s@ on the nodes themselves. This applies to the ControlMachine as well as the worker nodes.
 
 For example:
-* In @/etc/slurm-llnl/slurm.conf@ on control and worker nodes: @ControlMachine=uuid_prefix.your.domain@
-* In @/etc/slurm-llnl/slurm.conf@ on control and worker nodes: @NodeName=compute[0-255]@
+* In @slurm.conf@ on control and worker nodes: @ControlMachine=uuid_prefix.your.domain@
+* In @slurm.conf@ on control and worker nodes: @NodeName=compute[0-255]@
 * In @/etc/resolv.conf@ on control and worker nodes: @search uuid_prefix.your.domain@
 * On the control node: @hostname@ reports @uuid_prefix.your.domain@
 * On worker node 123: @hostname@ reports @compute123.uuid_prefix.your.domain@
index d427274e0bd0709cab6f5295abccd2425ae5e160..a6bb5d4bd9aeb3a6d2b276d20e1fc1e0505bf2c9 100644 (file)
@@ -79,7 +79,7 @@ upstream keepproxy {
 
 server {
   listen                <span class="userinput">[your public IP address]</span>:443 ssl;
-  server_name           keep.<span class="userinput">uuid_prefix</span>.your.domain
+  server_name           keep.<span class="userinput">uuid_prefix</span>.your.domain;
 
   proxy_connect_timeout 90s;
   proxy_read_timeout    300s;
index ec4c9d46cc8ee7c98a097baa3ddad49f55bd760c..dd5995ffdde442c85f665cb5feae14fd9b0fe879 100644 (file)
@@ -12,7 +12,21 @@ Please follow the "API token guide":../user/reference/api-tokens.html to get API
 
 h2. Install the Ruby SDK and utilities
 
-If you're using RVM:
+First, install the curl development libraries necessary to build the Arvados Ruby SDK.  On Debian-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo apt-get install libcurl4-openssl-dev</span>
+</code></pre>
+</notextile>
+
+On Red Hat-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo yum install libcurl-devel</span>
+</code></pre>
+</notextile>
+
+Next, install the arvados-cli Ruby gem.  If you're using RVM:
 
 <notextile>
 <pre><code>~$ <span class="userinput">sudo /usr/local/rvm/bin/rvm-exec default gem install arvados-cli</span>
index c27e8c523b6862f0e53902e0d9426190395ee0df..5a60ca5484e49fe502617accf847988d9d815d81 100644 (file)
@@ -87,7 +87,7 @@ For best performance, we recommend you use Nginx as your Web server front-end, w
 <li>If you're deploying on an older Red Hat-based distribution and installed Pythyon 2.7 from Software Collections, configure Nginx to use it:
 
 <pre><code>~$ <span class="userinput">sudo usermod --shell /bin/bash nginx</span>
-~$ <span class="userinput">sudo -u nginx sh -c 'echo "[[ -z \$PS1 && -e /opt/rh/python27/enable ]] && source /opt/rh/python27/enable" >>~/.bash_profile'</span>
+~$ <span class="userinput">sudo -u nginx sh -c 'echo "[[ -z \$PS1 ]] && source scl_source enable python27" >>~/.bash_profile'</span>
 </code></pre>
 
 </li>
index 71623a5f3d07364bb4de94c42023af3b5658fc55..6d5300444ad58ee2cff24e0124f807d3ad6611c3 100644 (file)
@@ -82,6 +82,10 @@ class ArgumentParser(argparse.ArgumentParser):
 
         self.add_argument('--crunchstat-interval', type=float, help="Write stats to stderr every N seconds (default disabled)", default=0)
 
+        self.add_argument('--unmount-timeout',
+                          type=float, default=2.0,
+                          help="Time to wait for graceful shutdown after --exec program exits and filesystem is unmounted")
+
         self.add_argument('--exec', type=str, nargs=argparse.REMAINDER,
                             dest="exec_args", metavar=('command', 'args', '...', '--'),
                             help="""Mount, run a command, then unmount and exit""")
@@ -108,13 +112,19 @@ class Mount(object):
         llfuse.init(self.operations, self.args.mountpoint, self._fuse_options())
         if self.args.mode != 'by_pdh':
             self.operations.listen_for_events()
-        t = threading.Thread(None, lambda: llfuse.main())
-        t.start()
+        self.llfuse_thread = threading.Thread(None, lambda: self._llfuse_main())
+        self.llfuse_thread.daemon = True
+        self.llfuse_thread.start()
         self.operations.initlock.wait()
 
     def __exit__(self, exc_type, exc_value, traceback):
         subprocess.call(["fusermount", "-u", "-z", self.args.mountpoint])
-        self.operations.destroy()
+        self.llfuse_thread.join(timeout=self.args.unmount_timeout)
+        if self.llfuse_thread.is_alive():
+            self.logger.warning("Mount.__exit__:"
+                                " llfuse thread still alive %fs after umount"
+                                " -- abandoning and exiting anyway",
+                                self.args.unmount_timeout)
 
     def run(self):
         if self.args.exec_args:
@@ -277,63 +287,56 @@ From here, the following directories are available:
 '''.format(api_host, user_email)
 
     def _run_exec(self):
-        # Initialize the fuse connection
-        llfuse.init(self.operations, self.args.mountpoint, self._fuse_options())
-
-        # Subscribe to change events from API server
-        if self.args.mode != 'by_pdh':
-            self.operations.listen_for_events()
-
-        t = threading.Thread(None, lambda: llfuse.main())
-        t.start()
-
-        # wait until the driver is finished initializing
-        self.operations.initlock.wait()
-
         rc = 255
-        try:
-            sp = subprocess.Popen(self.args.exec_args, shell=False)
-
-            # forward signals to the process.
-            signal.signal(signal.SIGINT, lambda signum, frame: sp.send_signal(signum))
-            signal.signal(signal.SIGTERM, lambda signum, frame: sp.send_signal(signum))
-            signal.signal(signal.SIGQUIT, lambda signum, frame: sp.send_signal(signum))
-
-            # wait for process to complete.
-            rc = sp.wait()
-
-            # restore default signal handlers.
-            signal.signal(signal.SIGINT, signal.SIG_DFL)
-            signal.signal(signal.SIGTERM, signal.SIG_DFL)
-            signal.signal(signal.SIGQUIT, signal.SIG_DFL)
-        except Exception as e:
-            self.logger.exception(
-                'arv-mount: exception during exec %s', self.args.exec_args)
+        with self:
             try:
-                rc = e.errno
-            except AttributeError:
-                pass
-        finally:
-            subprocess.call(["fusermount", "-u", "-z", self.args.mountpoint])
-            self.operations.destroy()
+                sp = subprocess.Popen(self.args.exec_args, shell=False)
+
+                # forward signals to the process.
+                signal.signal(signal.SIGINT, lambda signum, frame: sp.send_signal(signum))
+                signal.signal(signal.SIGTERM, lambda signum, frame: sp.send_signal(signum))
+                signal.signal(signal.SIGQUIT, lambda signum, frame: sp.send_signal(signum))
+
+                # wait for process to complete.
+                rc = sp.wait()
+
+                # restore default signal handlers.
+                signal.signal(signal.SIGINT, signal.SIG_DFL)
+                signal.signal(signal.SIGTERM, signal.SIG_DFL)
+                signal.signal(signal.SIGQUIT, signal.SIG_DFL)
+            except Exception as e:
+                self.logger.exception(
+                    'arv-mount: exception during exec %s', self.args.exec_args)
+                try:
+                    rc = e.errno
+                except AttributeError:
+                    pass
         exit(rc)
 
     def _run_standalone(self):
         try:
             llfuse.init(self.operations, self.args.mountpoint, self._fuse_options())
 
-            if not (self.args.exec_args or self.args.foreground):
-                self.daemon_ctx = daemon.DaemonContext(working_directory=os.path.dirname(self.args.mountpoint),
-                                                       files_preserve=range(3, resource.getrlimit(resource.RLIMIT_NOFILE)[1]))
+            if not self.args.foreground:
+                self.daemon_ctx = daemon.DaemonContext(
+                    working_directory=os.path.dirname(self.args.mountpoint),
+                    files_preserve=range(
+                        3, resource.getrlimit(resource.RLIMIT_NOFILE)[1]))
                 self.daemon_ctx.open()
 
             # Subscribe to change events from API server
             self.operations.listen_for_events()
 
-            llfuse.main()
+            self._llfuse_main()
         except Exception as e:
             self.logger.exception('arv-mount: exception during mount: %s', e)
             exit(getattr(e, 'errno', 1))
-        finally:
-            self.operations.destroy()
         exit(0)
+
+    def _llfuse_main(self):
+        try:
+            llfuse.main()
+        except:
+            llfuse.close(unmount=False)
+            raise
+        llfuse.close()
diff --git a/services/fuse/fpm-info.sh b/services/fuse/fpm-info.sh
new file mode 100644 (file)
index 0000000..57671cd
--- /dev/null
@@ -0,0 +1 @@
+fpm_depends+=(fuse)
index faa4a55065945d907109b8994470ccccb2c5904d..5a45bfc103f34df2ae928ed5bcd305d100c408fa 100644 (file)
@@ -62,7 +62,9 @@ class IntegrationTest(unittest.TestCase):
             def wrapper(self, *args, **kwargs):
                 with arvados_fuse.command.Mount(
                         arvados_fuse.command.ArgumentParser().parse_args(
-                            argv + ['--foreground', self.mnt])):
+                            argv + ['--foreground',
+                                    '--unmount-timeout=0.1',
+                                    self.mnt])):
                     return func(self, *args, **kwargs)
             return wrapper
         return decorator
index 44ec1996d2121698b478e2ba25a6ae95d0bef4c7..c79daf80f54156b6e304839c01a66221217ae3c9 100644 (file)
@@ -37,6 +37,16 @@ class MountTestBase(unittest.TestCase):
         run_test_server.authorize_with("admin")
         self.api = api if api else arvados.safeapi.ThreadSafeApiCache(arvados.config.settings())
 
+    # This is a copy of Mount's method.  TODO: Refactor MountTestBase
+    # to use a Mount instead of copying its code.
+    def _llfuse_main(self):
+        try:
+            llfuse.main()
+        except:
+            llfuse.close(unmount=False)
+            raise
+        llfuse.close()
+
     def make_mount(self, root_class, **root_kwargs):
         self.operations = fuse.Operations(
             os.getuid(), os.getgid(),
@@ -45,7 +55,9 @@ class MountTestBase(unittest.TestCase):
         self.operations.inodes.add_entry(root_class(
             llfuse.ROOT_INODE, self.operations.inodes, self.api, 0, **root_kwargs))
         llfuse.init(self.operations, self.mounttmp, [])
-        threading.Thread(None, llfuse.main).start()
+        self.llfuse_thread = threading.Thread(None, lambda: self._llfuse_main())
+        self.llfuse_thread.daemon = True
+        self.llfuse_thread.start()
         # wait until the driver is finished initializing
         self.operations.initlock.wait()
         return self.operations.inodes[llfuse.ROOT_INODE]
@@ -55,17 +67,12 @@ class MountTestBase(unittest.TestCase):
         self.pool.join()
         del self.pool
 
-        # llfuse.close is buggy, so use fusermount instead.
-        #llfuse.close(unmount=True)
-
-        count = 0
-        success = 1
-        while (count < 9 and success != 0):
-          success = subprocess.call(["fusermount", "-u", self.mounttmp])
-          time.sleep(0.1)
-          count += 1
-
-        self.operations.destroy()
+        subprocess.call(["fusermount", "-u", "-z", self.mounttmp])
+        self.llfuse_thread.join(timeout=1)
+        if self.llfuse_thread.is_alive():
+            logger.warning("MountTestBase.tearDown():"
+                           " llfuse thread still alive 1s after umount"
+                           " -- abandoning and exiting anyway")
 
         os.rmdir(self.mounttmp)
         if self.keeptmp:
diff --git a/services/fuse/tests/test_exec.py b/services/fuse/tests/test_exec.py
new file mode 100644 (file)
index 0000000..66013a4
--- /dev/null
@@ -0,0 +1,60 @@
+import arvados_fuse.command
+import json
+import multiprocessing
+import os
+import run_test_server
+import tempfile
+import unittest
+
+try:
+    from shlex import quote
+except:
+    from pipes import quote
+
+def try_exec(mnt, cmd):
+    try:
+        arvados_fuse.command.Mount(
+            arvados_fuse.command.ArgumentParser().parse_args([
+                '--read-write',
+                '--mount-tmp=zzz',
+                '--unmount-timeout=0.1',
+                mnt,
+                '--exec'] + cmd)).run()
+    except SystemExit:
+        pass
+    else:
+        raise AssertionError('should have exited')
+
+
+class ExecMode(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        run_test_server.run()
+        run_test_server.run_keep(enforce_permissions=True, num_servers=2)
+        run_test_server.authorize_with('active')
+
+    @classmethod
+    def tearDownClass(cls):
+        run_test_server.stop_keep(num_servers=2)
+
+    def setUp(self):
+        self.mnt = tempfile.mkdtemp()
+        _, self.okfile = tempfile.mkstemp()
+        self.pool = multiprocessing.Pool(1)
+
+    def tearDown(self):
+        self.pool.terminate()
+        self.pool.join()
+        os.rmdir(self.mnt)
+        os.unlink(self.okfile)
+
+    def test_exec(self):
+        self.pool.apply(try_exec, (self.mnt, [
+            'sh', '-c',
+            'echo -n foo >{}; cp {} {}'.format(
+                quote(os.path.join(self.mnt, 'zzz', 'foo.txt')),
+                quote(os.path.join(self.mnt, 'zzz', '.arvados#collection')),
+                quote(os.path.join(self.okfile)))]))
+        self.assertRegexpMatches(
+            json.load(open(self.okfile))['manifest_text'],
+            r' 0:3:foo.txt\n')
index c92fc9b00dd0cc723e4fd64bd2b001daa8bce60c..e1b8c484f0413cb8ff6bfe3ac0be77aebbcd2aa7 100755 (executable)
@@ -68,7 +68,6 @@ begin
   logins.each do |l|
     next if seen[l[:username]]
     seen[l[:username]] = true if not seen.has_key?(l[:username])
-    @homedir = "/home/#{l[:username]}"
 
     unless uids[l[:username]]
       STDERR.puts "Creating account #{l[:username]}"
@@ -85,6 +84,7 @@ begin
                          out: devnull)
     end
     # Create .ssh directory if necessary
+    @homedir = Etc.getpwnam(l[:username]).dir
     userdotssh = File.join(@homedir, ".ssh")
     Dir.mkdir(userdotssh) if !File.exists?(userdotssh)
     @key = "#######################################################################################
@@ -109,4 +109,3 @@ rescue Exception => bang
   puts bang.backtrace.join("\n")
   exit 1
 end
-
index 4db8152d090ecd7b43f8be48fd2db363d03a2483..325ee59e854dfe36a83b406297d37047c7a90718 100644 (file)
@@ -1 +1,2 @@
 include agpl-3.0.txt
+include crunchstat_summary/chartjs.js