Merge branch '14745-azure-cloud-driver-fixups'
authorEric Biagiotti <ebiagiotti@veritasgenetics.com>
Thu, 14 Feb 2019 19:56:47 +0000 (14:56 -0500)
committerEric Biagiotti <ebiagiotti@veritasgenetics.com>
Thu, 14 Feb 2019 19:56:47 +0000 (14:56 -0500)
refs #14745

Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti <eric.biagiotti@gmail.com>

15 files changed:
build/run-build-packages.sh
cmd/arvados-server/arvados-dispatch-cloud.service [moved from cmd/arvados-server/crunch-dispatch-cloud.service with 94% similarity]
lib/dispatchcloud/container/queue.go
lib/dispatchcloud/dispatcher.go
lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/scheduler/fix_stale_locks.go
sdk/cli/arvados-cli.gemspec
sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/arvados_cwl/http.py
sdk/cwl/setup.py
sdk/cwl/tests/test_http.py
sdk/cwl/tests/test_submit.py
sdk/go/arvados/config.go
services/api/app/models/api_client_authorization.rb
services/login-sync/arvados-login-sync.gemspec

index b0199aed9767700a9e891e529a1b6fdc84415c25..ef1df03d5a97498e0ba15b02dd16cb012a8ba3c8 100755 (executable)
@@ -303,7 +303,7 @@ package_go_binary cmd/arvados-server arvados-server \
     "Arvados server daemons"
 package_go_binary cmd/arvados-server arvados-controller \
     "Arvados cluster controller daemon"
-package_go_binary cmd/arvados-server crunch-dispatch-cloud \
+package_go_binary cmd/arvados-server arvados-dispatch-cloud \
     "Arvados cluster cloud dispatch"
 package_go_binary sdk/go/crunchrunner crunchrunner \
     "Crunchrunner executes a command inside a container and uploads the output"
similarity index 94%
rename from cmd/arvados-server/crunch-dispatch-cloud.service
rename to cmd/arvados-server/arvados-dispatch-cloud.service
index f8d71c9753fab219bcb24b17dfcb8a9885f4738f..5ea5d45e79c4940ee2ef83230988f4641c57c2b8 100644 (file)
@@ -17,7 +17,7 @@ StartLimitIntervalSec=0
 [Service]
 Type=notify
 EnvironmentFile=-/etc/arvados/environment
-ExecStart=/usr/bin/crunch-dispatch-cloud
+ExecStart=/usr/bin/arvados-dispatch-cloud
 Restart=always
 RestartSec=1
 
index 847fe9e27c03e3794f068bd59c7c13031798fb71..297782c35b9a972668fd7623d251e3411b26389b 100644 (file)
@@ -223,8 +223,25 @@ func (cq *Queue) addEnt(uuid string, ctr arvados.Container) {
                // error: it wouldn't help to try again, or to leave
                // it for a different dispatcher process to attempt.
                errorString := err.Error()
-               cq.logger.WithField("ContainerUUID", ctr.UUID).Warn("cancel container with no suitable instance type")
+               logger := cq.logger.WithField("ContainerUUID", ctr.UUID)
+               logger.WithError(err).Warn("cancel container with no suitable instance type")
                go func() {
+                       if ctr.State == arvados.ContainerStateQueued {
+                               // Can't set runtime error without
+                               // locking first. If Lock() is
+                               // successful, it will call addEnt()
+                               // again itself, and we'll fall
+                               // through to the
+                               // setRuntimeError/Cancel code below.
+                               err := cq.Lock(ctr.UUID)
+                               if err != nil {
+                                       logger.WithError(err).Warn("lock failed")
+                                       // ...and try again on the
+                                       // next Update, if the problem
+                                       // still exists.
+                               }
+                               return
+                       }
                        var err error
                        defer func() {
                                if err == nil {
@@ -239,14 +256,8 @@ func (cq *Queue) addEnt(uuid string, ctr arvados.Container) {
                                if latest.State == arvados.ContainerStateCancelled {
                                        return
                                }
-                               cq.logger.WithField("ContainerUUID", ctr.UUID).WithError(err).Warn("error while trying to cancel unsatisfiable container")
+                               logger.WithError(err).Warn("error while trying to cancel unsatisfiable container")
                        }()
-                       if ctr.State == arvados.ContainerStateQueued {
-                               err = cq.Lock(ctr.UUID)
-                               if err != nil {
-                                       return
-                               }
-                       }
                        err = cq.setRuntimeError(ctr.UUID, errorString)
                        if err != nil {
                                return
index 3e3f5ee1993f0f0dfb98e4c68e02e80b1ee6f888..2d73afcd2624ed1e55041f25eda9e91fe0f8f8ed 100644 (file)
@@ -118,7 +118,7 @@ func (disp *dispatcher) initialize() {
        disp.stopped = make(chan struct{})
        disp.logger = logrus.StandardLogger()
 
-       if key, err := ssh.ParsePrivateKey(disp.Cluster.Dispatch.PrivateKey); err != nil {
+       if key, err := ssh.ParsePrivateKey([]byte(disp.Cluster.Dispatch.PrivateKey)); err != nil {
                disp.logger.Fatalf("error parsing configured Dispatch.PrivateKey: %s", err)
        } else {
                disp.sshKey = key
index 674cacd5663676ead14e72c9543e8bfc12dadc51..0558d79f1a5b95737ab61958e467a7ee73177df9 100644 (file)
@@ -51,7 +51,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
                        TimeoutShutdown: arvados.Duration(5 * time.Millisecond),
                },
                Dispatch: arvados.Dispatch{
-                       PrivateKey:         dispatchprivraw,
+                       PrivateKey:         string(dispatchprivraw),
                        PollInterval:       arvados.Duration(5 * time.Millisecond),
                        ProbeInterval:      arvados.Duration(5 * time.Millisecond),
                        StaleLockTimeout:   arvados.Duration(5 * time.Millisecond),
@@ -156,7 +156,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
                c.Fatalf("timed out; still waiting for %d containers: %q", len(waiting), waiting)
        }
 
-       deadline := time.Now().Add(time.Second)
+       deadline := time.Now().Add(5 * time.Second)
        for range time.NewTicker(10 * time.Millisecond).C {
                insts, err := s.stubDriver.InstanceSets()[0].Instances(nil)
                c.Check(err, check.IsNil)
index 264f9e4ec6bbc3747401858a37c3f70b259116c1..4bd27021c675d1c8ce40753d131d0631041ea59c 100644 (file)
@@ -19,24 +19,15 @@ import (
 func (sch *Scheduler) fixStaleLocks() {
        wp := sch.pool.Subscribe()
        defer sch.pool.Unsubscribe(wp)
+
+       var stale []string
        timeout := time.NewTimer(sch.staleLockTimeout)
 waiting:
        for {
-               unlock := false
-               select {
-               case <-wp:
-                       // If all workers have been contacted, unlock
-                       // containers that aren't claimed by any
-                       // worker.
-                       unlock = sch.pool.CountWorkers()[worker.StateUnknown] == 0
-               case <-timeout.C:
-                       // Give up and unlock the containers, even
-                       // though they might be working.
-                       unlock = true
-               }
-
                running := sch.pool.Running()
                qEntries, _ := sch.queue.Entries()
+
+               stale = nil
                for uuid, ent := range qEntries {
                        if ent.Container.State != arvados.ContainerStateLocked {
                                continue
@@ -44,14 +35,30 @@ waiting:
                        if _, running := running[uuid]; running {
                                continue
                        }
-                       if !unlock {
-                               continue waiting
-                       }
-                       err := sch.queue.Unlock(uuid)
-                       if err != nil {
-                               sch.logger.Warnf("Unlock %s: %s", uuid, err)
+                       stale = append(stale, uuid)
+               }
+               if len(stale) == 0 {
+                       return
+               }
+
+               select {
+               case <-wp:
+                       // Stop waiting if all workers have been
+                       // contacted.
+                       if sch.pool.CountWorkers()[worker.StateUnknown] == 0 {
+                               break waiting
                        }
+               case <-timeout.C:
+                       // Give up.
+                       break waiting
+               }
+
+       }
+
+       for _, uuid := range stale {
+               err := sch.queue.Unlock(uuid)
+               if err != nil {
+                       sch.logger.Warnf("Unlock %s: %s", uuid, err)
                }
-               return
        }
 }
index 80abc9c497f2da9f0f72ebd022e47ae4fb07a14e..c7e20e2a72947d2e74f147e6a6c0fd68d14254f8 100644 (file)
@@ -30,7 +30,7 @@ Gem::Specification.new do |s|
   s.executables << "arv-crunch-job"
   s.executables << "arv-tag"
   s.required_ruby_version = '>= 2.1.0'
-  s.add_runtime_dependency 'arvados', '~> 1.2.0', '>= 1.2.0'
+  s.add_runtime_dependency 'arvados', '~> 1.3.0', '>= 1.3.0'
   # Our google-api-client dependency used to be < 0.9, but that could be
   # satisfied by the buggy 0.9.pre*.  https://dev.arvados.org/issues/9213
   s.add_runtime_dependency 'cure-google-api-client', '~> 0.6', '>= 0.6.3', '<0.8.9'
index e921b26dfe78279c1ee22fb6a19c5e93569ed542..af7c02a8f30010bfe85e51a6928e63a5a617d37e 100644 (file)
@@ -499,6 +499,9 @@ class RunnerContainer(Runner):
             extra_submit_params["cluster_id"] = runtimeContext.submit_runner_cluster
 
         if runtimeContext.submit_request_uuid:
+            if "cluster_id" in extra_submit_params:
+                # Doesn't make sense for "update" and actually fails
+                del extra_submit_params["cluster_id"]
             response = self.arvrunner.api.container_requests().update(
                 uuid=runtimeContext.submit_request_uuid,
                 body=job_spec,
index ccc2e793b067f4e0853da275947a08d3630723a3..47a304372c58a27ecde8d8c13bb55d6435f9cf79 100644 (file)
@@ -132,7 +132,7 @@ def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
     count = 0
     start = time.time()
     checkpoint = start
-    with c.open(name, "w") as f:
+    with c.open(name, "wb") as f:
         for chunk in req.iter_content(chunk_size=1024):
             count += len(chunk)
             f.write(chunk)
index 711796374849c115296e089e31a44729330f45fb..22c49a01bf4ed31592db013ac5f2cb49c4e789cd 100644 (file)
@@ -37,7 +37,7 @@ setup(name='arvados-cwl-runner',
           'schema-salad==3.0.20181129082112',
           'typing >= 3.6.4',
           'ruamel.yaml >=0.15.54, <=0.15.77',
-          'arvados-python-client>=1.2.1.20181130020805',
+          'arvados-python-client>=1.3.0.20190205182514',
           'setuptools',
           'ciso8601 >=1.0.6, <2.0.0',
           'subprocess32>=3.5.1',
index 88bd49fcd329faee02fc29f777adbdd41d3f21a2..4119fee383e27bcfe30a97d3de754d1879c067a9 100644 (file)
@@ -60,7 +60,7 @@ class TestHttpToKeep(unittest.TestCase):
 
         getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True)
 
-        cm.open.assert_called_with("file1.txt", "w")
+        cm.open.assert_called_with("file1.txt", "wb")
         cm.save_new.assert_called_with(name="Downloaded from http://example.com/file1.txt",
                                        owner_uuid=None, ensure_unique_name=True)
 
@@ -188,7 +188,7 @@ class TestHttpToKeep(unittest.TestCase):
 
         getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True)
 
-        cm.open.assert_called_with("file1.txt", "w")
+        cm.open.assert_called_with("file1.txt", "wb")
         cm.save_new.assert_called_with(name="Downloaded from http://example.com/file1.txt",
                                        owner_uuid=None, ensure_unique_name=True)
 
@@ -279,7 +279,7 @@ class TestHttpToKeep(unittest.TestCase):
 
         getmock.assert_called_with("http://example.com/download?fn=/file1.txt", stream=True, allow_redirects=True)
 
-        cm.open.assert_called_with("file1.txt", "w")
+        cm.open.assert_called_with("file1.txt", "wb")
         cm.save_new.assert_called_with(name="Downloaded from http://example.com/download?fn=/file1.txt",
                                        owner_uuid=None, ensure_unique_name=True)
 
index 782282d2a0d4312a8a25a18b970d009bc6a9f66f..39117d86e3ca976ffaa19e1e5596e37bf018b842 100644 (file)
@@ -512,7 +512,7 @@ class TestSubmit(unittest.TestCase):
             body=JsonDiffMatcher(expect_pipeline))
 
     @stubs
-    def test_submit_container(self, stubs):        
+    def test_submit_container(self, stubs):
         exited = arvados_cwl.main(
             ["--submit", "--no-wait", "--api=containers", "--debug",
                 "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
@@ -559,7 +559,7 @@ class TestSubmit(unittest.TestCase):
         stubs.api.container_requests().create.assert_called_with(
             body=JsonDiffMatcher(expect_container))
         self.assertEqual(stubs.capture_stdout.getvalue(),
-                         stubs.expect_container_request_uuid + '\n') 
+                         stubs.expect_container_request_uuid + '\n')
         self.assertEqual(exited, 0)
 
     @stubs
@@ -602,7 +602,7 @@ class TestSubmit(unittest.TestCase):
             ["--submit", "--no-wait", "--api=containers", "--debug", "--on-error=stop",
                 "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
             stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
+
         expect_container = copy.deepcopy(stubs.expect_container_spec)
         expect_container["command"] = ['arvados-cwl-runner', '--local', '--api=containers',
                                        '--no-log-timestamps', '--disable-validate',
@@ -620,7 +620,7 @@ class TestSubmit(unittest.TestCase):
     @stubs
     def test_submit_container_output_name(self, stubs):
         output_name = "test_output_name"
-  
+
         exited = arvados_cwl.main(
             ["--submit", "--no-wait", "--api=containers", "--debug", "--output-name", output_name,
                 "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
@@ -642,7 +642,7 @@ class TestSubmit(unittest.TestCase):
         self.assertEqual(exited, 0)
 
     @stubs
-    def test_submit_storage_classes(self, stubs):     
+    def test_submit_storage_classes(self, stubs):
         exited = arvados_cwl.main(
             ["--debug", "--submit", "--no-wait", "--api=containers", "--storage-classes=foo",
                 "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
@@ -726,7 +726,7 @@ class TestSubmit(unittest.TestCase):
             ["--submit", "--no-wait", "--api=containers", "--debug", "--trash-intermediate",
                 "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
             stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
-            
+
 
         expect_container = copy.deepcopy(stubs.expect_container_spec)
         expect_container["command"] = ['arvados-cwl-runner', '--local', '--api=containers',
@@ -1394,7 +1394,7 @@ class TestSubmit(unittest.TestCase):
             stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
 
         stubs.api.container_requests().update.assert_called_with(
-            uuid="zzzzz-xvhdp-yyyyyyyyyyyyyyy", body=JsonDiffMatcher(stubs.expect_container_spec), cluster_id="zzzzz")
+            uuid="zzzzz-xvhdp-yyyyyyyyyyyyyyy", body=JsonDiffMatcher(stubs.expect_container_spec))
         self.assertEqual(stubs.capture_stdout.getvalue(),
                          stubs.expect_container_request_uuid + '\n')
         self.assertEqual(exited, 0)
@@ -1541,7 +1541,7 @@ class TestCreateTemplate(unittest.TestCase):
         self.assertEqual(stubs.capture_stdout.getvalue(),
                          self.existing_template_uuid + '\n')
         self.assertEqual(exited, 0)
-        
+
 
 class TestCreateWorkflow(unittest.TestCase):
     existing_workflow_uuid = "zzzzz-7fd4e-validworkfloyml"
index 522eb41a72ddf991349fb1074b3a474be53a923e..636cf350c151b150c59a96c45c312ea7af6d45c2 100644 (file)
@@ -100,7 +100,7 @@ type InstanceType struct {
 type Dispatch struct {
        // PEM encoded SSH key (RSA, DSA, or ECDSA) able to log in to
        // cloud VMs.
-       PrivateKey []byte
+       PrivateKey string
 
        // Max time for workers to come up before abandoning stale
        // locks from previous run
index 39253e1036ba9a52b2070f9e0a7d4043fecb2d43..38538cb4ffbe8d6db29fcc430cc67620f25641b4 100644 (file)
@@ -155,6 +155,12 @@ class ApiClientAuthorization < ArvadosModel
         clnt = HTTPClient.new
         if Rails.configuration.sso_insecure
           clnt.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        else
+          # Use system CA certificates
+          ["/etc/ssl/certs/ca-certificates.crt",
+           "/etc/pki/tls/certs/ca-bundle.crt"]
+            .select { |ca_path| File.readable?(ca_path) }
+            .each { |ca_path| clnt.ssl_config.add_trust_ca(ca_path) }
         end
         remote_user = SafeJSON.load(
           clnt.get_content('https://' + host + '/arvados/v1/users/current',
index 605e8540ee1df59d3b96618ebef50f4b39567384..b64aab2dc6cb0e189341ab93d175e27d38a659ce 100644 (file)
@@ -24,7 +24,7 @@ Gem::Specification.new do |s|
   s.files       = ["bin/arvados-login-sync", "agpl-3.0.txt"]
   s.executables << "arvados-login-sync"
   s.required_ruby_version = '>= 2.1.0'
-  s.add_runtime_dependency 'arvados', '~> 1.2.0', '>= 1.2.0'
+  s.add_runtime_dependency 'arvados', '~> 1.3.0', '>= 1.3.0'
   s.homepage    =
     'https://arvados.org'
 end