16833: Merge branch 'master' into 16833-replace-epydoc
author Ward Vandewege <ward@curii.com>
Wed, 16 Sep 2020 20:00:07 +0000 (16:00 -0400)
committer Ward Vandewege <ward@curii.com>
Wed, 16 Sep 2020 20:00:24 +0000 (16:00 -0400)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

25 files changed:
doc/Gemfile.lock
lib/config/config.default.yml
lib/config/generated_config.go
lib/crunchrun/background.go
lib/dispatchcloud/container/queue.go
lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/scheduler/run_queue_test.go
lib/dispatchcloud/scheduler/scheduler.go
lib/dispatchcloud/test/stub_driver.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/worker.go
sdk/go/arvados/config.go
services/api/app/models/arvados_model.rb
services/api/lib/config_loader.rb
services/api/lib/enable_jobs_api.rb
services/api/test/functional/arvados/v1/groups_controller_test.rb
services/api/test/functional/arvados/v1/schema_controller_test.rb
services/api/test/functional/user_sessions_controller_test.rb
services/api/test/test_helper.rb
services/api/test/unit/application_test.rb
services/api/test/unit/collection_test.rb
services/api/test/unit/container_request_test.rb
services/api/test/unit/job_test.rb
services/api/test/unit/log_test.rb
services/api/test/unit/user_test.rb

diff --git a/doc/Gemfile.lock b/doc/Gemfile.lock
index 344a0a86b51555d60c7bb812afbdc0d5a1819349..b5e62cacd6b0b605599fc1cff58428d6d73673fc 100644 (file)
@@ -1,28 +1,23 @@
 GEM
   remote: https://rubygems.org/
   specs:
-    RedCloth (4.2.9)
-    coderay (1.1.0)
-    colorize (0.6.0)
-    kramdown (1.3.1)
-    less (1.2.21)
-      mutter (>= 0.4.2)
-      treetop (>= 1.4.2)
-    liquid (2.6.1)
-    makerakeworkwell (1.0.3)
-      rake (>= 0.9.2, < 11)
-    mutter (0.5.3)
-    polyglot (0.3.3)
-    rake (10.1.1)
-    treetop (1.4.15)
-      polyglot
-      polyglot (>= 0.3.1)
-    zenweb (3.3.1)
+    RedCloth (4.3.2)
+    coderay (1.1.3)
+    colorize (0.8.1)
+    commonjs (0.2.7)
+    kramdown (1.17.0)
+    less (2.6.0)
+      commonjs (~> 0.2.7)
+    liquid (4.0.3)
+    makerakeworkwell (1.0.4)
+      rake (>= 0.9.2, < 15)
+    rake (13.0.1)
+    zenweb (3.10.4)
       coderay (~> 1.0)
-      kramdown (~> 1.0)
-      less (~> 1.2)
+      kramdown (~> 1.4)
+      less (~> 2.0)
       makerakeworkwell (~> 1.0)
-      rake (>= 0.9, < 11)
+      rake (>= 0.9, < 15)
 
 PLATFORMS
   ruby
@@ -32,3 +27,6 @@ DEPENDENCIES
   colorize
   liquid
   zenweb
+
+BUNDLED WITH
+   2.1.4
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index b1865a2217ce99c48a13ae8a17e4cf10d336cbf0..15e7c7c06ce723114cafc86d8e4c6ec0c2b99dff 100644 (file)
@@ -938,6 +938,11 @@ Clusters:
         # Time before repeating SIGTERM when killing a container.
         TimeoutSignal: 5s
 
+        # Time to give up on a process (most likely arv-mount) that
+        # still holds a container lockfile after its main supervisor
+        # process has exited, and declare the instance broken.
+        TimeoutStaleRunLock: 5s
+
         # Time to give up on SIGTERM and write off the worker.
         TimeoutTERM: 2m
 
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 201ae3604537f9f44a9e788320b7262685944f98..7ed332151b8bbacaa81fa7352e8251058296b8c8 100644 (file)
@@ -944,6 +944,11 @@ Clusters:
         # Time before repeating SIGTERM when killing a container.
         TimeoutSignal: 5s
 
+        # Time to give up on a process (most likely arv-mount) that
+        # still holds a container lockfile after its main supervisor
+        # process has exited, and declare the instance broken.
+        TimeoutStaleRunLock: 5s
+
         # Time to give up on SIGTERM and write off the worker.
         TimeoutTERM: 2m
 
diff --git a/lib/crunchrun/background.go b/lib/crunchrun/background.go
index bf039afa0ad53799183607fe9795b5556f615bad..8cdba72c10d3c5902225456de9389bcc70b6dbca 100644 (file)
@@ -218,6 +218,24 @@ func ListProcesses(stdout, stderr io.Writer) int {
                        return nil
                }
 
+               proc, err := os.FindProcess(pi.PID)
+               if err != nil {
+                       // FindProcess should have succeeded, even if the
+                       // process does not exist.
+                       fmt.Fprintf(stderr, "%s: find process %d: %s", path, pi.PID, err)
+                       return nil
+               }
+               err = proc.Signal(syscall.Signal(0))
+               if err != nil {
+                       // Process is dead, even though lockfile was
+                       // still locked. Most likely a stuck arv-mount
+                       // process that inherited the lock from
+                       // crunch-run. Report container UUID as
+                       // "stale".
+                       fmt.Fprintln(stdout, pi.UUID, "stale")
+                       return nil
+               }
+
                fmt.Fprintln(stdout, pi.UUID)
                return nil
        }))
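
The liveness check added above is the standard "signal 0" probe: on Unix, os.FindProcess never fails, so the real test is sending signal 0 to the PID. A minimal standalone sketch of the same pattern (the PID values in main are only illustrative):

package main

import (
	"fmt"
	"os"
	"syscall"
)

// alive reports whether a process with the given PID currently exists.
// On Unix, os.FindProcess always succeeds, so the real check is
// proc.Signal(syscall.Signal(0)), which validates the PID without
// delivering a signal. (EPERM would also mean the process exists, but
// crunch-run and the arv-mount it spawns typically run as the same
// user, so a nil error is the relevant case here.)
func alive(pid int) bool {
	proc, err := os.FindProcess(pid)
	if err != nil {
		return false
	}
	return proc.Signal(syscall.Signal(0)) == nil
}

func main() {
	fmt.Println(alive(os.Getpid())) // true: this process certainly exists
	fmt.Println(alive(1 << 22))     // very likely false on a typical system
}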
diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 45b346383fab8641b27d16063a46bc4468fc96ce..a1ff414b73ecbd3be22276b716a64af6eaa3f225 100644 (file)
@@ -382,7 +382,7 @@ func (cq *Queue) poll() (map[string]*arvados.Container, error) {
                        *next[upd.UUID] = upd
                }
        }
-       selectParam := []string{"uuid", "state", "priority", "runtime_constraints", "container_image", "mounts", "scheduling_parameters"}
+       selectParam := []string{"uuid", "state", "priority", "runtime_constraints", "container_image", "mounts", "scheduling_parameters", "created_at"}
        limitParam := 1000
 
        mine, err := cq.fetchAll(arvados.ResourceListParams{
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 6e1850410b28bf3394ec4e29c4416a9551ec6d91..9f1eb098e01aa029ebe735f2f8a7a956bf56c236 100644 (file)
@@ -66,6 +66,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
                                ProbeInterval:        arvados.Duration(5 * time.Millisecond),
                                MaxProbesPerSecond:   1000,
                                TimeoutSignal:        arvados.Duration(3 * time.Millisecond),
+                               TimeoutStaleRunLock:  arvados.Duration(3 * time.Millisecond),
                                TimeoutTERM:          arvados.Duration(20 * time.Millisecond),
                                ResourceTags:         map[string]string{"testtag": "test value"},
                                TagKeyPrefix:         "test:",
@@ -169,6 +170,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
                        stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)) * time.Millisecond)
                default:
                        stubvm.CrunchRunCrashRate = 0.1
+                       stubvm.ArvMountDeadlockRate = 0.1
                }
        }
        s.stubDriver.Bugf = c.Errorf
@@ -221,6 +223,10 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 6d703649893356bdb91429954128d1e5afd6a9c5..fd1d0a870b7ac9f34f9d1dd39f250fed62b4a099 100644 (file)
@@ -391,15 +391,16 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
        c.Check(pool.Running(), check.HasLen, 0)
 }
 
-func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
        ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
        queue := test.Queue{
                ChooseType: chooseType,
                Containers: []arvados.Container{
                        {
-                               UUID:     test.ContainerUUID(1),
-                               Priority: 1,
-                               State:    arvados.ContainerStateLocked,
+                               UUID:      test.ContainerUUID(1),
+                               Priority:  1,
+                               State:     arvados.ContainerStateLocked,
+                               CreatedAt: time.Now().Add(-10 * time.Second),
                                RuntimeConstraints: arvados.RuntimeConstraints{
                                        VCPUs: 1,
                                        RAM:   1 << 30,
@@ -417,9 +418,11 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
        }
        sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
        sch.runQueue()
+       sch.updateMetrics()
 
        c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
        c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
 
        // Create a pool without workers. The queued container will not be started, and the
        // 'over quota' metric will be 1 because no workers are available and canCreate defaults
@@ -427,7 +430,40 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
        pool = stubPool{}
        sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
        sch.runQueue()
+       sch.updateMetrics()
 
        c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
        c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
+
+       // Reset the queue, and create a pool with an idle worker. The queued
+       // container will be started immediately and mLongestWaitTimeSinceQueue
+       // should be zero.
+       queue = test.Queue{
+               ChooseType: chooseType,
+               Containers: []arvados.Container{
+                       {
+                               UUID:      test.ContainerUUID(1),
+                               Priority:  1,
+                               State:     arvados.ContainerStateLocked,
+                               CreatedAt: time.Now().Add(-10 * time.Second),
+                               RuntimeConstraints: arvados.RuntimeConstraints{
+                                       VCPUs: 1,
+                                       RAM:   1 << 30,
+                               },
+                       },
+               },
+       }
+       queue.Update()
+
+       pool = stubPool{
+               idle:    map[arvados.InstanceType]int{test.InstanceType(1): 1},
+               unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+               running: map[string]time.Time{},
+       }
+       sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+       sch.runQueue()
+       sch.updateMetrics()
+
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
 }
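
The assertions above rely on prometheus/client_golang's testutil helper, which reads the current value of a single-metric collector; a minimal sketch of that mechanism, using a stand-in gauge with an arbitrary value:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// A gauge comparable to sch.mLongestWaitTimeSinceQueue in the test above.
	g := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "containers_longest_wait_time_seconds",
		Help: "Current longest wait time of any container since queuing.",
	})
	g.Set(10)

	// testutil.ToFloat64 collects a single-metric collector and returns its
	// value, which is why the assertions can compare gauges to plain ints.
	fmt.Println(int(testutil.ToFloat64(g))) // 10
}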
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 6fd47e919411d69026316d24d8643e5693d06f38..c3e67dd11f70a4e00c8a74f59826efb13bf0e35c 100644 (file)
@@ -11,6 +11,7 @@ import (
        "sync"
        "time"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "github.com/prometheus/client_golang/prometheus"
        "github.com/sirupsen/logrus"
@@ -46,6 +47,7 @@ type Scheduler struct {
 
        mContainersAllocatedNotStarted   prometheus.Gauge
        mContainersNotAllocatedOverQuota prometheus.Gauge
+       mLongestWaitTimeSinceQueue       prometheus.Gauge
 }
 
 // New returns a new unstarted Scheduler.
@@ -87,6 +89,37 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
                Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
        })
        reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
+       sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "containers_longest_wait_time_seconds",
+               Help:      "Current longest wait time of any container since queuing, and before the start of crunch-run.",
+       })
+       reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
+}
+
+func (sch *Scheduler) updateMetrics() {
+       earliest := time.Time{}
+       entries, _ := sch.queue.Entries()
+       running := sch.pool.Running()
+       for _, ent := range entries {
+               if ent.Container.Priority > 0 &&
+                       (ent.Container.State == arvados.ContainerStateQueued || ent.Container.State == arvados.ContainerStateLocked) {
+                       // Exclude containers that are preparing to run the payload (i.e.
+                       // ContainerStateLocked and running on a worker, most likely loading
+                       // the payload image).
+                       if _, ok := running[ent.Container.UUID]; !ok {
+                               if ent.Container.CreatedAt.Before(earliest) || earliest.IsZero() {
+                                       earliest = ent.Container.CreatedAt
+                               }
+                       }
+               }
+       }
+       if !earliest.IsZero() {
+               sch.mLongestWaitTimeSinceQueue.Set(time.Since(earliest).Seconds())
+       } else {
+               sch.mLongestWaitTimeSinceQueue.Set(0)
+       }
 }
 
 // Start starts the scheduler.
@@ -141,6 +174,7 @@ func (sch *Scheduler) run() {
        for {
                sch.runQueue()
                sch.sync()
+               sch.updateMetrics()
                select {
                case <-sch.stop:
                        return
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 132bd4d695f0ef88095951b151be592029c31328..4d32cf221ce49461e092a834ad192460bc37a49d 100644 (file)
@@ -131,7 +131,7 @@ func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID,
                tags:         copyTags(tags),
                providerType: it.ProviderType,
                initCommand:  cmd,
-               running:      map[string]int64{},
+               running:      map[string]stubProcess{},
                killing:      map[string]bool{},
        }
        svm.SSHService = SSHService{
@@ -189,6 +189,8 @@ type StubVM struct {
        CrunchRunMissing      bool
        CrunchRunCrashRate    float64
        CrunchRunDetachDelay  time.Duration
+       ArvMountMaxExitLag    time.Duration
+       ArvMountDeadlockRate  float64
        ExecuteContainer      func(arvados.Container) int
        CrashRunningContainer func(arvados.Container)
 
@@ -198,12 +200,21 @@ type StubVM struct {
        initCommand  cloud.InitCommand
        providerType string
        SSHService   SSHService
-       running      map[string]int64
+       running      map[string]stubProcess
        killing      map[string]bool
        lastPID      int64
+       deadlocked   string
        sync.Mutex
 }
 
+type stubProcess struct {
+       pid int64
+
+       // crunch-run has exited, but arv-mount process (or something)
+       // still holds lock in /var/run/
+       exited bool
+}
+
 func (svm *StubVM) Instance() stubInstance {
        svm.Lock()
        defer svm.Unlock()
@@ -256,7 +267,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
                svm.Lock()
                svm.lastPID++
                pid := svm.lastPID
-               svm.running[uuid] = pid
+               svm.running[uuid] = stubProcess{pid: pid}
                svm.Unlock()
                time.Sleep(svm.CrunchRunDetachDelay)
                fmt.Fprintf(stderr, "starting %s\n", uuid)
@@ -273,14 +284,13 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
                                logger.Print("[test] exiting crunch-run stub")
                                svm.Lock()
                                defer svm.Unlock()
-                               if svm.running[uuid] != pid {
+                               if svm.running[uuid].pid != pid {
                                        bugf := svm.sis.driver.Bugf
                                        if bugf == nil {
                                                bugf = logger.Warnf
                                        }
-                                       bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
-                               } else {
-                                       delete(svm.running, uuid)
+                                       bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s].pid==%d", pid, uuid, svm.running[uuid].pid)
+                                       return
                                }
                                if !completed {
                                        logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
@@ -288,6 +298,15 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
                                                svm.CrashRunningContainer(ctr)
                                        }
                                }
+                               sproc := svm.running[uuid]
+                               sproc.exited = true
+                               svm.running[uuid] = sproc
+                               svm.Unlock()
+                               time.Sleep(svm.ArvMountMaxExitLag * time.Duration(math_rand.Float64()))
+                               svm.Lock()
+                               if math_rand.Float64() >= svm.ArvMountDeadlockRate {
+                                       delete(svm.running, uuid)
+                               }
                        }()
 
                        crashluck := math_rand.Float64()
@@ -333,26 +352,31 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
        if command == "crunch-run --list" {
                svm.Lock()
                defer svm.Unlock()
-               for uuid := range svm.running {
-                       fmt.Fprintf(stdout, "%s\n", uuid)
+               for uuid, sproc := range svm.running {
+                       if sproc.exited {
+                               fmt.Fprintf(stdout, "%s stale\n", uuid)
+                       } else {
+                               fmt.Fprintf(stdout, "%s\n", uuid)
+                       }
                }
                if !svm.ReportBroken.IsZero() && svm.ReportBroken.Before(time.Now()) {
                        fmt.Fprintln(stdout, "broken")
                }
+               fmt.Fprintln(stdout, svm.deadlocked)
                return 0
        }
        if strings.HasPrefix(command, "crunch-run --kill ") {
                svm.Lock()
-               _, running := svm.running[uuid]
-               if running {
+               sproc, running := svm.running[uuid]
+               if running && !sproc.exited {
                        svm.killing[uuid] = true
                        svm.Unlock()
                        time.Sleep(time.Duration(math_rand.Float64()*2) * time.Millisecond)
                        svm.Lock()
-                       _, running = svm.running[uuid]
+                       sproc, running = svm.running[uuid]
                }
                svm.Unlock()
-               if running {
+               if running && !sproc.exited {
                        fmt.Fprintf(stderr, "%s: container is running\n", uuid)
                        return 1
                }
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 086887cb44176f05c9446341a68d7176dd5ed7aa..c6eaeae2b618b11423512cc2be1cb5cfc454c20c 100644 (file)
@@ -64,15 +64,16 @@ type Executor interface {
 }
 
 const (
-       defaultSyncInterval       = time.Minute
-       defaultProbeInterval      = time.Second * 10
-       defaultMaxProbesPerSecond = 10
-       defaultTimeoutIdle        = time.Minute
-       defaultTimeoutBooting     = time.Minute * 10
-       defaultTimeoutProbe       = time.Minute * 10
-       defaultTimeoutShutdown    = time.Second * 10
-       defaultTimeoutTERM        = time.Minute * 2
-       defaultTimeoutSignal      = time.Second * 5
+       defaultSyncInterval        = time.Minute
+       defaultProbeInterval       = time.Second * 10
+       defaultMaxProbesPerSecond  = 10
+       defaultTimeoutIdle         = time.Minute
+       defaultTimeoutBooting      = time.Minute * 10
+       defaultTimeoutProbe        = time.Minute * 10
+       defaultTimeoutShutdown     = time.Second * 10
+       defaultTimeoutTERM         = time.Minute * 2
+       defaultTimeoutSignal       = time.Second * 5
+       defaultTimeoutStaleRunLock = time.Second * 5
 
        // Time after a quota error to try again anyway, even if no
        // instances have been shutdown.
@@ -115,6 +116,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
                timeoutShutdown:                duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
                timeoutTERM:                    duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
                timeoutSignal:                  duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+               timeoutStaleRunLock:            duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock),
                installPublicKey:               installPublicKey,
                tagKeyPrefix:                   cluster.Containers.CloudVMs.TagKeyPrefix,
                stop:                           make(chan bool),
@@ -152,6 +154,7 @@ type Pool struct {
        timeoutShutdown                time.Duration
        timeoutTERM                    time.Duration
        timeoutSignal                  time.Duration
+       timeoutStaleRunLock            time.Duration
        installPublicKey               ssh.PublicKey
        tagKeyPrefix                   string
 
@@ -170,15 +173,17 @@ type Pool struct {
        runnerMD5    [md5.Size]byte
        runnerCmd    string
 
-       mContainersRunning       prometheus.Gauge
-       mInstances               *prometheus.GaugeVec
-       mInstancesPrice          *prometheus.GaugeVec
-       mVCPUs                   *prometheus.GaugeVec
-       mMemory                  *prometheus.GaugeVec
-       mBootOutcomes            *prometheus.CounterVec
-       mDisappearances          *prometheus.CounterVec
-       mTimeToSSH               prometheus.Summary
-       mTimeToReadyForContainer prometheus.Summary
+       mContainersRunning        prometheus.Gauge
+       mInstances                *prometheus.GaugeVec
+       mInstancesPrice           *prometheus.GaugeVec
+       mVCPUs                    *prometheus.GaugeVec
+       mMemory                   *prometheus.GaugeVec
+       mBootOutcomes             *prometheus.CounterVec
+       mDisappearances           *prometheus.CounterVec
+       mTimeToSSH                prometheus.Summary
+       mTimeToReadyForContainer  prometheus.Summary
+       mTimeFromShutdownToGone   prometheus.Summary
+       mTimeFromQueueToCrunchRun prometheus.Summary
 }
 
 type createCall struct {
@@ -661,6 +666,22 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
        })
        reg.MustRegister(wp.mTimeToReadyForContainer)
+       wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
+               Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromShutdownToGone)
+       wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "containers_time_from_queue_to_crunch_run_seconds",
+               Help:       "Number of seconds between the queuing of a container and the start of crunch-run.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
 }
 
 func (wp *Pool) runMetrics() {
@@ -930,6 +951,10 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                if wp.mDisappearances != nil {
                        wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
                }
+               // wkr.destroyed.IsZero() can happen if instance disappeared but we weren't trying to shut it down
+               if wp.mTimeFromShutdownToGone != nil && !wkr.destroyed.IsZero() {
+                       wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
+               }
                delete(wp.workers, id)
                go wkr.Close()
                notify = true
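
For context on the two new Summary metrics: a client_golang Summary configured with Objectives exports per-quantile series plus _sum and _count, which is what dispatcher_test.go matches with regular expressions. A minimal sketch of the registration/observation flow, with an arbitrary 42-second example duration:

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	reg := prometheus.NewRegistry()

	// Summary with the same quantile objectives used by the pool metrics above.
	timeFromShutdownToGone := prometheus.NewSummary(prometheus.SummaryOpts{
		Namespace:  "arvados",
		Subsystem:  "dispatchcloud",
		Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
		Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
	})
	reg.MustRegister(timeFromShutdownToGone)

	// Pretend the first shutdown request happened 42 seconds before the
	// instance finally disappeared from the cloud provider's listing.
	destroyed := time.Now().Add(-42 * time.Second)
	timeFromShutdownToGone.Observe(time.Since(destroyed).Seconds())

	// Each Observe call feeds the quantile, _sum, and _count series.
	mfs, _ := reg.Gather()
	for _, mf := range mfs {
		fmt.Println(mf.GetName(), len(mf.GetMetric()))
	}
}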
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 9199d4bafe764d806312638328cf13fd3b422e4d..5b145d7c6599b75bb6e8b30f6c65e65d82186d84 100644 (file)
@@ -110,6 +110,7 @@ type worker struct {
        probing             chan struct{}
        bootOutcomeReported bool
        timeToReadyReported bool
+       staleRunLockSince   time.Time
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -176,6 +177,9 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
        }
        go func() {
                rr.Start()
+               if wkr.wp.mTimeFromQueueToCrunchRun != nil {
+                       wkr.wp.mTimeFromQueueToCrunchRun.Observe(time.Since(ctr.CreatedAt).Seconds())
+               }
                wkr.mtx.Lock()
                defer wkr.mtx.Unlock()
                now := time.Now()
@@ -382,13 +386,43 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
                return
        }
        ok = true
+
+       staleRunLock := false
        for _, s := range strings.Split(string(stdout), "\n") {
-               if s == "broken" {
+               // Each line of the "crunch-run --list" output is one
+               // of the following:
+               //
+               // * a container UUID, indicating that processes
+               //   related to that container are currently running.
+               //   Optionally followed by " stale", indicating that
+               //   the crunch-run process itself has exited (the
+               //   remaining process is probably arv-mount).
+               //
+               // * the string "broken", indicating that the instance
+               //   appears incapable of starting containers.
+               //
+               // See ListProcesses() in lib/crunchrun/background.go.
+               if s == "" {
+                       // empty string following final newline
+               } else if s == "broken" {
                        reportsBroken = true
-               } else if s != "" {
+               } else if toks := strings.Split(s, " "); len(toks) == 1 {
                        running = append(running, s)
+               } else if toks[1] == "stale" {
+                       wkr.logger.WithField("ContainerUUID", toks[0]).Info("probe reported stale run lock")
+                       staleRunLock = true
                }
        }
+       wkr.mtx.Lock()
+       defer wkr.mtx.Unlock()
+       if !staleRunLock {
+               wkr.staleRunLockSince = time.Time{}
+       } else if wkr.staleRunLockSince.IsZero() {
+               wkr.staleRunLockSince = time.Now()
+       } else if dur := time.Now().Sub(wkr.staleRunLockSince); dur > wkr.wp.timeoutStaleRunLock {
+               wkr.logger.WithField("Duration", dur).Warn("reporting broken after reporting stale run lock for too long")
+               reportsBroken = true
+       }
        return
 }
 
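To summarize the protocol documented in the comment above, a probe reduces each line of "crunch-run --list" output to one of three cases. A standalone sketch of that classification (the UUIDs in main are invented for the example):

package main

import (
	"fmt"
	"strings"
)

// parseListOutput classifies each line of "crunch-run --list" output the
// same way probeRunning does above: a bare UUID is a running container,
// "<uuid> stale" means crunch-run has exited but something (probably
// arv-mount) still holds the run lock, and "broken" flags the instance.
func parseListOutput(stdout string) (running, stale []string, broken bool) {
	for _, s := range strings.Split(stdout, "\n") {
		toks := strings.Split(s, " ")
		switch {
		case s == "":
			// empty string following final newline
		case s == "broken":
			broken = true
		case len(toks) == 1:
			running = append(running, s)
		case toks[1] == "stale":
			stale = append(stale, toks[0])
		}
	}
	return
}

func main() {
	out := "zzzzz-dz642-000000000000001\nzzzzz-dz642-000000000000002 stale\nbroken\n"
	running, stale, broken := parseListOutput(out)
	fmt.Println(running, stale, broken)
}
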
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 363d09dafb5452b350077d6933ed2698689fb513..394e30a737e2aff13128df90d96decfc12428863 100644 (file)
@@ -462,6 +462,7 @@ type CloudVMsConfig struct {
        TimeoutProbe                   Duration
        TimeoutShutdown                Duration
        TimeoutSignal                  Duration
+       TimeoutStaleRunLock            Duration
        TimeoutTERM                    Duration
        ResourceTags                   map[string]string
        TagKeyPrefix                   string
diff --git a/services/api/app/models/arvados_model.rb b/services/api/app/models/arvados_model.rb
index 6fb8ff2b33549af8e4e512a1374363f8dee8fa64..709b4b4d1c1304e456c494a4079ca2ea0cccc8fb 100644 (file)
@@ -454,7 +454,7 @@ class ArvadosModel < ApplicationRecord
   end
 
   def logged_attributes
-    attributes.except(*Rails.configuration.AuditLogs.UnloggedAttributes.keys)
+    attributes.except(*Rails.configuration.AuditLogs.UnloggedAttributes.stringify_keys.keys)
   end
 
   def self.full_text_searchable_columns
diff --git a/services/api/lib/config_loader.rb b/services/api/lib/config_loader.rb
index cf16993ca51054e220713c24cca1a33510e2a423..f421fb5b2a07817905c28bbd1a463da9be936ac5 100644 (file)
@@ -147,14 +147,14 @@ class ConfigLoader
             'Ki' => 1 << 10,
             'M' => 1000000,
             'Mi' => 1 << 20,
-           "G" =>  1000000000,
-           "Gi" => 1 << 30,
-           "T" =>  1000000000000,
-           "Ti" => 1 << 40,
-           "P" =>  1000000000000000,
-           "Pi" => 1 << 50,
-           "E" =>  1000000000000000000,
-           "Ei" => 1 << 60,
+            "G" =>  1000000000,
+            "Gi" => 1 << 30,
+            "T" =>  1000000000000,
+            "Ti" => 1 << 40,
+            "P" =>  1000000000000000,
+            "Pi" => 1 << 50,
+            "E" =>  1000000000000000000,
+            "Ei" => 1 << 60,
           }[mt[2]]
         end
       end
diff --git a/services/api/lib/enable_jobs_api.rb b/services/api/lib/enable_jobs_api.rb
index 1a96a81ad66708f8e45c032737a821d3a5d12ebc..cef76f08a5c93342f2ba02bf4ec699d2bf98bdd3 100644 (file)
@@ -2,16 +2,19 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-Disable_update_jobs_api_method_list = {"jobs.create"=>{},
-                                "pipeline_instances.create"=>{},
-                                "pipeline_templates.create"=>{},
-                                "jobs.update"=>{},
-                                "pipeline_instances.update"=>{},
-                                "pipeline_templates.update"=>{},
-                                "job_tasks.create"=>{},
-                                "job_tasks.update"=>{}}
+Disable_update_jobs_api_method_list = ConfigLoader.to_OrderedOptions({
+                                        "jobs.create"=>{},
+                                        "pipeline_instances.create"=>{},
+                                        "pipeline_templates.create"=>{},
+                                        "jobs.update"=>{},
+                                        "pipeline_instances.update"=>{},
+                                        "pipeline_templates.update"=>{},
+                                        "job_tasks.create"=>{},
+                                        "job_tasks.update"=>{}
+                                      })
 
-Disable_jobs_api_method_list = {"jobs.create"=>{},
+Disable_jobs_api_method_list = ConfigLoader.to_OrderedOptions({
+                                "jobs.create"=>{},
                                 "pipeline_instances.create"=>{},
                                 "pipeline_templates.create"=>{},
                                 "jobs.get"=>{},
@@ -36,7 +39,7 @@ Disable_jobs_api_method_list = {"jobs.create"=>{},
                                 "jobs.show"=>{},
                                 "pipeline_instances.show"=>{},
                                 "pipeline_templates.show"=>{},
-                                "job_tasks.show"=>{}}
+                                "job_tasks.show"=>{}})
 
 def check_enable_legacy_jobs_api
   # Create/update is permanently disabled (legacy functionality has been removed)
diff --git a/services/api/test/functional/arvados/v1/groups_controller_test.rb b/services/api/test/functional/arvados/v1/groups_controller_test.rb
index ff89cd2129b31ad5357d54066262a80cd702e41c..f413188b54b2ba54236a8ea5ce2ab51c002cb434 100644 (file)
@@ -430,7 +430,8 @@ class Arvados::V1::GroupsControllerTest < ActionController::TestCase
   end
 
   test 'get contents with jobs and pipeline instances disabled' do
-    Rails.configuration.API.DisabledAPIs = {'jobs.index'=>{}, 'pipeline_instances.index'=>{}}
+    Rails.configuration.API.DisabledAPIs = ConfigLoader.to_OrderedOptions(
+      {'jobs.index'=>{}, 'pipeline_instances.index'=>{}})
 
     authorize_with :active
     get :contents, params: {
diff --git a/services/api/test/functional/arvados/v1/schema_controller_test.rb b/services/api/test/functional/arvados/v1/schema_controller_test.rb
index 3dd343b13cd29ac567f8244b9399c534651fbceb..764f3a8d1dd395a7bb84b365144ca114bf3c92ff 100644 (file)
@@ -65,8 +65,8 @@ class Arvados::V1::SchemaControllerTest < ActionController::TestCase
   end
 
   test "non-empty disable_api_methods" do
-    Rails.configuration.API.DisabledAPIs =
-      {'jobs.create'=>{}, 'pipeline_instances.create'=>{}, 'pipeline_templates.create'=>{}}
+    Rails.configuration.API.DisabledAPIs = ConfigLoader.to_OrderedOptions(
+      {'jobs.create'=>{}, 'pipeline_instances.create'=>{}, 'pipeline_templates.create'=>{}})
     get :index
     assert_response :success
     discovery_doc = JSON.parse(@response.body)
diff --git a/services/api/test/functional/user_sessions_controller_test.rb b/services/api/test/functional/user_sessions_controller_test.rb
index cd475dea4d1849f6d99374fa3976068767ef1fcb..d979208d381b1a28d5f6ead099e45e6e7d4f9302 100644 (file)
@@ -68,7 +68,7 @@ class UserSessionsControllerTest < ActionController::TestCase
 
   test "login to LoginCluster" do
     Rails.configuration.Login.LoginCluster = 'zbbbb'
-    Rails.configuration.RemoteClusters['zbbbb'] = {'Host' => 'zbbbb.example.com'}
+    Rails.configuration.RemoteClusters['zbbbb'] = ConfigLoader.to_OrderedOptions({'Host' => 'zbbbb.example.com'})
     api_client_page = 'http://client.example.com/home'
     get :login, params: {return_to: api_client_page}
     assert_response :redirect
diff --git a/services/api/test/test_helper.rb b/services/api/test/test_helper.rb
index c99a57aaff49b24910df17c4a745088a4903ce22..5dc77cb98aeaa9cb7758576042e6705d29ef7f22 100644 (file)
@@ -62,7 +62,7 @@ class ActiveSupport::TestCase
   include ArvadosTestSupport
   include CurrentApiClient
 
-  teardown do
+  setup do
     Thread.current[:api_client_ip_address] = nil
     Thread.current[:api_client_authorization] = nil
     Thread.current[:api_client_uuid] = nil
@@ -72,6 +72,14 @@ class ActiveSupport::TestCase
     restore_configuration
   end
 
+  teardown do
+    # Confirm that any changed configuration doesn't include non-symbol keys
+    $arvados_config.keys.each do |conf_name|
+      conf = Rails.configuration.send(conf_name)
+      confirm_keys_as_symbols(conf, conf_name) if conf.respond_to?('keys')
+    end
+  end
+
   def assert_equal(expect, *args)
     if expect.nil?
       assert_nil(*args)
@@ -99,6 +107,14 @@ class ActiveSupport::TestCase
     end
   end
 
+  def confirm_keys_as_symbols(conf, conf_name)
+    assert(conf.is_a?(ActiveSupport::OrderedOptions), "#{conf_name} should be an OrderedOptions object")
+    conf.keys.each do |k|
+      assert(k.is_a?(Symbol), "Key '#{k}' on section '#{conf_name}' should be a Symbol")
+      confirm_keys_as_symbols(conf[k], "#{conf_name}.#{k}") if conf[k].respond_to?('keys')
+    end
+  end
+
   def restore_configuration
     # Restore configuration settings changed during tests
     ConfigLoader.copy_into_config $arvados_config, Rails.configuration
diff --git a/services/api/test/unit/application_test.rb b/services/api/test/unit/application_test.rb
index 679dddf223fa72b9d89f94258f8415c75992143f..e1565ec627c42ec53bbba875d8760d7637302e8c 100644 (file)
@@ -7,7 +7,7 @@ require 'test_helper'
 class ApplicationTest < ActiveSupport::TestCase
   include CurrentApiClient
 
-  test "test act_as_system_user" do
+  test "act_as_system_user" do
     Thread.current[:user] = users(:active)
     assert_equal users(:active), Thread.current[:user]
     act_as_system_user do
@@ -17,7 +17,7 @@ class ApplicationTest < ActiveSupport::TestCase
     assert_equal users(:active), Thread.current[:user]
   end
 
-  test "test act_as_system_user is exception safe" do
+  test "act_as_system_user is exception safe" do
     Thread.current[:user] = users(:active)
     assert_equal users(:active), Thread.current[:user]
     caught = false
@@ -33,4 +33,12 @@ class ApplicationTest < ActiveSupport::TestCase
     assert caught
     assert_equal users(:active), Thread.current[:user]
   end
+
+  test "config maps' keys are returned as symbols" do
+    assert Rails.configuration.Users.AutoSetupUsernameBlacklist.is_a? ActiveSupport::OrderedOptions
+    assert Rails.configuration.Users.AutoSetupUsernameBlacklist.keys.size > 0
+    Rails.configuration.Users.AutoSetupUsernameBlacklist.keys.each do |k|
+      assert k.is_a? Symbol
+    end
+  end
 end
diff --git a/services/api/test/unit/collection_test.rb b/services/api/test/unit/collection_test.rb
index addea83062404b84baf1911d6b81a262d582ce05..48cae5afee92622fcda41c2b48e89761f54ac80d 100644 (file)
@@ -1044,10 +1044,10 @@ class CollectionTest < ActiveSupport::TestCase
   end
 
   test "create collections with managed properties" do
-    Rails.configuration.Collections.ManagedProperties = {
+    Rails.configuration.Collections.ManagedProperties = ConfigLoader.to_OrderedOptions({
       'default_prop1' => {'Value' => 'prop1_value'},
       'responsible_person_uuid' => {'Function' => 'original_owner'}
-    }
+    })
     # Test collection without initial properties
     act_as_user users(:active) do
       c = create_collection 'foo', Encoding::US_ASCII
@@ -1076,9 +1076,9 @@ class CollectionTest < ActiveSupport::TestCase
   end
 
   test "update collection with protected managed properties" do
-    Rails.configuration.Collections.ManagedProperties = {
+    Rails.configuration.Collections.ManagedProperties = ConfigLoader.to_OrderedOptions({
       'default_prop1' => {'Value' => 'prop1_value', 'Protected' => true},
-    }
+    })
     act_as_user users(:active) do
       c = create_collection 'foo', Encoding::US_ASCII
       assert c.valid?
diff --git a/services/api/test/unit/container_request_test.rb b/services/api/test/unit/container_request_test.rb
index b91910d2d66f89081c5bd8ee44ecca20a03da046..90de800b2fa472e05e711a73a56fe97bb5c67572 100644 (file)
@@ -576,7 +576,7 @@ class ContainerRequestTest < ActiveSupport::TestCase
   test "Container.resolve_container_image(pdh)" do
     set_user_from_auth :active
     [[:docker_image, 'v1'], [:docker_image_1_12, 'v2']].each do |coll, ver|
-      Rails.configuration.Containers.SupportedDockerImageFormats = {ver=>{}}
+      Rails.configuration.Containers.SupportedDockerImageFormats = ConfigLoader.to_OrderedOptions({ver=>{}})
       pdh = collections(coll).portable_data_hash
       resolved = Container.resolve_container_image(pdh)
       assert_equal resolved, pdh
@@ -602,7 +602,7 @@ class ContainerRequestTest < ActiveSupport::TestCase
   end
 
   test "migrated docker image" do
-    Rails.configuration.Containers.SupportedDockerImageFormats = {'v2'=>{}}
+    Rails.configuration.Containers.SupportedDockerImageFormats = ConfigLoader.to_OrderedOptions({'v2'=>{}})
     add_docker19_migration_link
 
     # Test that it returns only v2 images even though request is for v1 image.
@@ -620,7 +620,7 @@ class ContainerRequestTest < ActiveSupport::TestCase
   end
 
   test "use unmigrated docker image" do
-    Rails.configuration.Containers.SupportedDockerImageFormats = {'v1'=>{}}
+    Rails.configuration.Containers.SupportedDockerImageFormats = ConfigLoader.to_OrderedOptions({'v1'=>{}})
     add_docker19_migration_link
 
     # Test that it returns only supported v1 images even though there is a
@@ -639,7 +639,7 @@ class ContainerRequestTest < ActiveSupport::TestCase
   end
 
   test "incompatible docker image v1" do
-    Rails.configuration.Containers.SupportedDockerImageFormats = {'v1'=>{}}
+    Rails.configuration.Containers.SupportedDockerImageFormats = ConfigLoader.to_OrderedOptions({'v1'=>{}})
     add_docker19_migration_link
 
     # Don't return unsupported v2 image even if we ask for it directly.
@@ -652,7 +652,7 @@ class ContainerRequestTest < ActiveSupport::TestCase
   end
 
   test "incompatible docker image v2" do
-    Rails.configuration.Containers.SupportedDockerImageFormats = {'v2'=>{}}
+    Rails.configuration.Containers.SupportedDockerImageFormats = ConfigLoader.to_OrderedOptions({'v2'=>{}})
     # No migration link, don't return unsupported v1 image,
 
     set_user_from_auth :active
diff --git a/services/api/test/unit/job_test.rb b/services/api/test/unit/job_test.rb
index 0e8cc48538e0f00acc0650d80910774876bc7822..c529aab8b653947ef0d8f64db7fed64c5000c523 100644 (file)
@@ -117,7 +117,7 @@ class JobTest < ActiveSupport::TestCase
     'locator' => BAD_COLLECTION,
   }.each_pair do |spec_type, image_spec|
     test "Job validation fails with nonexistent Docker image #{spec_type}" do
-      Rails.configuration.RemoteClusters = {}
+      Rails.configuration.RemoteClusters = ConfigLoader.to_OrderedOptions({})
       job = Job.new job_attrs(runtime_constraints:
                               {'docker_image' => image_spec})
       assert(job.invalid?, "nonexistent Docker image #{spec_type} #{image_spec} was valid")
diff --git a/services/api/test/unit/log_test.rb b/services/api/test/unit/log_test.rb
index 016a0e4eb4a9b6a59717de2c75a634b3182dd82f..58f0cddb709064ad2f8af9df6c20aa20f0c77d6a 100644 (file)
@@ -282,7 +282,7 @@ class LogTest < ActiveSupport::TestCase
   end
 
   test "non-empty configuration.unlogged_attributes" do
-    Rails.configuration.AuditLogs.UnloggedAttributes = {"manifest_text"=>{}}
+    Rails.configuration.AuditLogs.UnloggedAttributes = ConfigLoader.to_OrderedOptions({"manifest_text"=>{}})
     txt = ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n"
 
     act_as_system_user do
@@ -297,7 +297,7 @@ class LogTest < ActiveSupport::TestCase
   end
 
   test "empty configuration.unlogged_attributes" do
-    Rails.configuration.AuditLogs.UnloggedAttributes = {}
+    Rails.configuration.AuditLogs.UnloggedAttributes = ConfigLoader.to_OrderedOptions({})
     txt = ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n"
 
     act_as_system_user do
diff --git a/services/api/test/unit/user_test.rb b/services/api/test/unit/user_test.rb
index 7fcd36d7091a4c7a00a9af6ae50f10f5a413871d..b6d66230db6e2bbb0a06bbe0add823fdb397642d 100644 (file)
@@ -110,7 +110,7 @@ class UserTest < ActiveSupport::TestCase
   end
 
   test "new username set avoiding blacklist" do
-    Rails.configuration.Users.AutoSetupUsernameBlacklist = {"root"=>{}}
+    Rails.configuration.Users.AutoSetupUsernameBlacklist = ConfigLoader.to_OrderedOptions({"root"=>{}})
     check_new_username_setting("root", "root2")
   end
 
@@ -340,48 +340,52 @@ class UserTest < ActiveSupport::TestCase
     assert_equal(user.first_name, 'first_name_for_newly_created_user_updated')
   end
 
+  active_notify_list = ConfigLoader.to_OrderedOptions({"active-notify@example.com"=>{}})
+  inactive_notify_list = ConfigLoader.to_OrderedOptions({"inactive-notify@example.com"=>{}})
+  empty_notify_list = ConfigLoader.to_OrderedOptions({})
+
   test "create new user with notifications" do
     set_user_from_auth :admin
 
-    create_user_and_verify_setup_and_notifications true, {'active-notify-address@example.com'=>{}}, {'inactive-notify-address@example.com'=>{}}, nil, nil
-    create_user_and_verify_setup_and_notifications true, {'active-notify-address@example.com'=>{}}, {}, nil, nil
-    create_user_and_verify_setup_and_notifications true, {}, [], nil, nil
-    create_user_and_verify_setup_and_notifications false, {'active-notify-address@example.com'=>{}}, {'inactive-notify-address@example.com'=>{}}, nil, nil
-    create_user_and_verify_setup_and_notifications false, {}, {'inactive-notify-address@example.com'=>{}}, nil, nil
-    create_user_and_verify_setup_and_notifications false, {}, {}, nil, nil
+    create_user_and_verify_setup_and_notifications true, active_notify_list, inactive_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications true, active_notify_list, empty_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications true, empty_notify_list, empty_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications false, active_notify_list, inactive_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications false, empty_notify_list, inactive_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications false, empty_notify_list, empty_notify_list, nil, nil
   end
 
   [
     # Easy inactive user tests.
-    [false, {}, {}, "inactive-none@example.com", false, false, "inactivenone"],
-    [false, {}, {}, "inactive-vm@example.com", true, false, "inactivevm"],
-    [false, {}, {}, "inactive-repo@example.com", false, true, "inactiverepo"],
-    [false, {}, {}, "inactive-both@example.com", true, true, "inactiveboth"],
+    [false, empty_notify_list, empty_notify_list, "inactive-none@example.com", false, false, "inactivenone"],
+    [false, empty_notify_list, empty_notify_list, "inactive-vm@example.com", true, false, "inactivevm"],
+    [false, empty_notify_list, empty_notify_list, "inactive-repo@example.com", false, true, "inactiverepo"],
+    [false, empty_notify_list, empty_notify_list, "inactive-both@example.com", true, true, "inactiveboth"],
 
     # Easy active user tests.
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "active-none@example.com", false, false, "activenone"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "active-vm@example.com", true, false, "activevm"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "active-repo@example.com", false, true, "activerepo"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "active-both@example.com", true, true, "activeboth"],
+    [true, active_notify_list, inactive_notify_list, "active-none@example.com", false, false, "activenone"],
+    [true, active_notify_list, inactive_notify_list, "active-vm@example.com", true, false, "activevm"],
+    [true, active_notify_list, inactive_notify_list, "active-repo@example.com", false, true, "activerepo"],
+    [true, active_notify_list, inactive_notify_list, "active-both@example.com", true, true, "activeboth"],
 
     # Test users with malformed e-mail addresses.
-    [false, {}, {}, nil, true, true, nil],
-    [false, {}, {}, "arvados", true, true, nil],
-    [false, {}, {}, "@example.com", true, true, nil],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "*!*@example.com", true, false, nil],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "*!*@example.com", false, false, nil],
+    [false, empty_notify_list, empty_notify_list, nil, true, true, nil],
+    [false, empty_notify_list, empty_notify_list, "arvados", true, true, nil],
+    [false, empty_notify_list, empty_notify_list, "@example.com", true, true, nil],
+    [true, active_notify_list, inactive_notify_list, "*!*@example.com", true, false, nil],
+    [true, active_notify_list, inactive_notify_list, "*!*@example.com", false, false, nil],
 
     # Test users with various username transformations.
-    [false, {}, {}, "arvados@example.com", false, false, "arvados2"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "arvados@example.com", false, false, "arvados2"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "root@example.com", true, false, "root2"],
-    [false, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "root@example.com", true, false, "root2"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "roo_t@example.com", false, true, "root2"],
-    [false, {}, {}, "^^incorrect_format@example.com", true, true, "incorrectformat"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "&4a_d9.@example.com", true, true, "ad9"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "&4a_d9.@example.com", false, false, "ad9"],
-    [false, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "&4a_d9.@example.com", true, true, "ad9"],
-    [false, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "&4a_d9.@example.com", false, false, "ad9"],
+    [false, empty_notify_list, empty_notify_list, "arvados@example.com", false, false, "arvados2"],
+    [true, active_notify_list, inactive_notify_list, "arvados@example.com", false, false, "arvados2"],
+    [true, active_notify_list, inactive_notify_list, "root@example.com", true, false, "root2"],
+    [false, active_notify_list, inactive_notify_list, "root@example.com", true, false, "root2"],
+    [true, active_notify_list, inactive_notify_list, "roo_t@example.com", false, true, "root2"],
+    [false, empty_notify_list, empty_notify_list, "^^incorrect_format@example.com", true, true, "incorrectformat"],
+    [true, active_notify_list, inactive_notify_list, "&4a_d9.@example.com", true, true, "ad9"],
+    [true, active_notify_list, inactive_notify_list, "&4a_d9.@example.com", false, false, "ad9"],
+    [false, active_notify_list, inactive_notify_list, "&4a_d9.@example.com", true, true, "ad9"],
+    [false, active_notify_list, inactive_notify_list, "&4a_d9.@example.com", false, false, "ad9"],
   ].each do |active, new_user_recipients, inactive_recipients, email, auto_setup_vm, auto_setup_repo, expect_username|
     test "create new user with auto setup #{active} #{email} #{auto_setup_vm} #{auto_setup_repo}" do
       set_user_from_auth :admin
@@ -569,7 +573,6 @@ class UserTest < ActiveSupport::TestCase
     assert_not_nil resp_user, 'expected user object'
     assert_not_nil resp_user['uuid'], 'expected user object'
     assert_equal email, resp_user['email'], 'expected email not found'
-
   end
 
   def verify_link (link_object, link_class, link_name, tail_uuid, head_uuid)
@@ -648,7 +651,7 @@ class UserTest < ActiveSupport::TestCase
     if not new_user_recipients.empty? then
       assert_not_nil new_user_email, 'Expected new user email after setup'
       assert_equal Rails.configuration.Users.UserNotifierEmailFrom, new_user_email.from[0]
-      assert_equal new_user_recipients.keys.first, new_user_email.to[0]
+      assert_equal new_user_recipients.stringify_keys.keys.first, new_user_email.to[0]
       assert_equal new_user_email_subject, new_user_email.subject
     else
       assert_nil new_user_email, 'Did not expect new user email after setup'
@@ -658,7 +661,7 @@ class UserTest < ActiveSupport::TestCase
       if not inactive_recipients.empty? then
         assert_not_nil new_inactive_user_email, 'Expected new inactive user email after setup'
         assert_equal Rails.configuration.Users.UserNotifierEmailFrom, new_inactive_user_email.from[0]
-        assert_equal inactive_recipients.keys.first, new_inactive_user_email.to[0]
+        assert_equal inactive_recipients.stringify_keys.keys.first, new_inactive_user_email.to[0]
         assert_equal "#{Rails.configuration.Users.EmailSubjectPrefix}New inactive user notification", new_inactive_user_email.subject
       else
         assert_nil new_inactive_user_email, 'Did not expect new inactive user email after setup'
@@ -667,7 +670,6 @@ class UserTest < ActiveSupport::TestCase
       assert_nil new_inactive_user_email, 'Expected no inactive user email after setting up active user'
     end
     ActionMailer::Base.deliveries = []
-
   end
 
   def verify_link_exists link_exists, head_uuid, tail_uuid, link_class, link_name, property_name=nil, property_value=nil