16833: Merge branch 'master' into 16833-replace-epydoc
author Ward Vandewege <ward@curii.com>
Wed, 16 Sep 2020 20:00:07 +0000 (16:00 -0400)
committer Ward Vandewege <ward@curii.com>
Wed, 16 Sep 2020 20:00:24 +0000 (16:00 -0400)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

25 files changed:
doc/Gemfile.lock
lib/config/config.default.yml
lib/config/generated_config.go
lib/crunchrun/background.go
lib/dispatchcloud/container/queue.go
lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/scheduler/run_queue_test.go
lib/dispatchcloud/scheduler/scheduler.go
lib/dispatchcloud/test/stub_driver.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/worker.go
sdk/go/arvados/config.go
services/api/app/models/arvados_model.rb
services/api/lib/config_loader.rb
services/api/lib/enable_jobs_api.rb
services/api/test/functional/arvados/v1/groups_controller_test.rb
services/api/test/functional/arvados/v1/schema_controller_test.rb
services/api/test/functional/user_sessions_controller_test.rb
services/api/test/test_helper.rb
services/api/test/unit/application_test.rb
services/api/test/unit/collection_test.rb
services/api/test/unit/container_request_test.rb
services/api/test/unit/job_test.rb
services/api/test/unit/log_test.rb
services/api/test/unit/user_test.rb

diff --git a/doc/Gemfile.lock b/doc/Gemfile.lock
index 344a0a86b51555d60c7bb812afbdc0d5a1819349..b5e62cacd6b0b605599fc1cff58428d6d73673fc 100644 (file)
@@ -1,28 +1,23 @@
 GEM
   remote: https://rubygems.org/
   specs:
-    RedCloth (4.2.9)
-    coderay (1.1.0)
-    colorize (0.6.0)
-    kramdown (1.3.1)
-    less (1.2.21)
-      mutter (>= 0.4.2)
-      treetop (>= 1.4.2)
-    liquid (2.6.1)
-    makerakeworkwell (1.0.3)
-      rake (>= 0.9.2, < 11)
-    mutter (0.5.3)
-    polyglot (0.3.3)
-    rake (10.1.1)
-    treetop (1.4.15)
-      polyglot
-      polyglot (>= 0.3.1)
-    zenweb (3.3.1)
+    RedCloth (4.3.2)
+    coderay (1.1.3)
+    colorize (0.8.1)
+    commonjs (0.2.7)
+    kramdown (1.17.0)
+    less (2.6.0)
+      commonjs (~> 0.2.7)
+    liquid (4.0.3)
+    makerakeworkwell (1.0.4)
+      rake (>= 0.9.2, < 15)
+    rake (13.0.1)
+    zenweb (3.10.4)
       coderay (~> 1.0)
-      kramdown (~> 1.0)
-      less (~> 1.2)
+      kramdown (~> 1.4)
+      less (~> 2.0)
       makerakeworkwell (~> 1.0)
-      rake (>= 0.9, < 11)
+      rake (>= 0.9, < 15)
 
 PLATFORMS
   ruby
@@ -32,3 +27,6 @@ DEPENDENCIES
   colorize
   liquid
   zenweb
+
+BUNDLED WITH
+   2.1.4
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index b1865a2217ce99c48a13ae8a17e4cf10d336cbf0..15e7c7c06ce723114cafc86d8e4c6ec0c2b99dff 100644 (file)
@@ -938,6 +938,11 @@ Clusters:
         # Time before repeating SIGTERM when killing a container.
         TimeoutSignal: 5s
 
+        # Time to give up on a process (most likely arv-mount) that
+        # still holds a container lockfile after its main supervisor
+        # process has exited, and declare the instance broken.
+        TimeoutStaleRunLock: 5s
+
         # Time to give up on SIGTERM and write off the worker.
         TimeoutTERM: 2m
 
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 201ae3604537f9f44a9e788320b7262685944f98..7ed332151b8bbacaa81fa7352e8251058296b8c8 100644 (file)
@@ -944,6 +944,11 @@ Clusters:
         # Time before repeating SIGTERM when killing a container.
         TimeoutSignal: 5s
 
+        # Time to give up on a process (most likely arv-mount) that
+        # still holds a container lockfile after its main supervisor
+        # process has exited, and declare the instance broken.
+        TimeoutStaleRunLock: 5s
+
         # Time to give up on SIGTERM and write off the worker.
         TimeoutTERM: 2m
 
diff --git a/lib/crunchrun/background.go b/lib/crunchrun/background.go
index bf039afa0ad53799183607fe9795b5556f615bad..8cdba72c10d3c5902225456de9389bcc70b6dbca 100644 (file)
@@ -218,6 +218,24 @@ func ListProcesses(stdout, stderr io.Writer) int {
                        return nil
                }
 
+               proc, err := os.FindProcess(pi.PID)
+               if err != nil {
+                       // FindProcess should have succeeded, even if the
+                       // process does not exist.
+                       fmt.Fprintf(stderr, "%s: find process %d: %s", path, pi.PID, err)
+                       return nil
+               }
+               err = proc.Signal(syscall.Signal(0))
+               if err != nil {
+                       // Process is dead, even though lockfile was
+                       // still locked. Most likely a stuck arv-mount
+                       // process that inherited the lock from
+                       // crunch-run. Report container UUID as
+                       // "stale".
+                       fmt.Fprintln(stdout, pi.UUID, "stale")
+                       return nil
+               }
+
                fmt.Fprintln(stdout, pi.UUID)
                return nil
        }))
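
The liveness check added above is the standard "signal 0" probe: on Unix, os.FindProcess never fails, so the real test is sending signal 0 to the PID. A minimal standalone sketch of the same pattern (the PID values in main are only illustrative):

package main

import (
	"fmt"
	"os"
	"syscall"
)

// alive reports whether a process with the given PID currently exists.
// On Unix, os.FindProcess always succeeds, so the real check is
// proc.Signal(syscall.Signal(0)), which validates the PID without
// delivering a signal. (EPERM would also mean the process exists, but
// crunch-run and the arv-mount it spawns typically run as the same
// user, so a nil error is the relevant case here.)
func alive(pid int) bool {
	proc, err := os.FindProcess(pid)
	if err != nil {
		return false
	}
	return proc.Signal(syscall.Signal(0)) == nil
}

func main() {
	fmt.Println(alive(os.Getpid())) // true: this process certainly exists
	fmt.Println(alive(1 << 22))     // very likely false on a typical system
}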
diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 45b346383fab8641b27d16063a46bc4468fc96ce..a1ff414b73ecbd3be22276b716a64af6eaa3f225 100644 (file)
@@ -382,7 +382,7 @@ func (cq *Queue) poll() (map[string]*arvados.Container, error) {
                        *next[upd.UUID] = upd
                }
        }
-       selectParam := []string{"uuid", "state", "priority", "runtime_constraints", "container_image", "mounts", "scheduling_parameters"}
+       selectParam := []string{"uuid", "state", "priority", "runtime_constraints", "container_image", "mounts", "scheduling_parameters", "created_at"}
        limitParam := 1000
 
        mine, err := cq.fetchAll(arvados.ResourceListParams{
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 6e1850410b28bf3394ec4e29c4416a9551ec6d91..9f1eb098e01aa029ebe735f2f8a7a956bf56c236 100644 (file)
@@ -66,6 +66,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
                                ProbeInterval:        arvados.Duration(5 * time.Millisecond),
                                MaxProbesPerSecond:   1000,
                                TimeoutSignal:        arvados.Duration(3 * time.Millisecond),
+                               TimeoutStaleRunLock:  arvados.Duration(3 * time.Millisecond),
                                TimeoutTERM:          arvados.Duration(20 * time.Millisecond),
                                ResourceTags:         map[string]string{"testtag": "test value"},
                                TagKeyPrefix:         "test:",
@@ -169,6 +170,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
                        stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)) * time.Millisecond)
                default:
                        stubvm.CrunchRunCrashRate = 0.1
+                       stubvm.ArvMountDeadlockRate = 0.1
                }
        }
        s.stubDriver.Bugf = c.Errorf
@@ -221,6 +223,10 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 6d703649893356bdb91429954128d1e5afd6a9c5..fd1d0a870b7ac9f34f9d1dd39f250fed62b4a099 100644 (file)
@@ -391,15 +391,16 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
        c.Check(pool.Running(), check.HasLen, 0)
 }
 
-func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
        ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
        queue := test.Queue{
                ChooseType: chooseType,
                Containers: []arvados.Container{
                        {
-                               UUID:     test.ContainerUUID(1),
-                               Priority: 1,
-                               State:    arvados.ContainerStateLocked,
+                               UUID:      test.ContainerUUID(1),
+                               Priority:  1,
+                               State:     arvados.ContainerStateLocked,
+                               CreatedAt: time.Now().Add(-10 * time.Second),
                                RuntimeConstraints: arvados.RuntimeConstraints{
                                        VCPUs: 1,
                                        RAM:   1 << 30,
@@ -417,9 +418,11 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
        }
        sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
        sch.runQueue()
+       sch.updateMetrics()
 
        c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
        c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
 
        // Create a pool without workers. The queued container will not be started, and the
        // 'over quota' metric will be 1 because no workers are available and canCreate defaults
@@ -427,7 +430,40 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
        pool = stubPool{}
        sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
        sch.runQueue()
+       sch.updateMetrics()
 
        c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
        c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
+
+       // Reset the queue, and create a pool with an idle worker. The queued
+       // container will be started immediately and mLongestWaitTimeSinceQueue
+       // should be zero.
+       queue = test.Queue{
+               ChooseType: chooseType,
+               Containers: []arvados.Container{
+                       {
+                               UUID:      test.ContainerUUID(1),
+                               Priority:  1,
+                               State:     arvados.ContainerStateLocked,
+                               CreatedAt: time.Now().Add(-10 * time.Second),
+                               RuntimeConstraints: arvados.RuntimeConstraints{
+                                       VCPUs: 1,
+                                       RAM:   1 << 30,
+                               },
+                       },
+               },
+       }
+       queue.Update()
+
+       pool = stubPool{
+               idle:    map[arvados.InstanceType]int{test.InstanceType(1): 1},
+               unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+               running: map[string]time.Time{},
+       }
+       sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+       sch.runQueue()
+       sch.updateMetrics()
+
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
 }
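
The assertions above rely on prometheus/client_golang's testutil helper, which reads the current value of a single-metric collector; a minimal sketch of that mechanism, using a stand-in gauge with an arbitrary value:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// A gauge comparable to sch.mLongestWaitTimeSinceQueue in the test above.
	g := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "containers_longest_wait_time_seconds",
		Help: "Current longest wait time of any container since queuing.",
	})
	g.Set(10)

	// testutil.ToFloat64 collects a single-metric collector and returns its
	// value, which is why the assertions can compare gauges to plain ints.
	fmt.Println(int(testutil.ToFloat64(g))) // 10
}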
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 6fd47e919411d69026316d24d8643e5693d06f38..c3e67dd11f70a4e00c8a74f59826efb13bf0e35c 100644 (file)
@@ -11,6 +11,7 @@ import (
        "sync"
        "time"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "github.com/prometheus/client_golang/prometheus"
        "github.com/sirupsen/logrus"
@@ -46,6 +47,7 @@ type Scheduler struct {
 
        mContainersAllocatedNotStarted   prometheus.Gauge
        mContainersNotAllocatedOverQuota prometheus.Gauge
+       mLongestWaitTimeSinceQueue       prometheus.Gauge
 }
 
 // New returns a new unstarted Scheduler.
@@ -87,6 +89,37 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
                Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
        })
        reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
+       sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "containers_longest_wait_time_seconds",
+               Help:      "Current longest wait time of any container since queuing, and before the start of crunch-run.",
+       })
+       reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
+}
+
+func (sch *Scheduler) updateMetrics() {
+       earliest := time.Time{}
+       entries, _ := sch.queue.Entries()
+       running := sch.pool.Running()
+       for _, ent := range entries {
+               if ent.Container.Priority > 0 &&
+                       (ent.Container.State == arvados.ContainerStateQueued || ent.Container.State == arvados.ContainerStateLocked) {
+                       // Exclude containers that are preparing to run the payload (i.e.
+                       // ContainerStateLocked and running on a worker, most likely loading
+                       // the payload image).
+                       if _, ok := running[ent.Container.UUID]; !ok {
+                               if ent.Container.CreatedAt.Before(earliest) || earliest.IsZero() {
+                                       earliest = ent.Container.CreatedAt
+                               }
+                       }
+               }
+       }
+       if !earliest.IsZero() {
+               sch.mLongestWaitTimeSinceQueue.Set(time.Since(earliest).Seconds())
+       } else {
+               sch.mLongestWaitTimeSinceQueue.Set(0)
+       }
 }
 
 // Start starts the scheduler.
@@ -141,6 +174,7 @@ func (sch *Scheduler) run() {
        for {
                sch.runQueue()
                sch.sync()
+               sch.updateMetrics()
                select {
                case <-sch.stop:
                        return
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 132bd4d695f0ef88095951b151be592029c31328..4d32cf221ce49461e092a834ad192460bc37a49d 100644 (file)
@@ -131,7 +131,7 @@ func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID,
                tags:         copyTags(tags),
                providerType: it.ProviderType,
                initCommand:  cmd,
-               running:      map[string]int64{},
+               running:      map[string]stubProcess{},
                killing:      map[string]bool{},
        }
        svm.SSHService = SSHService{
@@ -189,6 +189,8 @@ type StubVM struct {
        CrunchRunMissing      bool
        CrunchRunCrashRate    float64
        CrunchRunDetachDelay  time.Duration
+       ArvMountMaxExitLag    time.Duration
+       ArvMountDeadlockRate  float64
        ExecuteContainer      func(arvados.Container) int
        CrashRunningContainer func(arvados.Container)
 
@@ -198,12 +200,21 @@ type StubVM struct {
        initCommand  cloud.InitCommand
        providerType string
        SSHService   SSHService
-       running      map[string]int64
+       running      map[string]stubProcess
        killing      map[string]bool
        lastPID      int64
+       deadlocked   string
        sync.Mutex
 }
 
+type stubProcess struct {
+       pid int64
+
+       // crunch-run has exited, but arv-mount process (or something)
+       // still holds lock in /var/run/
+       exited bool
+}
+
 func (svm *StubVM) Instance() stubInstance {
        svm.Lock()
        defer svm.Unlock()
@@ -256,7 +267,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
                svm.Lock()
                svm.lastPID++
                pid := svm.lastPID
-               svm.running[uuid] = pid
+               svm.running[uuid] = stubProcess{pid: pid}
                svm.Unlock()
                time.Sleep(svm.CrunchRunDetachDelay)
                fmt.Fprintf(stderr, "starting %s\n", uuid)
@@ -273,14 +284,13 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
                                logger.Print("[test] exiting crunch-run stub")
                                svm.Lock()
                                defer svm.Unlock()
-                               if svm.running[uuid] != pid {
+                               if svm.running[uuid].pid != pid {
                                        bugf := svm.sis.driver.Bugf
                                        if bugf == nil {
                                                bugf = logger.Warnf
                                        }
-                                       bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
-                               } else {
-                                       delete(svm.running, uuid)
+                                       bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s].pid==%d", pid, uuid, svm.running[uuid].pid)
+                                       return
                                }
                                if !completed {
                                        logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
@@ -288,6 +298,15 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
                                                svm.CrashRunningContainer(ctr)
                                        }
                                }
+                               sproc := svm.running[uuid]
+                               sproc.exited = true
+                               svm.running[uuid] = sproc
+                               svm.Unlock()
+                               time.Sleep(svm.ArvMountMaxExitLag * time.Duration(math_rand.Float64()))
+                               svm.Lock()
+                               if math_rand.Float64() >= svm.ArvMountDeadlockRate {
+                                       delete(svm.running, uuid)
+                               }
                        }()
 
                        crashluck := math_rand.Float64()
@@ -333,26 +352,31 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
        if command == "crunch-run --list" {
                svm.Lock()
                defer svm.Unlock()
-               for uuid := range svm.running {
-                       fmt.Fprintf(stdout, "%s\n", uuid)
+               for uuid, sproc := range svm.running {
+                       if sproc.exited {
+                               fmt.Fprintf(stdout, "%s stale\n", uuid)
+                       } else {
+                               fmt.Fprintf(stdout, "%s\n", uuid)
+                       }
                }
                if !svm.ReportBroken.IsZero() && svm.ReportBroken.Before(time.Now()) {
                        fmt.Fprintln(stdout, "broken")
                }
+               fmt.Fprintln(stdout, svm.deadlocked)
                return 0
        }
        if strings.HasPrefix(command, "crunch-run --kill ") {
                svm.Lock()
-               _, running := svm.running[uuid]
-               if running {
+               sproc, running := svm.running[uuid]
+               if running && !sproc.exited {
                        svm.killing[uuid] = true
                        svm.Unlock()
                        time.Sleep(time.Duration(math_rand.Float64()*2) * time.Millisecond)
                        svm.Lock()
-                       _, running = svm.running[uuid]
+                       sproc, running = svm.running[uuid]
                }
                svm.Unlock()
-               if running {
+               if running && !sproc.exited {
                        fmt.Fprintf(stderr, "%s: container is running\n", uuid)
                        return 1
                }
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 086887cb44176f05c9446341a68d7176dd5ed7aa..c6eaeae2b618b11423512cc2be1cb5cfc454c20c 100644 (file)
@@ -64,15 +64,16 @@ type Executor interface {
 }
 
 const (
-       defaultSyncInterval       = time.Minute
-       defaultProbeInterval      = time.Second * 10
-       defaultMaxProbesPerSecond = 10
-       defaultTimeoutIdle        = time.Minute
-       defaultTimeoutBooting     = time.Minute * 10
-       defaultTimeoutProbe       = time.Minute * 10
-       defaultTimeoutShutdown    = time.Second * 10
-       defaultTimeoutTERM        = time.Minute * 2
-       defaultTimeoutSignal      = time.Second * 5
+       defaultSyncInterval        = time.Minute
+       defaultProbeInterval       = time.Second * 10
+       defaultMaxProbesPerSecond  = 10
+       defaultTimeoutIdle         = time.Minute
+       defaultTimeoutBooting      = time.Minute * 10
+       defaultTimeoutProbe        = time.Minute * 10
+       defaultTimeoutShutdown     = time.Second * 10
+       defaultTimeoutTERM         = time.Minute * 2
+       defaultTimeoutSignal       = time.Second * 5
+       defaultTimeoutStaleRunLock = time.Second * 5
 
        // Time after a quota error to try again anyway, even if no
        // instances have been shutdown.
@@ -115,6 +116,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
                timeoutShutdown:                duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
                timeoutTERM:                    duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
                timeoutSignal:                  duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+               timeoutStaleRunLock:            duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock),
                installPublicKey:               installPublicKey,
                tagKeyPrefix:                   cluster.Containers.CloudVMs.TagKeyPrefix,
                stop:                           make(chan bool),
@@ -152,6 +154,7 @@ type Pool struct {
        timeoutShutdown                time.Duration
        timeoutTERM                    time.Duration
        timeoutSignal                  time.Duration
+       timeoutStaleRunLock            time.Duration
        installPublicKey               ssh.PublicKey
        tagKeyPrefix                   string
 
@@ -170,15 +173,17 @@ type Pool struct {
        runnerMD5    [md5.Size]byte
        runnerCmd    string
 
-       mContainersRunning       prometheus.Gauge
-       mInstances               *prometheus.GaugeVec
-       mInstancesPrice          *prometheus.GaugeVec
-       mVCPUs                   *prometheus.GaugeVec
-       mMemory                  *prometheus.GaugeVec
-       mBootOutcomes            *prometheus.CounterVec
-       mDisappearances          *prometheus.CounterVec
-       mTimeToSSH               prometheus.Summary
-       mTimeToReadyForContainer prometheus.Summary
+       mContainersRunning        prometheus.Gauge
+       mInstances                *prometheus.GaugeVec
+       mInstancesPrice           *prometheus.GaugeVec
+       mVCPUs                    *prometheus.GaugeVec
+       mMemory                   *prometheus.GaugeVec
+       mBootOutcomes             *prometheus.CounterVec
+       mDisappearances           *prometheus.CounterVec
+       mTimeToSSH                prometheus.Summary
+       mTimeToReadyForContainer  prometheus.Summary
+       mTimeFromShutdownToGone   prometheus.Summary
+       mTimeFromQueueToCrunchRun prometheus.Summary
 }
 
 type createCall struct {
@@ -661,6 +666,22 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
        })
        reg.MustRegister(wp.mTimeToReadyForContainer)
+       wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
+               Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromShutdownToGone)
+       wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "containers_time_from_queue_to_crunch_run_seconds",
+               Help:       "Number of seconds between the queuing of a container and the start of crunch-run.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
 }
 
 func (wp *Pool) runMetrics() {
@@ -930,6 +951,10 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                if wp.mDisappearances != nil {
                        wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
                }
+               // wkr.destroyed.IsZero() can happen if instance disappeared but we weren't trying to shut it down
+               if wp.mTimeFromShutdownToGone != nil && !wkr.destroyed.IsZero() {
+                       wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
+               }
                delete(wp.workers, id)
                go wkr.Close()
                notify = true
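
For context on the two new Summary metrics: a client_golang Summary configured with Objectives exports per-quantile series plus _sum and _count, which is what dispatcher_test.go matches with regular expressions. A minimal sketch of the registration/observation flow, with an arbitrary 42-second example duration:

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	reg := prometheus.NewRegistry()

	// Summary with the same quantile objectives used by the pool metrics above.
	timeFromShutdownToGone := prometheus.NewSummary(prometheus.SummaryOpts{
		Namespace:  "arvados",
		Subsystem:  "dispatchcloud",
		Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
		Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
	})
	reg.MustRegister(timeFromShutdownToGone)

	// Pretend the first shutdown request happened 42 seconds before the
	// instance finally disappeared from the cloud provider's listing.
	destroyed := time.Now().Add(-42 * time.Second)
	timeFromShutdownToGone.Observe(time.Since(destroyed).Seconds())

	// Each Observe call feeds the quantile, _sum, and _count series.
	mfs, _ := reg.Gather()
	for _, mf := range mfs {
		fmt.Println(mf.GetName(), len(mf.GetMetric()))
	}
}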
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 9199d4bafe764d806312638328cf13fd3b422e4d..5b145d7c6599b75bb6e8b30f6c65e65d82186d84 100644 (file)
@@ -110,6 +110,7 @@ type worker struct {
        probing             chan struct{}
        bootOutcomeReported bool
        timeToReadyReported bool
+       staleRunLockSince   time.Time
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -176,6 +177,9 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
        }
        go func() {
                rr.Start()
+               if wkr.wp.mTimeFromQueueToCrunchRun != nil {
+                       wkr.wp.mTimeFromQueueToCrunchRun.Observe(time.Since(ctr.CreatedAt).Seconds())
+               }
                wkr.mtx.Lock()
                defer wkr.mtx.Unlock()
                now := time.Now()
@@ -382,13 +386,43 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
                return
        }
        ok = true
+
+       staleRunLock := false
        for _, s := range strings.Split(string(stdout), "\n") {
-               if s == "broken" {
+               // Each line of the "crunch-run --list" output is one
+               // of the following:
+               //
+               // * a container UUID, indicating that processes
+               //   related to that container are currently running.
+               //   Optionally followed by " stale", indicating that
+               //   the crunch-run process itself has exited (the
+               //   remaining process is probably arv-mount).
+               //
+               // * the string "broken", indicating that the instance
+               //   appears incapable of starting containers.
+               //
+               // See ListProcesses() in lib/crunchrun/background.go.
+               if s == "" {
+                       // empty string following final newline
+               } else if s == "broken" {
                        reportsBroken = true
-               } else if s != "" {
+               } else if toks := strings.Split(s, " "); len(toks) == 1 {
                        running = append(running, s)
+               } else if toks[1] == "stale" {
+                       wkr.logger.WithField("ContainerUUID", toks[0]).Info("probe reported stale run lock")
+                       staleRunLock = true
                }
        }
+       wkr.mtx.Lock()
+       defer wkr.mtx.Unlock()
+       if !staleRunLock {
+               wkr.staleRunLockSince = time.Time{}
+       } else if wkr.staleRunLockSince.IsZero() {
+               wkr.staleRunLockSince = time.Now()
+       } else if dur := time.Now().Sub(wkr.staleRunLockSince); dur > wkr.wp.timeoutStaleRunLock {
+               wkr.logger.WithField("Duration", dur).Warn("reporting broken after reporting stale run lock for too long")
+               reportsBroken = true
+       }
        return
 }
 
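To summarize the protocol documented in the comment above, a probe reduces each line of "crunch-run --list" output to one of three cases. A standalone sketch of that classification (the UUIDs in main are invented for the example):

package main

import (
	"fmt"
	"strings"
)

// parseListOutput classifies each line of "crunch-run --list" output the
// same way probeRunning does above: a bare UUID is a running container,
// "<uuid> stale" means crunch-run has exited but something (probably
// arv-mount) still holds the run lock, and "broken" flags the instance.
func parseListOutput(stdout string) (running, stale []string, broken bool) {
	for _, s := range strings.Split(stdout, "\n") {
		toks := strings.Split(s, " ")
		switch {
		case s == "":
			// empty string following final newline
		case s == "broken":
			broken = true
		case len(toks) == 1:
			running = append(running, s)
		case toks[1] == "stale":
			stale = append(stale, toks[0])
		}
	}
	return
}

func main() {
	out := "zzzzz-dz642-000000000000001\nzzzzz-dz642-000000000000002 stale\nbroken\n"
	running, stale, broken := parseListOutput(out)
	fmt.Println(running, stale, broken)
}
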
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 363d09dafb5452b350077d6933ed2698689fb513..394e30a737e2aff13128df90d96decfc12428863 100644 (file)
@@ -462,6 +462,7 @@ type CloudVMsConfig struct {
        TimeoutProbe                   Duration
        TimeoutShutdown                Duration
        TimeoutSignal                  Duration
+       TimeoutStaleRunLock            Duration
        TimeoutTERM                    Duration
        ResourceTags                   map[string]string
        TagKeyPrefix                   string
diff --git a/services/api/app/models/arvados_model.rb b/services/api/app/models/arvados_model.rb
index 6fb8ff2b33549af8e4e512a1374363f8dee8fa64..709b4b4d1c1304e456c494a4079ca2ea0cccc8fb 100644 (file)
@@ -454,7 +454,7 @@ class ArvadosModel < ApplicationRecord
   end
 
   def logged_attributes
-    attributes.except(*Rails.configuration.AuditLogs.UnloggedAttributes.keys)
+    attributes.except(*Rails.configuration.AuditLogs.UnloggedAttributes.stringify_keys.keys)
   end
 
   def self.full_text_searchable_columns
diff --git a/services/api/lib/config_loader.rb b/services/api/lib/config_loader.rb
index cf16993ca51054e220713c24cca1a33510e2a423..f421fb5b2a07817905c28bbd1a463da9be936ac5 100644 (file)
@@ -147,14 +147,14 @@ class ConfigLoader
             'Ki' => 1 << 10,
             'M' => 1000000,
             'Mi' => 1 << 20,
-           "G" =>  1000000000,
-           "Gi" => 1 << 30,
-           "T" =>  1000000000000,
-           "Ti" => 1 << 40,
-           "P" =>  1000000000000000,
-           "Pi" => 1 << 50,
-           "E" =>  1000000000000000000,
-           "Ei" => 1 << 60,
+            "G" =>  1000000000,
+            "Gi" => 1 << 30,
+            "T" =>  1000000000000,
+            "Ti" => 1 << 40,
+            "P" =>  1000000000000000,
+            "Pi" => 1 << 50,
+            "E" =>  1000000000000000000,
+            "Ei" => 1 << 60,
           }[mt[2]]
         end
       end
diff --git a/services/api/lib/enable_jobs_api.rb b/services/api/lib/enable_jobs_api.rb
index 1a96a81ad66708f8e45c032737a821d3a5d12ebc..cef76f08a5c93342f2ba02bf4ec699d2bf98bdd3 100644 (file)
@@ -2,16 +2,19 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-Disable_update_jobs_api_method_list = {"jobs.create"=>{},
-                                "pipeline_instances.create"=>{},
-                                "pipeline_templates.create"=>{},
-                                "jobs.update"=>{},
-                                "pipeline_instances.update"=>{},
-                                "pipeline_templates.update"=>{},
-                                "job_tasks.create"=>{},
-                                "job_tasks.update"=>{}}
+Disable_update_jobs_api_method_list = ConfigLoader.to_OrderedOptions({
+                                        "jobs.create"=>{},
+                                        "pipeline_instances.create"=>{},
+                                        "pipeline_templates.create"=>{},
+                                        "jobs.update"=>{},
+                                        "pipeline_instances.update"=>{},
+                                        "pipeline_templates.update"=>{},
+                                        "job_tasks.create"=>{},
+                                        "job_tasks.update"=>{}
+                                      })
 
-Disable_jobs_api_method_list = {"jobs.create"=>{},
+Disable_jobs_api_method_list = ConfigLoader.to_OrderedOptions({
+                                "jobs.create"=>{},
                                 "pipeline_instances.create"=>{},
                                 "pipeline_templates.create"=>{},
                                 "jobs.get"=>{},
@@ -36,7 +39,7 @@ Disable_jobs_api_method_list = {"jobs.create"=>{},
                                 "jobs.show"=>{},
                                 "pipeline_instances.show"=>{},
                                 "pipeline_templates.show"=>{},
-                                "job_tasks.show"=>{}}
+                                "job_tasks.show"=>{}})
 
 def check_enable_legacy_jobs_api
   # Create/update is permanently disabled (legacy functionality has been removed)
diff --git a/services/api/test/functional/arvados/v1/groups_controller_test.rb b/services/api/test/functional/arvados/v1/groups_controller_test.rb
index ff89cd2129b31ad5357d54066262a80cd702e41c..f413188b54b2ba54236a8ea5ce2ab51c002cb434 100644 (file)
@@ -430,7 +430,8 @@ class Arvados::V1::GroupsControllerTest < ActionController::TestCase
   end
 
   test 'get contents with jobs and pipeline instances disabled' do
-    Rails.configuration.API.DisabledAPIs = {'jobs.index'=>{}, 'pipeline_instances.index'=>{}}
+    Rails.configuration.API.DisabledAPIs = ConfigLoader.to_OrderedOptions(
+      {'jobs.index'=>{}, 'pipeline_instances.index'=>{}})
 
     authorize_with :active
     get :contents, params: {
diff --git a/services/api/test/functional/arvados/v1/schema_controller_test.rb b/services/api/test/functional/arvados/v1/schema_controller_test.rb
index 3dd343b13cd29ac567f8244b9399c534651fbceb..764f3a8d1dd395a7bb84b365144ca114bf3c92ff 100644 (file)
@@ -65,8 +65,8 @@ class Arvados::V1::SchemaControllerTest < ActionController::TestCase
   end
 
   test "non-empty disable_api_methods" do
-    Rails.configuration.API.DisabledAPIs =
-      {'jobs.create'=>{}, 'pipeline_instances.create'=>{}, 'pipeline_templates.create'=>{}}
+    Rails.configuration.API.DisabledAPIs = ConfigLoader.to_OrderedOptions(
+      {'jobs.create'=>{}, 'pipeline_instances.create'=>{}, 'pipeline_templates.create'=>{}})
     get :index
     assert_response :success
     discovery_doc = JSON.parse(@response.body)
diff --git a/services/api/test/functional/user_sessions_controller_test.rb b/services/api/test/functional/user_sessions_controller_test.rb
index cd475dea4d1849f6d99374fa3976068767ef1fcb..d979208d381b1a28d5f6ead099e45e6e7d4f9302 100644 (file)
@@ -68,7 +68,7 @@ class UserSessionsControllerTest < ActionController::TestCase
 
   test "login to LoginCluster" do
     Rails.configuration.Login.LoginCluster = 'zbbbb'
-    Rails.configuration.RemoteClusters['zbbbb'] = {'Host' => 'zbbbb.example.com'}
+    Rails.configuration.RemoteClusters['zbbbb'] = ConfigLoader.to_OrderedOptions({'Host' => 'zbbbb.example.com'})
     api_client_page = 'http://client.example.com/home'
     get :login, params: {return_to: api_client_page}
     assert_response :redirect
diff --git a/services/api/test/test_helper.rb b/services/api/test/test_helper.rb
index c99a57aaff49b24910df17c4a745088a4903ce22..5dc77cb98aeaa9cb7758576042e6705d29ef7f22 100644 (file)
@@ -62,7 +62,7 @@ class ActiveSupport::TestCase
   include ArvadosTestSupport
   include CurrentApiClient
 
-  teardown do
+  setup do
     Thread.current[:api_client_ip_address] = nil
     Thread.current[:api_client_authorization] = nil
     Thread.current[:api_client_uuid] = nil
@@ -72,6 +72,14 @@ class ActiveSupport::TestCase
     restore_configuration
   end
 
+  teardown do
+    # Confirm that any changed configuration doesn't include non-symbol keys
+    $arvados_config.keys.each do |conf_name|
+      conf = Rails.configuration.send(conf_name)
+      confirm_keys_as_symbols(conf, conf_name) if conf.respond_to?('keys')
+    end
+  end
+
   def assert_equal(expect, *args)
     if expect.nil?
       assert_nil(*args)
@@ -99,6 +107,14 @@ class ActiveSupport::TestCase
     end
   end
 
+  def confirm_keys_as_symbols(conf, conf_name)
+    assert(conf.is_a?(ActiveSupport::OrderedOptions), "#{conf_name} should be an OrderedOptions object")
+    conf.keys.each do |k|
+      assert(k.is_a?(Symbol), "Key '#{k}' on section '#{conf_name}' should be a Symbol")
+      confirm_keys_as_symbols(conf[k], "#{conf_name}.#{k}") if conf[k].respond_to?('keys')
+    end
+  end
+
   def restore_configuration
     # Restore configuration settings changed during tests
     ConfigLoader.copy_into_config $arvados_config, Rails.configuration
diff --git a/services/api/test/unit/application_test.rb b/services/api/test/unit/application_test.rb
index 679dddf223fa72b9d89f94258f8415c75992143f..e1565ec627c42ec53bbba875d8760d7637302e8c 100644 (file)
@@ -7,7 +7,7 @@ require 'test_helper'
 class ApplicationTest < ActiveSupport::TestCase
   include CurrentApiClient
 
-  test "test act_as_system_user" do
+  test "act_as_system_user" do
     Thread.current[:user] = users(:active)
     assert_equal users(:active), Thread.current[:user]
     act_as_system_user do
@@ -17,7 +17,7 @@ class ApplicationTest < ActiveSupport::TestCase
     assert_equal users(:active), Thread.current[:user]
   end
 
-  test "test act_as_system_user is exception safe" do
+  test "act_as_system_user is exception safe" do
     Thread.current[:user] = users(:active)
     assert_equal users(:active), Thread.current[:user]
     caught = false
@@ -33,4 +33,12 @@ class ApplicationTest < ActiveSupport::TestCase
     assert caught
     assert_equal users(:active), Thread.current[:user]
   end
+
+  test "config maps' keys are returned as symbols" do
+    assert Rails.configuration.Users.AutoSetupUsernameBlacklist.is_a? ActiveSupport::OrderedOptions
+    assert Rails.configuration.Users.AutoSetupUsernameBlacklist.keys.size > 0
+    Rails.configuration.Users.AutoSetupUsernameBlacklist.keys.each do |k|
+      assert k.is_a? Symbol
+    end
+  end
 end
diff --git a/services/api/test/unit/collection_test.rb b/services/api/test/unit/collection_test.rb
index addea83062404b84baf1911d6b81a262d582ce05..48cae5afee92622fcda41c2b48e89761f54ac80d 100644 (file)
@@ -1044,10 +1044,10 @@ class CollectionTest < ActiveSupport::TestCase
   end
 
   test "create collections with managed properties" do
-    Rails.configuration.Collections.ManagedProperties = {
+    Rails.configuration.Collections.ManagedProperties = ConfigLoader.to_OrderedOptions({
       'default_prop1' => {'Value' => 'prop1_value'},
       'responsible_person_uuid' => {'Function' => 'original_owner'}
-    }
+    })
     # Test collection without initial properties
     act_as_user users(:active) do
       c = create_collection 'foo', Encoding::US_ASCII
@@ -1076,9 +1076,9 @@ class CollectionTest < ActiveSupport::TestCase
   end
 
   test "update collection with protected managed properties" do
-    Rails.configuration.Collections.ManagedProperties = {
+    Rails.configuration.Collections.ManagedProperties = ConfigLoader.to_OrderedOptions({
       'default_prop1' => {'Value' => 'prop1_value', 'Protected' => true},
-    }
+    })
     act_as_user users(:active) do
       c = create_collection 'foo', Encoding::US_ASCII
       assert c.valid?
diff --git a/services/api/test/unit/container_request_test.rb b/services/api/test/unit/container_request_test.rb
index b91910d2d66f89081c5bd8ee44ecca20a03da046..90de800b2fa472e05e711a73a56fe97bb5c67572 100644 (file)
@@ -576,7 +576,7 @@ class ContainerRequestTest < ActiveSupport::TestCase
   test "Container.resolve_container_image(pdh)" do
     set_user_from_auth :active
     [[:docker_image, 'v1'], [:docker_image_1_12, 'v2']].each do |coll, ver|
-      Rails.configuration.Containers.SupportedDockerImageFormats = {ver=>{}}
+      Rails.configuration.Containers.SupportedDockerImageFormats = ConfigLoader.to_OrderedOptions({ver=>{}})
       pdh = collections(coll).portable_data_hash
       resolved = Container.resolve_container_image(pdh)
       assert_equal resolved, pdh
@@ -602,7 +602,7 @@ class ContainerRequestTest < ActiveSupport::TestCase
   end
 
   test "migrated docker image" do
-    Rails.configuration.Containers.SupportedDockerImageFormats = {'v2'=>{}}
+    Rails.configuration.Containers.SupportedDockerImageFormats = ConfigLoader.to_OrderedOptions({'v2'=>{}})
     add_docker19_migration_link
 
     # Test that it returns only v2 images even though request is for v1 image.
@@ -620,7 +620,7 @@ class ContainerRequestTest < ActiveSupport::TestCase
   end
 
   test "use unmigrated docker image" do
-    Rails.configuration.Containers.SupportedDockerImageFormats = {'v1'=>{}}
+    Rails.configuration.Containers.SupportedDockerImageFormats = ConfigLoader.to_OrderedOptions({'v1'=>{}})
     add_docker19_migration_link
 
     # Test that it returns only supported v1 images even though there is a
@@ -639,7 +639,7 @@ class ContainerRequestTest < ActiveSupport::TestCase
   end
 
   test "incompatible docker image v1" do
-    Rails.configuration.Containers.SupportedDockerImageFormats = {'v1'=>{}}
+    Rails.configuration.Containers.SupportedDockerImageFormats = ConfigLoader.to_OrderedOptions({'v1'=>{}})
     add_docker19_migration_link
 
     # Don't return unsupported v2 image even if we ask for it directly.
@@ -652,7 +652,7 @@ class ContainerRequestTest < ActiveSupport::TestCase
   end
 
   test "incompatible docker image v2" do
-    Rails.configuration.Containers.SupportedDockerImageFormats = {'v2'=>{}}
+    Rails.configuration.Containers.SupportedDockerImageFormats = ConfigLoader.to_OrderedOptions({'v2'=>{}})
     # No migration link, don't return unsupported v1 image,
 
     set_user_from_auth :active
diff --git a/services/api/test/unit/job_test.rb b/services/api/test/unit/job_test.rb
index 0e8cc48538e0f00acc0650d80910774876bc7822..c529aab8b653947ef0d8f64db7fed64c5000c523 100644 (file)
@@ -117,7 +117,7 @@ class JobTest < ActiveSupport::TestCase
     'locator' => BAD_COLLECTION,
   }.each_pair do |spec_type, image_spec|
     test "Job validation fails with nonexistent Docker image #{spec_type}" do
-      Rails.configuration.RemoteClusters = {}
+      Rails.configuration.RemoteClusters = ConfigLoader.to_OrderedOptions({})
       job = Job.new job_attrs(runtime_constraints:
                               {'docker_image' => image_spec})
       assert(job.invalid?, "nonexistent Docker image #{spec_type} #{image_spec} was valid")
diff --git a/services/api/test/unit/log_test.rb b/services/api/test/unit/log_test.rb
index 016a0e4eb4a9b6a59717de2c75a634b3182dd82f..58f0cddb709064ad2f8af9df6c20aa20f0c77d6a 100644 (file)
@@ -282,7 +282,7 @@ class LogTest < ActiveSupport::TestCase
   end
 
   test "non-empty configuration.unlogged_attributes" do
-    Rails.configuration.AuditLogs.UnloggedAttributes = {"manifest_text"=>{}}
+    Rails.configuration.AuditLogs.UnloggedAttributes = ConfigLoader.to_OrderedOptions({"manifest_text"=>{}})
     txt = ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n"
 
     act_as_system_user do
@@ -297,7 +297,7 @@ class LogTest < ActiveSupport::TestCase
   end
 
   test "empty configuration.unlogged_attributes" do
-    Rails.configuration.AuditLogs.UnloggedAttributes = {}
+    Rails.configuration.AuditLogs.UnloggedAttributes = ConfigLoader.to_OrderedOptions({})
     txt = ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n"
 
     act_as_system_user do
diff --git a/services/api/test/unit/user_test.rb b/services/api/test/unit/user_test.rb
index 7fcd36d7091a4c7a00a9af6ae50f10f5a413871d..b6d66230db6e2bbb0a06bbe0add823fdb397642d 100644 (file)
@@ -110,7 +110,7 @@ class UserTest < ActiveSupport::TestCase
   end
 
   test "new username set avoiding blacklist" do
-    Rails.configuration.Users.AutoSetupUsernameBlacklist = {"root"=>{}}
+    Rails.configuration.Users.AutoSetupUsernameBlacklist = ConfigLoader.to_OrderedOptions({"root"=>{}})
     check_new_username_setting("root", "root2")
   end
 
@@ -340,48 +340,52 @@ class UserTest < ActiveSupport::TestCase
     assert_equal(user.first_name, 'first_name_for_newly_created_user_updated')
   end
 
+  active_notify_list = ConfigLoader.to_OrderedOptions({"active-notify@example.com"=>{}})
+  inactive_notify_list = ConfigLoader.to_OrderedOptions({"inactive-notify@example.com"=>{}})
+  empty_notify_list = ConfigLoader.to_OrderedOptions({})
+
   test "create new user with notifications" do
     set_user_from_auth :admin
 
-    create_user_and_verify_setup_and_notifications true, {'active-notify-address@example.com'=>{}}, {'inactive-notify-address@example.com'=>{}}, nil, nil
-    create_user_and_verify_setup_and_notifications true, {'active-notify-address@example.com'=>{}}, {}, nil, nil
-    create_user_and_verify_setup_and_notifications true, {}, [], nil, nil
-    create_user_and_verify_setup_and_notifications false, {'active-notify-address@example.com'=>{}}, {'inactive-notify-address@example.com'=>{}}, nil, nil
-    create_user_and_verify_setup_and_notifications false, {}, {'inactive-notify-address@example.com'=>{}}, nil, nil
-    create_user_and_verify_setup_and_notifications false, {}, {}, nil, nil
+    create_user_and_verify_setup_and_notifications true, active_notify_list, inactive_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications true, active_notify_list, empty_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications true, empty_notify_list, empty_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications false, active_notify_list, inactive_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications false, empty_notify_list, inactive_notify_list, nil, nil
+    create_user_and_verify_setup_and_notifications false, empty_notify_list, empty_notify_list, nil, nil
   end
 
   [
     # Easy inactive user tests.
-    [false, {}, {}, "inactive-none@example.com", false, false, "inactivenone"],
-    [false, {}, {}, "inactive-vm@example.com", true, false, "inactivevm"],
-    [false, {}, {}, "inactive-repo@example.com", false, true, "inactiverepo"],
-    [false, {}, {}, "inactive-both@example.com", true, true, "inactiveboth"],
+    [false, empty_notify_list, empty_notify_list, "inactive-none@example.com", false, false, "inactivenone"],
+    [false, empty_notify_list, empty_notify_list, "inactive-vm@example.com", true, false, "inactivevm"],
+    [false, empty_notify_list, empty_notify_list, "inactive-repo@example.com", false, true, "inactiverepo"],
+    [false, empty_notify_list, empty_notify_list, "inactive-both@example.com", true, true, "inactiveboth"],
 
     # Easy active user tests.
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "active-none@example.com", false, false, "activenone"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "active-vm@example.com", true, false, "activevm"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "active-repo@example.com", false, true, "activerepo"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "active-both@example.com", true, true, "activeboth"],
+    [true, active_notify_list, inactive_notify_list, "active-none@example.com", false, false, "activenone"],
+    [true, active_notify_list, inactive_notify_list, "active-vm@example.com", true, false, "activevm"],
+    [true, active_notify_list, inactive_notify_list, "active-repo@example.com", false, true, "activerepo"],
+    [true, active_notify_list, inactive_notify_list, "active-both@example.com", true, true, "activeboth"],
 
     # Test users with malformed e-mail addresses.
-    [false, {}, {}, nil, true, true, nil],
-    [false, {}, {}, "arvados", true, true, nil],
-    [false, {}, {}, "@example.com", true, true, nil],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "*!*@example.com", true, false, nil],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "*!*@example.com", false, false, nil],
+    [false, empty_notify_list, empty_notify_list, nil, true, true, nil],
+    [false, empty_notify_list, empty_notify_list, "arvados", true, true, nil],
+    [false, empty_notify_list, empty_notify_list, "@example.com", true, true, nil],
+    [true, active_notify_list, inactive_notify_list, "*!*@example.com", true, false, nil],
+    [true, active_notify_list, inactive_notify_list, "*!*@example.com", false, false, nil],
 
     # Test users with various username transformations.
-    [false, {}, {}, "arvados@example.com", false, false, "arvados2"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "arvados@example.com", false, false, "arvados2"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "root@example.com", true, false, "root2"],
-    [false, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "root@example.com", true, false, "root2"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "roo_t@example.com", false, true, "root2"],
-    [false, {}, {}, "^^incorrect_format@example.com", true, true, "incorrectformat"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "&4a_d9.@example.com", true, true, "ad9"],
-    [true, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "&4a_d9.@example.com", false, false, "ad9"],
-    [false, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "&4a_d9.@example.com", true, true, "ad9"],
-    [false, {"active-notify@example.com"=>{}}, {"inactive-notify@example.com"=>{}}, "&4a_d9.@example.com", false, false, "ad9"],
+    [false, empty_notify_list, empty_notify_list, "arvados@example.com", false, false, "arvados2"],
+    [true, active_notify_list, inactive_notify_list, "arvados@example.com", false, false, "arvados2"],
+    [true, active_notify_list, inactive_notify_list, "root@example.com", true, false, "root2"],
+    [false, active_notify_list, inactive_notify_list, "root@example.com", true, false, "root2"],
+    [true, active_notify_list, inactive_notify_list, "roo_t@example.com", false, true, "root2"],
+    [false, empty_notify_list, empty_notify_list, "^^incorrect_format@example.com", true, true, "incorrectformat"],
+    [true, active_notify_list, inactive_notify_list, "&4a_d9.@example.com", true, true, "ad9"],
+    [true, active_notify_list, inactive_notify_list, "&4a_d9.@example.com", false, false, "ad9"],
+    [false, active_notify_list, inactive_notify_list, "&4a_d9.@example.com", true, true, "ad9"],
+    [false, active_notify_list, inactive_notify_list, "&4a_d9.@example.com", false, false, "ad9"],
   ].each do |active, new_user_recipients, inactive_recipients, email, auto_setup_vm, auto_setup_repo, expect_username|
     test "create new user with auto setup #{active} #{email} #{auto_setup_vm} #{auto_setup_repo}" do
       set_user_from_auth :admin
@@ -569,7 +573,6 @@ class UserTest < ActiveSupport::TestCase
     assert_not_nil resp_user, 'expected user object'
     assert_not_nil resp_user['uuid'], 'expected user object'
     assert_equal email, resp_user['email'], 'expected email not found'
-
   end
 
   def verify_link (link_object, link_class, link_name, tail_uuid, head_uuid)
@@ -648,7 +651,7 @@ class UserTest < ActiveSupport::TestCase
     if not new_user_recipients.empty? then
       assert_not_nil new_user_email, 'Expected new user email after setup'
       assert_equal Rails.configuration.Users.UserNotifierEmailFrom, new_user_email.from[0]
-      assert_equal new_user_recipients.keys.first, new_user_email.to[0]
+      assert_equal new_user_recipients.stringify_keys.keys.first, new_user_email.to[0]
       assert_equal new_user_email_subject, new_user_email.subject
     else
       assert_nil new_user_email, 'Did not expect new user email after setup'
@@ -658,7 +661,7 @@ class UserTest < ActiveSupport::TestCase
       if not inactive_recipients.empty? then
         assert_not_nil new_inactive_user_email, 'Expected new inactive user email after setup'
         assert_equal Rails.configuration.Users.UserNotifierEmailFrom, new_inactive_user_email.from[0]
-        assert_equal inactive_recipients.keys.first, new_inactive_user_email.to[0]
+        assert_equal inactive_recipients.stringify_keys.keys.first, new_inactive_user_email.to[0]
         assert_equal "#{Rails.configuration.Users.EmailSubjectPrefix}New inactive user notification", new_inactive_user_email.subject
       else
         assert_nil new_inactive_user_email, 'Did not expect new inactive user email after setup'
@@ -667,7 +670,6 @@ class UserTest < ActiveSupport::TestCase
       assert_nil new_inactive_user_email, 'Expected no inactive user email after setting up active user'
     end
     ActionMailer::Base.deliveries = []
-
   end
 
   def verify_link_exists link_exists, head_uuid, tail_uuid, link_class, link_name, property_name=nil, property_value=nil