14931: Add configurable prefix for built-in tags.
authorTom Clegg <tclegg@veritasgenetics.com>
Thu, 30 May 2019 14:13:07 +0000 (10:13 -0400)
committerTom Clegg <tclegg@veritasgenetics.com>
Fri, 31 May 2019 17:57:32 +0000 (13:57 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

lib/config/config.default.yml
lib/config/generated_config.go
lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/pool_test.go
lib/dispatchcloud/worker/verify.go
lib/dispatchcloud/worker/worker.go
sdk/go/arvados/config.go

index b1ed91d4bbc7250a0736081abd61100569c70118..5a482e2d2269a3832f284386676b6f28903c7f66 100644 (file)
@@ -537,6 +537,17 @@ Clusters:
         ResourceTags:
           SAMPLE: "tag value"
 
+        # Prefix for predefined tags used by Arvados (InstanceSetID,
+        # InstanceType, InstanceSecret, IdleBehavior). For example,
+        # set to "arvados-" to use tag key "arvados-InstanceSecret"
+        # instead of "InstanceSecret".
+        #
+        # This should only be changed while no cloud resources are in
+        # use and the cloud dispatcher is not running. Otherwise,
+        # VMs/resources that were added using the old tag prefix will
+        # need to be detected and cleaned up manually.
+        TagKeyPrefix: ""
+
         # Cloud driver: "azure" (Microsoft Azure) or "ec2" (Amazon AWS).
         Driver: ec2
 
index 513eee3622463d8209dd9cd26ba713964ef9233f..e34203dc8294441e71a9272985e967942768ff57 100644 (file)
@@ -543,6 +543,17 @@ Clusters:
         ResourceTags:
           SAMPLE: "tag value"
 
+        # Prefix for predefined tags used by Arvados (InstanceSetID,
+        # InstanceType, InstanceSecret, IdleBehavior). For example,
+        # set to "arvados-" to use tag key "arvados-InstanceSecret"
+        # instead of "InstanceSecret".
+        #
+        # This should only be changed while no cloud resources are in
+        # use and the cloud dispatcher is not running. Otherwise,
+        # VMs/resources that were added using the old tag prefix will
+        # need to be detected and cleaned up manually.
+        TagKeyPrefix: ""
+
         # Cloud driver: "azure" (Microsoft Azure) or "ec2" (Amazon AWS).
         Driver: ec2
 
index 11786bd3cc5713119821601ecd18d1aeb76a045d..012621f12f633fe9c352e2f6bb847dadb965a59d 100644 (file)
@@ -66,6 +66,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
                                TimeoutSignal:        arvados.Duration(3 * time.Millisecond),
                                TimeoutTERM:          arvados.Duration(20 * time.Millisecond),
                                ResourceTags:         map[string]string{"testtag": "test value"},
+                               TagKeyPrefix:         "test:",
                        },
                },
                InstanceTypes: arvados.InstanceTypeMap{
index 8af1037125f7d31b08855971e032f8e50e8b7731..0ee36a96ff1d23d3c27e48679dba4b31007299f4 100644 (file)
@@ -112,6 +112,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
                timeoutTERM:        duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
                timeoutSignal:      duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
                installPublicKey:   installPublicKey,
+               tagKeyPrefix:       cluster.Containers.CloudVMs.TagKeyPrefix,
                stop:               make(chan bool),
        }
        wp.registerMetrics(reg)
@@ -146,6 +147,7 @@ type Pool struct {
        timeoutTERM        time.Duration
        timeoutSignal      time.Duration
        installPublicKey   ssh.PublicKey
+       tagKeyPrefix       string
 
        // private state
        subscribers  map[<-chan struct{}]chan<- struct{}
@@ -284,10 +286,10 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
        go func() {
                defer wp.notify()
                tags := cloud.InstanceTags{
-                       wp.tagPrefix + tagKeyInstanceSetID:  string(wp.instanceSetID),
-                       wp.tagPrefix + tagKeyInstanceType:   it.Name,
-                       wp.tagPrefix + tagKeyIdleBehavior:   string(IdleBehaviorRun),
-                       wp.tagPrefix + tagKeyInstanceSecret: secret,
+                       wp.tagKeyPrefix + tagKeyInstanceSetID:  string(wp.instanceSetID),
+                       wp.tagKeyPrefix + tagKeyInstanceType:   it.Name,
+                       wp.tagKeyPrefix + tagKeyIdleBehavior:   string(IdleBehaviorRun),
+                       wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
                }
                initCmd := cloud.InitCommand(fmt.Sprintf("umask 0177 && echo -n %q >%s", secret, instanceSecretFilename))
                inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
@@ -342,7 +344,8 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
 //
 // Caller must have lock.
 func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) {
-       inst = tagVerifier{inst}
+       secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret]
+       inst = tagVerifier{inst, secret}
        id := inst.ID()
        if wkr := wp.workers[id]; wkr != nil {
                wkr.executor.SetTarget(inst)
@@ -353,7 +356,7 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*wor
        }
 
        state := StateUnknown
-       if _, ok := wp.creating[inst.Tags()[tagKeyInstanceSecret]]; ok {
+       if _, ok := wp.creating[secret]; ok {
                state = StateBooting
        }
 
@@ -363,7 +366,7 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*wor
        // process); otherwise, default to "run". After this,
        // wkr.idleBehavior is the source of truth, and will only be
        // changed via SetIdleBehavior().
-       idleBehavior := IdleBehavior(inst.Tags()[tagKeyIdleBehavior])
+       idleBehavior := IdleBehavior(inst.Tags()[wp.tagKeyPrefix+tagKeyIdleBehavior])
        if !validIdleBehavior[idleBehavior] {
                idleBehavior = IdleBehaviorRun
        }
@@ -732,7 +735,7 @@ func (wp *Pool) getInstancesAndSync() error {
        }
        wp.logger.Debug("getting instance list")
        threshold := time.Now()
-       instances, err := wp.instanceSet.Instances(cloud.InstanceTags{tagKeyInstanceSetID: string(wp.instanceSetID)})
+       instances, err := wp.instanceSet.Instances(cloud.InstanceTags{wp.tagKeyPrefix + tagKeyInstanceSetID: string(wp.instanceSetID)})
        if err != nil {
                wp.instanceSet.throttleInstances.CheckRateLimitError(err, wp.logger, "list instances", wp.notify)
                return err
@@ -752,7 +755,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
        notify := false
 
        for _, inst := range instances {
-               itTag := inst.Tags()[tagKeyInstanceType]
+               itTag := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceType]
                it, ok := wp.instanceTypes[itTag]
                if !ok {
                        wp.logger.WithField("Instance", inst).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
index 8ab4c987544e06e8535a9de2ca12df7052ff8edb..300b4730fc3b0dd9107803df1931bb95a4b52f6a 100644 (file)
@@ -83,6 +83,7 @@ func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) {
                                MaxProbesPerSecond: 1000,
                                ProbeInterval:      arvados.Duration(time.Millisecond * 10),
                                SyncInterval:       arvados.Duration(time.Millisecond * 10),
+                               TagKeyPrefix:       "testprefix:",
                        },
                },
                InstanceTypes: arvados.InstanceTypeMap{
@@ -107,13 +108,14 @@ func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) {
                }
        }
        // Wait for the tags to save to the cloud provider
+       tagKey := cluster.Containers.CloudVMs.TagKeyPrefix + tagKeyIdleBehavior
        deadline := time.Now().Add(time.Second)
        for !func() bool {
                pool.mtx.RLock()
                defer pool.mtx.RUnlock()
                for _, wkr := range pool.workers {
                        if wkr.instType == type2 {
-                               return wkr.instance.Tags()[tagKeyIdleBehavior] == string(IdleBehaviorHold)
+                               return wkr.instance.Tags()[tagKey] == string(IdleBehaviorHold)
                        }
                }
                return false
index e22c85d00906fba303f7d636e41a84a3cce3c523..330071951425c1c382f8be4e53f436d758d032f6 100644 (file)
@@ -23,11 +23,11 @@ var (
 
 type tagVerifier struct {
        cloud.Instance
+       secret string
 }
 
 func (tv tagVerifier) VerifyHostKey(pubKey ssh.PublicKey, client *ssh.Client) error {
-       expectSecret := tv.Instance.Tags()[tagKeyInstanceSecret]
-       if err := tv.Instance.VerifyHostKey(pubKey, client); err != cloud.ErrNotImplemented || expectSecret == "" {
+       if err := tv.Instance.VerifyHostKey(pubKey, client); err != cloud.ErrNotImplemented || tv.secret == "" {
                // If the wrapped instance indicates it has a way to
                // verify the key, return that decision.
                return err
@@ -49,7 +49,7 @@ func (tv tagVerifier) VerifyHostKey(pubKey ssh.PublicKey, client *ssh.Client) er
        if err != nil {
                return err
        }
-       if stdout.String() != expectSecret {
+       if stdout.String() != tv.secret {
                return errBadInstanceSecret
        }
        return nil
index 49c5057b3842e49da945d40c3950f7c2185dfcc5..03ab15176f5297b85182d3689b71f5a3f0195004 100644 (file)
@@ -455,8 +455,8 @@ func (wkr *worker) saveTags() {
        instance := wkr.instance
        tags := instance.Tags()
        update := cloud.InstanceTags{
-               tagKeyInstanceType: wkr.instType.Name,
-               tagKeyIdleBehavior: string(wkr.idleBehavior),
+               wkr.wp.tagKeyPrefix + tagKeyInstanceType: wkr.instType.Name,
+               wkr.wp.tagKeyPrefix + tagKeyIdleBehavior: string(wkr.idleBehavior),
        }
        save := false
        for k, v := range update {
index 4356f0003d3421ff1d4f8b4a27e16bcf0c810701..d96bf25173a949dc0d95cb49f9ba639295c019b4 100644 (file)
@@ -170,6 +170,7 @@ type CloudVMsConfig struct {
        TimeoutSignal        Duration
        TimeoutTERM          Duration
        ResourceTags         map[string]string
+       TagKeyPrefix         string
 
        Driver           string
        DriverParameters json.RawMessage