Merge branch '16749-arvbox-users' refs #16749
[arvados.git] / lib / cloud / azure / azure.go
index b448bddd701180b00b78fa308f80791d35b6c084..7f949d9bdb3e2fb83c539a0df5f15efec6ca2612 100644 (file)
@@ -8,6 +8,7 @@ import (
        "context"
        "encoding/base64"
        "encoding/json"
+       "errors"
        "fmt"
        "net/http"
        "regexp"
@@ -35,21 +36,23 @@ import (
 var Driver = cloud.DriverFunc(newAzureInstanceSet)
 
 type azureInstanceSetConfig struct {
-       SubscriptionID               string
-       ClientID                     string
-       ClientSecret                 string
-       TenantID                     string
-       CloudEnvironment             string
-       ResourceGroup                string
-       ImageResourceGroup           string
-       Location                     string
-       Network                      string
-       NetworkResourceGroup         string
-       Subnet                       string
-       StorageAccount               string
-       BlobContainer                string
-       DeleteDanglingResourcesAfter arvados.Duration
-       AdminUsername                string
+       SubscriptionID                 string
+       ClientID                       string
+       ClientSecret                   string
+       TenantID                       string
+       CloudEnvironment               string
+       ResourceGroup                  string
+       ImageResourceGroup             string
+       Location                       string
+       Network                        string
+       NetworkResourceGroup           string
+       Subnet                         string
+       StorageAccount                 string
+       BlobContainer                  string
+       SharedImageGalleryName         string
+       SharedImageGalleryImageVersion string
+       DeleteDanglingResourcesAfter   arvados.Duration
+       AdminUsername                  string
 }
 
 type containerWrapper interface {
@@ -290,7 +293,7 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str
        }
 
        var client storage.Client
-       if az.azconfig.StorageAccount != "" {
+       if az.azconfig.StorageAccount != "" && az.azconfig.BlobContainer != "" {
                result, err := storageAcctClient.ListKeys(az.ctx, az.azconfig.ResourceGroup, az.azconfig.StorageAccount)
                if err != nil {
                        az.logger.WithError(err).Warn("Couldn't get account keys")
@@ -306,6 +309,8 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str
 
                blobsvc := client.GetBlobService()
                az.blobcont = blobsvc.GetContainerReference(az.azconfig.BlobContainer)
+       } else if az.azconfig.StorageAccount != "" || az.azconfig.BlobContainer != "" {
+               az.logger.Error("Invalid configuration: StorageAccount and BlobContainer must both be empty or both be set")
        }
 
        az.dispatcherID = dispatcherID
@@ -336,11 +341,7 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str
 
        for i := 0; i < 4; i++ {
                go func() {
-                       for {
-                               nicname, ok := <-az.deleteNIC
-                               if !ok {
-                                       return
-                               }
+                       for nicname := range az.deleteNIC {
                                _, delerr := az.netClient.delete(context.Background(), az.azconfig.ResourceGroup, nicname)
                                if delerr != nil {
                                        az.logger.WithError(delerr).Warnf("Error deleting %v", nicname)
@@ -350,11 +351,7 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str
                        }
                }()
                go func() {
-                       for {
-                               blob, ok := <-az.deleteBlob
-                               if !ok {
-                                       return
-                               }
+                       for blob := range az.deleteBlob {
                                err := blob.Delete(nil)
                                if err != nil {
                                        az.logger.WithError(err).Warnf("Error deleting %v", blob.Name)
@@ -364,11 +361,7 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str
                        }
                }()
                go func() {
-                       for {
-                               disk, ok := <-az.deleteDisk
-                               if !ok {
-                                       return
-                               }
+                       for disk := range az.deleteDisk {
                                _, err := az.disksClient.delete(az.ctx, az.imageResourceGroup, *disk.Name)
                                if err != nil {
                                        az.logger.WithError(err).Warnf("Error deleting disk %+v", *disk.Name)
@@ -382,6 +375,13 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str
        return nil
 }
 
+func (az *azureInstanceSet) cleanupNic(nic network.Interface) {
+       _, delerr := az.netClient.delete(context.Background(), az.azconfig.ResourceGroup, *nic.Name)
+       if delerr != nil {
+               az.logger.WithError(delerr).Warnf("Error cleaning up NIC after failed create")
+       }
+}
+
 func (az *azureInstanceSet) Create(
        instanceType arvados.InstanceType,
        imageID cloud.ImageID,
@@ -419,7 +419,7 @@ func (az *azureInstanceSet) Create(
                Tags:     tags,
                InterfacePropertiesFormat: &network.InterfacePropertiesFormat{
                        IPConfigurations: &[]network.InterfaceIPConfiguration{
-                               network.InterfaceIPConfiguration{
+                               {
                                        Name: to.StringPtr("ip1"),
                                        InterfaceIPConfigurationPropertiesFormat: &network.InterfaceIPConfigurationPropertiesFormat{
                                                Subnet: &network.Subnet{
@@ -441,18 +441,23 @@ func (az *azureInstanceSet) Create(
                return nil, wrapAzureError(err)
        }
 
-       blobname := fmt.Sprintf("%s-os.vhd", name)
+       var blobname string
        customData := base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))
        var storageProfile *compute.StorageProfile
 
        re := regexp.MustCompile(`^http(s?)://`)
        if re.MatchString(string(imageID)) {
+               if az.blobcont == nil {
+                       az.cleanupNic(nic)
+                       return nil, wrapAzureError(errors.New("Invalid configuration: can't configure unmanaged image URL without StorageAccount and BlobContainer"))
+               }
+               blobname = fmt.Sprintf("%s-os.vhd", name)
                instanceVhd := fmt.Sprintf("https://%s.blob.%s/%s/%s",
                        az.azconfig.StorageAccount,
                        az.azureEnv.StorageEndpointSuffix,
                        az.azconfig.BlobContainer,
                        blobname)
-               az.logger.Info("using deprecated VHD image")
+               az.logger.Warn("using deprecated unmanaged image, see https://doc.arvados.org/ to migrate to managed disks")
                storageProfile = &compute.StorageProfile{
                        OsDisk: &compute.OSDisk{
                                OsType:       compute.Linux,
@@ -467,11 +472,16 @@ func (az *azureInstanceSet) Create(
                        },
                }
        } else {
-               az.logger.Info("using managed image")
-
+               id := to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/images/" + string(imageID))
+               if az.azconfig.SharedImageGalleryName != "" && az.azconfig.SharedImageGalleryImageVersion != "" {
+                       id = to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/galleries/" + az.azconfig.SharedImageGalleryName + "/images/" + string(imageID) + "/versions/" + az.azconfig.SharedImageGalleryImageVersion)
+               } else if az.azconfig.SharedImageGalleryName != "" || az.azconfig.SharedImageGalleryImageVersion != "" {
+                       az.cleanupNic(nic)
+                       return nil, wrapAzureError(errors.New("Invalid configuration: SharedImageGalleryName and SharedImageGalleryImageVersion must both be set or both be empty"))
+               }
                storageProfile = &compute.StorageProfile{
                        ImageReference: &compute.ImageReference{
-                               ID: to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/images/" + string(imageID)),
+                               ID: id,
                        },
                        OsDisk: &compute.OSDisk{
                                OsType:       compute.Linux,
@@ -491,7 +501,7 @@ func (az *azureInstanceSet) Create(
                        StorageProfile: storageProfile,
                        NetworkProfile: &compute.NetworkProfile{
                                NetworkInterfaces: &[]compute.NetworkInterfaceReference{
-                                       compute.NetworkInterfaceReference{
+                                       {
                                                ID: nic.ID,
                                                NetworkInterfaceReferenceProperties: &compute.NetworkInterfaceReferenceProperties{
                                                        Primary: to.BoolPtr(true),
@@ -520,17 +530,21 @@ func (az *azureInstanceSet) Create(
 
        vm, err := az.vmClient.createOrUpdate(az.ctx, az.azconfig.ResourceGroup, name, vmParameters)
        if err != nil {
-               if az.blobcont != nil {
+               // Do some cleanup. Otherwise, an unbounded number of new unused nics and
+               // blobs can pile up during times when VMs can't be created and the
+               // dispatcher keeps retrying, because the garbage collection in manageBlobs
+               // and manageNics is only triggered periodically. This is most important
+               // for nics, because those are subject to a quota.
+               az.cleanupNic(nic)
+
+               if blobname != "" {
                        _, delerr := az.blobcont.GetBlobReference(blobname).DeleteIfExists(nil)
                        if delerr != nil {
                                az.logger.WithError(delerr).Warnf("Error cleaning up vhd blob after failed create")
                        }
                }
 
-               _, delerr := az.netClient.delete(context.Background(), az.azconfig.ResourceGroup, *nic.Name)
-               if delerr != nil {
-                       az.logger.WithError(delerr).Warnf("Error cleaning up NIC after failed create")
-               }
+               // Leave cleaning up of managed disks to the garbage collection in manageDisks()
 
                return nil, wrapAzureError(err)
        }
@@ -570,7 +584,7 @@ func (az *azureInstanceSet) Instances(cloud.InstanceTags) ([]cloud.Instance, err
        return instances, nil
 }
 
-// ManageNics returns a list of Azure network interface resources.
+// manageNics returns a list of Azure network interface resources.
 // Also performs garbage collection of NICs which have "namePrefix",
 // are not associated with a virtual machine and have a "created-at"
 // time more than DeleteDanglingResourcesAfter (to prevent racing and
@@ -611,7 +625,7 @@ func (az *azureInstanceSet) manageNics() (map[string]network.Interface, error) {
        return interfaces, nil
 }
 
-// ManageBlobs garbage collects blobs (VM disk images) in the
+// manageBlobs garbage collects blobs (VM disk images) in the
 // configured storage account container.  It will delete blobs which
 // have "namePrefix", are "available" (which means they are not
 // leased to a VM) and haven't been modified for
@@ -646,38 +660,36 @@ func (az *azureInstanceSet) manageBlobs() {
        }
 }
 
-// ManageDisks garbage collects managed compute disks (VM disk images) in the
-// configured resource group.  It will delete disks which
-// have "namePrefix", are "available" (which means they are not
-// leased to a VM) and were created more than DeleteDanglingResourcesAfter seconds ago.
-// (Azure provides no modification timestamp on managed disks, there is only a
-// creation timestamp)
+// manageDisks garbage collects managed compute disks (VM disk images) in the
+// configured resource group.  It will delete disks which have "namePrefix",
+// are "unattached" (which means they are not leased to a VM) and were created
+// more than DeleteDanglingResourcesAfter seconds ago.  (Azure provides no
+// modification timestamp on managed disks, there is only a creation timestamp)
 func (az *azureInstanceSet) manageDisks() {
 
-       re := regexp.MustCompile(`^` + az.namePrefix + `.*-os$`)
-       timestamp := time.Now()
+       re := regexp.MustCompile(`^` + regexp.QuoteMeta(az.namePrefix) + `.*-os$`)
+       threshold := time.Now().Add(-az.azconfig.DeleteDanglingResourcesAfter.Duration())
 
-       for {
-               response, err := az.disksClient.listByResourceGroup(az.ctx, az.imageResourceGroup)
+       response, err := az.disksClient.listByResourceGroup(az.ctx, az.imageResourceGroup)
+       if err != nil {
+               az.logger.WithError(err).Warn("Error listing disks")
+               return
+       }
+
+       for ; response.NotDone(); err = response.Next() {
                if err != nil {
-                       az.logger.WithError(err).Warn("Error listing disks")
+                       az.logger.WithError(err).Warn("Error getting next page of disks")
                        return
                }
                for _, d := range response.Values() {
-                       age := timestamp.Sub(d.DiskProperties.TimeCreated.ToTime())
-                       if d.DiskProperties.DiskState == "Unattached" &&
-                               re.MatchString(*d.Name) &&
-                               age.Seconds() > az.azconfig.DeleteDanglingResourcesAfter.Duration().Seconds() {
+                       if d.DiskProperties.DiskState == compute.Unattached &&
+                               d.Name != nil && re.MatchString(*d.Name) &&
+                               d.DiskProperties.TimeCreated.ToTime().Before(threshold) {
 
-                               az.logger.Printf("Disk %v is unlocked and not modified for %v seconds, will delete", *d.Name, age.Seconds())
+                               az.logger.Printf("Disk %v is unlocked and was created at %+v, will delete", *d.Name, d.DiskProperties.TimeCreated.ToTime())
                                az.deleteDisk <- d
                        }
                }
-               if response.Values() != nil {
-                       response.Next()
-               } else {
-                       break
-               }
        }
 }
 
@@ -686,6 +698,7 @@ func (az *azureInstanceSet) Stop() {
        az.stopWg.Wait()
        close(az.deleteNIC)
        close(az.deleteBlob)
+       close(az.deleteDisk)
 }
 
 type azureInstance struct {