16106: update Azure preemptible node code after real world testing. Add
[arvados.git] / lib / cloud / azure / azure.go
index d2dbde96d76f52459981248fa4d3b4614bfab4c7..100d87c337f22a8daddf2e3aeca7efc5836102d4 100644 (file)
@@ -36,21 +36,23 @@ import (
 var Driver = cloud.DriverFunc(newAzureInstanceSet)
 
 type azureInstanceSetConfig struct {
-       SubscriptionID               string
-       ClientID                     string
-       ClientSecret                 string
-       TenantID                     string
-       CloudEnvironment             string
-       ResourceGroup                string
-       ImageResourceGroup           string
-       Location                     string
-       Network                      string
-       NetworkResourceGroup         string
-       Subnet                       string
-       StorageAccount               string
-       BlobContainer                string
-       DeleteDanglingResourcesAfter arvados.Duration
-       AdminUsername                string
+       SubscriptionID                 string
+       ClientID                       string
+       ClientSecret                   string
+       TenantID                       string
+       CloudEnvironment               string
+       ResourceGroup                  string
+       ImageResourceGroup             string
+       Location                       string
+       Network                        string
+       NetworkResourceGroup           string
+       Subnet                         string
+       StorageAccount                 string
+       BlobContainer                  string
+       SharedImageGalleryName         string
+       SharedImageGalleryImageVersion string
+       DeleteDanglingResourcesAfter   arvados.Duration
+       AdminUsername                  string
 }
 
 type containerWrapper interface {
@@ -373,6 +375,13 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str
        return nil
 }
 
+func (az *azureInstanceSet) cleanupNic(nic network.Interface) {
+       _, delerr := az.netClient.delete(context.Background(), az.azconfig.ResourceGroup, *nic.Name)
+       if delerr != nil {
+               az.logger.WithError(delerr).Warnf("Error cleaning up NIC after failed create")
+       }
+}
+
 func (az *azureInstanceSet) Create(
        instanceType arvados.InstanceType,
        imageID cloud.ImageID,
@@ -410,7 +419,7 @@ func (az *azureInstanceSet) Create(
                Tags:     tags,
                InterfacePropertiesFormat: &network.InterfacePropertiesFormat{
                        IPConfigurations: &[]network.InterfaceIPConfiguration{
-                               network.InterfaceIPConfiguration{
+                               {
                                        Name: to.StringPtr("ip1"),
                                        InterfaceIPConfigurationPropertiesFormat: &network.InterfaceIPConfigurationPropertiesFormat{
                                                Subnet: &network.Subnet{
@@ -437,7 +446,11 @@ func (az *azureInstanceSet) Create(
        var storageProfile *compute.StorageProfile
 
        re := regexp.MustCompile(`^http(s?)://`)
-       if re.MatchString(string(imageID)) && az.blobcont != nil {
+       if re.MatchString(string(imageID)) {
+               if az.blobcont == nil {
+                       az.cleanupNic(nic)
+                       return nil, wrapAzureError(errors.New("Invalid configuration: can't configure unmanaged image URL without StorageAccount and BlobContainer"))
+               }
                blobname = fmt.Sprintf("%s-os.vhd", name)
                instanceVhd := fmt.Sprintf("https://%s.blob.%s/%s/%s",
                        az.azconfig.StorageAccount,
@@ -458,10 +471,17 @@ func (az *azureInstanceSet) Create(
                                },
                        },
                }
-       } else if az.blobcont == nil {
+       } else {
+               id := to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/images/" + string(imageID))
+               if az.azconfig.SharedImageGalleryName != "" && az.azconfig.SharedImageGalleryImageVersion != "" {
+                       id = to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/galleries/" + az.azconfig.SharedImageGalleryName + "/images/" + string(imageID) + "/versions/" + az.azconfig.SharedImageGalleryImageVersion)
+               } else if az.azconfig.SharedImageGalleryName != "" || az.azconfig.SharedImageGalleryImageVersion != "" {
+                       az.cleanupNic(nic)
+                       return nil, wrapAzureError(errors.New("Invalid configuration: SharedImageGalleryName and SharedImageGalleryImageVersion must both be set or both be empty"))
+               }
                storageProfile = &compute.StorageProfile{
                        ImageReference: &compute.ImageReference{
-                               ID: to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/images/" + string(imageID)),
+                               ID: id,
                        },
                        OsDisk: &compute.OSDisk{
                                OsType:       compute.Linux,
@@ -469,8 +489,6 @@ func (az *azureInstanceSet) Create(
                                CreateOption: compute.DiskCreateOptionTypesFromImage,
                        },
                }
-       } else {
-               return nil, wrapAzureError(errors.New("Invalid configuration: can't configure unmanaged image URL without StorageAccount and BlobContainer"))
        }
 
        vmParameters := compute.VirtualMachine{
@@ -483,7 +501,7 @@ func (az *azureInstanceSet) Create(
                        StorageProfile: storageProfile,
                        NetworkProfile: &compute.NetworkProfile{
                                NetworkInterfaces: &[]compute.NetworkInterfaceReference{
-                                       compute.NetworkInterfaceReference{
+                                       {
                                                ID: nic.ID,
                                                NetworkInterfaceReferenceProperties: &compute.NetworkInterfaceReferenceProperties{
                                                        Primary: to.BoolPtr(true),
@@ -510,20 +528,35 @@ func (az *azureInstanceSet) Create(
                },
        }
 
+       var maxPrice float64
+       if instanceType.Preemptible {
+               // A maxPrice of -1 means we pay the current spot price, capped at the
+               // regular (non-spot) price, so the node is never evicted for price
+               // reasons. It can still be evicted for capacity reasons, and Azure
+               // offers *no* SLA on spot instances.
+               maxPrice = -1
+               vmParameters.VirtualMachineProperties.Priority = compute.Spot
+               vmParameters.VirtualMachineProperties.EvictionPolicy = compute.Delete
+               vmParameters.VirtualMachineProperties.BillingProfile = &compute.BillingProfile{MaxPrice: &maxPrice}
+       }
+
        vm, err := az.vmClient.createOrUpdate(az.ctx, az.azconfig.ResourceGroup, name, vmParameters)
        if err != nil {
-               if az.blobcont != nil {
+               // Do some cleanup. Otherwise, an unbounded number of new unused nics and
+               // blobs can pile up during times when VMs can't be created and the
+               // dispatcher keeps retrying, because the garbage collection in manageBlobs
+               // and manageNics is only triggered periodically. This is most important
+               // for nics, because those are subject to a quota.
+               az.cleanupNic(nic)
+
+               if blobname != "" {
                        _, delerr := az.blobcont.GetBlobReference(blobname).DeleteIfExists(nil)
                        if delerr != nil {
                                az.logger.WithError(delerr).Warnf("Error cleaning up vhd blob after failed create")
                        }
                }
-               // Leave cleaning up of managed disks to the garbage collection in manageDisks()
 
-               _, delerr := az.netClient.delete(context.Background(), az.azconfig.ResourceGroup, *nic.Name)
-               if delerr != nil {
-                       az.logger.WithError(delerr).Warnf("Error cleaning up NIC after failed create")
-               }
+               // Leave cleaning up of managed disks to the garbage collection in manageDisks()
 
                return nil, wrapAzureError(err)
        }
@@ -656,6 +689,10 @@ func (az *azureInstanceSet) manageDisks() {
        }
 
        for ; response.NotDone(); err = response.Next() {
+               if err != nil {
+                       az.logger.WithError(err).Warn("Error getting next page of disks")
+                       return
+               }
                for _, d := range response.Values() {
                        if d.DiskProperties.DiskState == compute.Unattached &&
                                d.Name != nil && re.MatchString(*d.Name) &&