X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/24f3823f6e96c60025cbde15c3cb94557f3d0bec..d63d6ba645b8e34cf9d248bd0407fab0b9ef6534:/lib/cloud/azure/azure.go diff --git a/lib/cloud/azure/azure.go b/lib/cloud/azure/azure.go index b448bddd70..7b170958b6 100644 --- a/lib/cloud/azure/azure.go +++ b/lib/cloud/azure/azure.go @@ -8,6 +8,7 @@ import ( "context" "encoding/base64" "encoding/json" + "errors" "fmt" "net/http" "regexp" @@ -35,21 +36,23 @@ import ( var Driver = cloud.DriverFunc(newAzureInstanceSet) type azureInstanceSetConfig struct { - SubscriptionID string - ClientID string - ClientSecret string - TenantID string - CloudEnvironment string - ResourceGroup string - ImageResourceGroup string - Location string - Network string - NetworkResourceGroup string - Subnet string - StorageAccount string - BlobContainer string - DeleteDanglingResourcesAfter arvados.Duration - AdminUsername string + SubscriptionID string + ClientID string + ClientSecret string + TenantID string + CloudEnvironment string + ResourceGroup string + ImageResourceGroup string + Location string + Network string + NetworkResourceGroup string + Subnet string + StorageAccount string + BlobContainer string + SharedImageGalleryName string + SharedImageGalleryImageVersion string + DeleteDanglingResourcesAfter arvados.Duration + AdminUsername string } type containerWrapper interface { @@ -290,7 +293,7 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str } var client storage.Client - if az.azconfig.StorageAccount != "" { + if az.azconfig.StorageAccount != "" && az.azconfig.BlobContainer != "" { result, err := storageAcctClient.ListKeys(az.ctx, az.azconfig.ResourceGroup, az.azconfig.StorageAccount) if err != nil { az.logger.WithError(err).Warn("Couldn't get account keys") @@ -306,6 +309,8 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str blobsvc := client.GetBlobService() az.blobcont = blobsvc.GetContainerReference(az.azconfig.BlobContainer) + } else if az.azconfig.StorageAccount != "" || az.azconfig.BlobContainer != "" { + az.logger.Error("Invalid configuration: StorageAccount and BlobContainer must both be empty or both be set") } az.dispatcherID = dispatcherID @@ -336,11 +341,7 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str for i := 0; i < 4; i++ { go func() { - for { - nicname, ok := <-az.deleteNIC - if !ok { - return - } + for nicname := range az.deleteNIC { _, delerr := az.netClient.delete(context.Background(), az.azconfig.ResourceGroup, nicname) if delerr != nil { az.logger.WithError(delerr).Warnf("Error deleting %v", nicname) @@ -350,11 +351,7 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str } }() go func() { - for { - blob, ok := <-az.deleteBlob - if !ok { - return - } + for blob := range az.deleteBlob { err := blob.Delete(nil) if err != nil { az.logger.WithError(err).Warnf("Error deleting %v", blob.Name) @@ -364,11 +361,7 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str } }() go func() { - for { - disk, ok := <-az.deleteDisk - if !ok { - return - } + for disk := range az.deleteDisk { _, err := az.disksClient.delete(az.ctx, az.imageResourceGroup, *disk.Name) if err != nil { az.logger.WithError(err).Warnf("Error deleting disk %+v", *disk.Name) @@ -382,6 +375,13 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str return nil } +func (az *azureInstanceSet) cleanupNic(nic network.Interface) { + _, delerr := az.netClient.delete(context.Background(), az.azconfig.ResourceGroup, *nic.Name) + if delerr != nil { + az.logger.WithError(delerr).Warnf("Error cleaning up NIC after failed create") + } +} + func (az *azureInstanceSet) Create( instanceType arvados.InstanceType, imageID cloud.ImageID, @@ -419,7 +419,7 @@ func (az *azureInstanceSet) Create( Tags: tags, InterfacePropertiesFormat: &network.InterfacePropertiesFormat{ IPConfigurations: &[]network.InterfaceIPConfiguration{ - network.InterfaceIPConfiguration{ + { Name: to.StringPtr("ip1"), InterfaceIPConfigurationPropertiesFormat: &network.InterfaceIPConfigurationPropertiesFormat{ Subnet: &network.Subnet{ @@ -441,18 +441,23 @@ func (az *azureInstanceSet) Create( return nil, wrapAzureError(err) } - blobname := fmt.Sprintf("%s-os.vhd", name) + var blobname string customData := base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n")) var storageProfile *compute.StorageProfile re := regexp.MustCompile(`^http(s?)://`) if re.MatchString(string(imageID)) { + if az.blobcont == nil { + az.cleanupNic(nic) + return nil, wrapAzureError(errors.New("Invalid configuration: can't configure unmanaged image URL without StorageAccount and BlobContainer")) + } + blobname = fmt.Sprintf("%s-os.vhd", name) instanceVhd := fmt.Sprintf("https://%s.blob.%s/%s/%s", az.azconfig.StorageAccount, az.azureEnv.StorageEndpointSuffix, az.azconfig.BlobContainer, blobname) - az.logger.Info("using deprecated VHD image") + az.logger.Warn("using deprecated unmanaged image, see https://doc.arvados.org/ to migrate to managed disks") storageProfile = &compute.StorageProfile{ OsDisk: &compute.OSDisk{ OsType: compute.Linux, @@ -467,11 +472,16 @@ func (az *azureInstanceSet) Create( }, } } else { - az.logger.Info("using managed image") - + id := to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/images/" + string(imageID)) + if az.azconfig.SharedImageGalleryName != "" && az.azconfig.SharedImageGalleryImageVersion != "" { + id = to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/galleries/" + az.azconfig.SharedImageGalleryName + "/images/" + string(imageID) + "/versions/" + az.azconfig.SharedImageGalleryImageVersion) + } else if az.azconfig.SharedImageGalleryName != "" || az.azconfig.SharedImageGalleryImageVersion != "" { + az.cleanupNic(nic) + return nil, wrapAzureError(errors.New("Invalid configuration: SharedImageGalleryName and SharedImageGalleryImageVersion must both be set or both be empty")) + } storageProfile = &compute.StorageProfile{ ImageReference: &compute.ImageReference{ - ID: to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/images/" + string(imageID)), + ID: id, }, OsDisk: &compute.OSDisk{ OsType: compute.Linux, @@ -491,7 +501,7 @@ func (az *azureInstanceSet) Create( StorageProfile: storageProfile, NetworkProfile: &compute.NetworkProfile{ NetworkInterfaces: &[]compute.NetworkInterfaceReference{ - compute.NetworkInterfaceReference{ + { ID: nic.ID, NetworkInterfaceReferenceProperties: &compute.NetworkInterfaceReferenceProperties{ Primary: to.BoolPtr(true), @@ -518,19 +528,34 @@ func (az *azureInstanceSet) Create( }, } + if instanceType.Preemptible { + // Setting maxPrice to -1 is the equivalent of paying spot price, up to the + // normal price. This means the node will not be pre-empted for price + // reasons. It may still be pre-empted for capacity reasons though. And + // Azure offers *no* SLA on spot instances. + var maxPrice float64 = -1 + vmParameters.VirtualMachineProperties.Priority = compute.Spot + vmParameters.VirtualMachineProperties.EvictionPolicy = compute.Delete + vmParameters.VirtualMachineProperties.BillingProfile = &compute.BillingProfile{MaxPrice: &maxPrice} + } + vm, err := az.vmClient.createOrUpdate(az.ctx, az.azconfig.ResourceGroup, name, vmParameters) if err != nil { - if az.blobcont != nil { + // Do some cleanup. Otherwise, an unbounded number of new unused nics and + // blobs can pile up during times when VMs can't be created and the + // dispatcher keeps retrying, because the garbage collection in manageBlobs + // and manageNics is only triggered periodically. This is most important + // for nics, because those are subject to a quota. + az.cleanupNic(nic) + + if blobname != "" { _, delerr := az.blobcont.GetBlobReference(blobname).DeleteIfExists(nil) if delerr != nil { az.logger.WithError(delerr).Warnf("Error cleaning up vhd blob after failed create") } } - _, delerr := az.netClient.delete(context.Background(), az.azconfig.ResourceGroup, *nic.Name) - if delerr != nil { - az.logger.WithError(delerr).Warnf("Error cleaning up NIC after failed create") - } + // Leave cleaning up of managed disks to the garbage collection in manageDisks() return nil, wrapAzureError(err) } @@ -570,7 +595,7 @@ func (az *azureInstanceSet) Instances(cloud.InstanceTags) ([]cloud.Instance, err return instances, nil } -// ManageNics returns a list of Azure network interface resources. +// manageNics returns a list of Azure network interface resources. // Also performs garbage collection of NICs which have "namePrefix", // are not associated with a virtual machine and have a "created-at" // time more than DeleteDanglingResourcesAfter (to prevent racing and @@ -611,7 +636,7 @@ func (az *azureInstanceSet) manageNics() (map[string]network.Interface, error) { return interfaces, nil } -// ManageBlobs garbage collects blobs (VM disk images) in the +// manageBlobs garbage collects blobs (VM disk images) in the // configured storage account container. It will delete blobs which // have "namePrefix", are "available" (which means they are not // leased to a VM) and haven't been modified for @@ -646,38 +671,36 @@ func (az *azureInstanceSet) manageBlobs() { } } -// ManageDisks garbage collects managed compute disks (VM disk images) in the -// configured resource group. It will delete disks which -// have "namePrefix", are "available" (which means they are not -// leased to a VM) and were created more than DeleteDanglingResourcesAfter seconds ago. -// (Azure provides no modification timestamp on managed disks, there is only a -// creation timestamp) +// manageDisks garbage collects managed compute disks (VM disk images) in the +// configured resource group. It will delete disks which have "namePrefix", +// are "unattached" (which means they are not leased to a VM) and were created +// more than DeleteDanglingResourcesAfter seconds ago. (Azure provides no +// modification timestamp on managed disks, there is only a creation timestamp) func (az *azureInstanceSet) manageDisks() { - re := regexp.MustCompile(`^` + az.namePrefix + `.*-os$`) - timestamp := time.Now() + re := regexp.MustCompile(`^` + regexp.QuoteMeta(az.namePrefix) + `.*-os$`) + threshold := time.Now().Add(-az.azconfig.DeleteDanglingResourcesAfter.Duration()) - for { - response, err := az.disksClient.listByResourceGroup(az.ctx, az.imageResourceGroup) + response, err := az.disksClient.listByResourceGroup(az.ctx, az.imageResourceGroup) + if err != nil { + az.logger.WithError(err).Warn("Error listing disks") + return + } + + for ; response.NotDone(); err = response.Next() { if err != nil { - az.logger.WithError(err).Warn("Error listing disks") + az.logger.WithError(err).Warn("Error getting next page of disks") return } for _, d := range response.Values() { - age := timestamp.Sub(d.DiskProperties.TimeCreated.ToTime()) - if d.DiskProperties.DiskState == "Unattached" && - re.MatchString(*d.Name) && - age.Seconds() > az.azconfig.DeleteDanglingResourcesAfter.Duration().Seconds() { + if d.DiskProperties.DiskState == compute.Unattached && + d.Name != nil && re.MatchString(*d.Name) && + d.DiskProperties.TimeCreated.ToTime().Before(threshold) { - az.logger.Printf("Disk %v is unlocked and not modified for %v seconds, will delete", *d.Name, age.Seconds()) + az.logger.Printf("Disk %v is unlocked and was created at %+v, will delete", *d.Name, d.DiskProperties.TimeCreated.ToTime()) az.deleteDisk <- d } } - if response.Values() != nil { - response.Next() - } else { - break - } } } @@ -686,6 +709,7 @@ func (az *azureInstanceSet) Stop() { az.stopWg.Wait() close(az.deleteNIC) close(az.deleteBlob) + close(az.deleteDisk) } type azureInstance struct { @@ -761,6 +785,10 @@ func (ai *azureInstance) Address() string { } } +func (ai *azureInstance) PriceHistory(arvados.InstanceType) []cloud.InstancePrice { + return nil +} + func (ai *azureInstance) RemoteUser() string { return ai.provider.azconfig.AdminUsername }