X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/15a809ab5d5cd892b7f3097f82ba271890721cd1..acf4fc32585cef9032b400abaa54582aed0ac76b:/lib/cloud/azure/azure.go diff --git a/lib/cloud/azure/azure.go b/lib/cloud/azure/azure.go index 2a63d28f65..100d87c337 100644 --- a/lib/cloud/azure/azure.go +++ b/lib/cloud/azure/azure.go @@ -7,6 +7,8 @@ package azure import ( "context" "encoding/base64" + "encoding/json" + "errors" "fmt" "net/http" "regexp" @@ -15,9 +17,9 @@ import ( "sync" "time" - "git.curoverse.com/arvados.git/lib/cloud" - "git.curoverse.com/arvados.git/sdk/go/arvados" - "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-06-01/compute" + "git.arvados.org/arvados.git/lib/cloud" + "git.arvados.org/arvados.git/sdk/go/arvados" + "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute" "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2018-06-01/network" storageacct "github.com/Azure/azure-sdk-for-go/services/storage/mgmt/2018-02-01/storage" "github.com/Azure/azure-sdk-for-go/storage" @@ -26,128 +28,159 @@ import ( "github.com/Azure/go-autorest/autorest/azure/auth" "github.com/Azure/go-autorest/autorest/to" "github.com/jmcvetta/randutil" - "github.com/mitchellh/mapstructure" "github.com/sirupsen/logrus" "golang.org/x/crypto/ssh" ) -type AzureInstanceSetConfig struct { - SubscriptionID string "SubscriptionID" - ClientID string "ClientID" - ClientSecret string "ClientSecret" - TenantID string "TenantID" - CloudEnvironment string "CloudEnvironment" - ResourceGroup string "ResourceGroup" - Location string "Location" - Network string "Network" - Subnet string "Subnet" - StorageAccount string "StorageAccount" - BlobContainer string "BlobContainer" - DeleteDanglingResourcesAfter float64 "DeleteDanglingResourcesAfter" -} - -type VirtualMachinesClientWrapper interface { - CreateOrUpdate(ctx context.Context, +// Driver is the azure implementation of the cloud.Driver interface. +var Driver = cloud.DriverFunc(newAzureInstanceSet) + +type azureInstanceSetConfig struct { + SubscriptionID string + ClientID string + ClientSecret string + TenantID string + CloudEnvironment string + ResourceGroup string + ImageResourceGroup string + Location string + Network string + NetworkResourceGroup string + Subnet string + StorageAccount string + BlobContainer string + SharedImageGalleryName string + SharedImageGalleryImageVersion string + DeleteDanglingResourcesAfter arvados.Duration + AdminUsername string +} + +type containerWrapper interface { + GetBlobReference(name string) *storage.Blob + ListBlobs(params storage.ListBlobsParameters) (storage.BlobListResponse, error) +} + +type virtualMachinesClientWrapper interface { + createOrUpdate(ctx context.Context, resourceGroupName string, VMName string, parameters compute.VirtualMachine) (result compute.VirtualMachine, err error) - Delete(ctx context.Context, resourceGroupName string, VMName string) (result *http.Response, err error) - ListComplete(ctx context.Context, resourceGroupName string) (result compute.VirtualMachineListResultIterator, err error) + delete(ctx context.Context, resourceGroupName string, VMName string) (result *http.Response, err error) + listComplete(ctx context.Context, resourceGroupName string) (result compute.VirtualMachineListResultIterator, err error) } -type VirtualMachinesClientImpl struct { +type virtualMachinesClientImpl struct { inner compute.VirtualMachinesClient } -func (cl *VirtualMachinesClientImpl) CreateOrUpdate(ctx context.Context, +func (cl *virtualMachinesClientImpl) createOrUpdate(ctx context.Context, resourceGroupName string, VMName string, parameters compute.VirtualMachine) (result compute.VirtualMachine, err error) { future, err := cl.inner.CreateOrUpdate(ctx, resourceGroupName, VMName, parameters) if err != nil { - return compute.VirtualMachine{}, WrapAzureError(err) + return compute.VirtualMachine{}, wrapAzureError(err) } future.WaitForCompletionRef(ctx, cl.inner.Client) r, err := future.Result(cl.inner) - return r, WrapAzureError(err) + return r, wrapAzureError(err) } -func (cl *VirtualMachinesClientImpl) Delete(ctx context.Context, resourceGroupName string, VMName string) (result *http.Response, err error) { +func (cl *virtualMachinesClientImpl) delete(ctx context.Context, resourceGroupName string, VMName string) (result *http.Response, err error) { future, err := cl.inner.Delete(ctx, resourceGroupName, VMName) if err != nil { - return nil, WrapAzureError(err) + return nil, wrapAzureError(err) } err = future.WaitForCompletionRef(ctx, cl.inner.Client) - return future.Response(), WrapAzureError(err) + return future.Response(), wrapAzureError(err) } -func (cl *VirtualMachinesClientImpl) ListComplete(ctx context.Context, resourceGroupName string) (result compute.VirtualMachineListResultIterator, err error) { +func (cl *virtualMachinesClientImpl) listComplete(ctx context.Context, resourceGroupName string) (result compute.VirtualMachineListResultIterator, err error) { r, err := cl.inner.ListComplete(ctx, resourceGroupName) - return r, WrapAzureError(err) + return r, wrapAzureError(err) } -type InterfacesClientWrapper interface { - CreateOrUpdate(ctx context.Context, +type interfacesClientWrapper interface { + createOrUpdate(ctx context.Context, resourceGroupName string, networkInterfaceName string, parameters network.Interface) (result network.Interface, err error) - Delete(ctx context.Context, resourceGroupName string, networkInterfaceName string) (result *http.Response, err error) - ListComplete(ctx context.Context, resourceGroupName string) (result network.InterfaceListResultIterator, err error) + delete(ctx context.Context, resourceGroupName string, networkInterfaceName string) (result *http.Response, err error) + listComplete(ctx context.Context, resourceGroupName string) (result network.InterfaceListResultIterator, err error) } -type InterfacesClientImpl struct { +type interfacesClientImpl struct { inner network.InterfacesClient } -func (cl *InterfacesClientImpl) Delete(ctx context.Context, resourceGroupName string, VMName string) (result *http.Response, err error) { +func (cl *interfacesClientImpl) delete(ctx context.Context, resourceGroupName string, VMName string) (result *http.Response, err error) { future, err := cl.inner.Delete(ctx, resourceGroupName, VMName) if err != nil { - return nil, WrapAzureError(err) + return nil, wrapAzureError(err) } err = future.WaitForCompletionRef(ctx, cl.inner.Client) - return future.Response(), WrapAzureError(err) + return future.Response(), wrapAzureError(err) } -func (cl *InterfacesClientImpl) CreateOrUpdate(ctx context.Context, +func (cl *interfacesClientImpl) createOrUpdate(ctx context.Context, resourceGroupName string, networkInterfaceName string, parameters network.Interface) (result network.Interface, err error) { future, err := cl.inner.CreateOrUpdate(ctx, resourceGroupName, networkInterfaceName, parameters) if err != nil { - return network.Interface{}, WrapAzureError(err) + return network.Interface{}, wrapAzureError(err) } future.WaitForCompletionRef(ctx, cl.inner.Client) r, err := future.Result(cl.inner) - return r, WrapAzureError(err) + return r, wrapAzureError(err) } -func (cl *InterfacesClientImpl) ListComplete(ctx context.Context, resourceGroupName string) (result network.InterfaceListResultIterator, err error) { +func (cl *interfacesClientImpl) listComplete(ctx context.Context, resourceGroupName string) (result network.InterfaceListResultIterator, err error) { r, err := cl.inner.ListComplete(ctx, resourceGroupName) - return r, WrapAzureError(err) + return r, wrapAzureError(err) +} + +type disksClientWrapper interface { + listByResourceGroup(ctx context.Context, resourceGroupName string) (result compute.DiskListPage, err error) + delete(ctx context.Context, resourceGroupName string, diskName string) (result compute.DisksDeleteFuture, err error) +} + +type disksClientImpl struct { + inner compute.DisksClient +} + +func (cl *disksClientImpl) listByResourceGroup(ctx context.Context, resourceGroupName string) (result compute.DiskListPage, err error) { + r, err := cl.inner.ListByResourceGroup(ctx, resourceGroupName) + return r, wrapAzureError(err) +} + +func (cl *disksClientImpl) delete(ctx context.Context, resourceGroupName string, diskName string) (result compute.DisksDeleteFuture, err error) { + r, err := cl.inner.Delete(ctx, resourceGroupName, diskName) + return r, wrapAzureError(err) } var quotaRe = regexp.MustCompile(`(?i:exceed|quota|limit)`) -type AzureRateLimitError struct { +type azureRateLimitError struct { azure.RequestError - earliestRetry time.Time + firstRetry time.Time } -func (ar *AzureRateLimitError) EarliestRetry() time.Time { - return ar.earliestRetry +func (ar *azureRateLimitError) EarliestRetry() time.Time { + return ar.firstRetry } -type AzureQuotaError struct { +type azureQuotaError struct { azure.RequestError } -func (ar *AzureQuotaError) IsQuotaError() bool { +func (ar *azureQuotaError) IsQuotaError() bool { return true } -func WrapAzureError(err error) error { +func wrapAzureError(err error) error { de, ok := err.(autorest.DetailedError) if !ok { return err @@ -174,51 +207,59 @@ func WrapAzureError(err error) error { earliestRetry = time.Now().Add(20 * time.Second) } } - return &AzureRateLimitError{*rq, earliestRetry} + return &azureRateLimitError{*rq, earliestRetry} } if rq.ServiceError == nil { return err } if quotaRe.FindString(rq.ServiceError.Code) != "" || quotaRe.FindString(rq.ServiceError.Message) != "" { - return &AzureQuotaError{*rq} + return &azureQuotaError{*rq} } return err } -type AzureInstanceSet struct { - azconfig AzureInstanceSetConfig - vmClient VirtualMachinesClientWrapper - netClient InterfacesClientWrapper - storageAcctClient storageacct.AccountsClient - azureEnv azure.Environment - interfaces map[string]network.Interface - dispatcherID string - namePrefix string - ctx context.Context - stopFunc context.CancelFunc - stopWg sync.WaitGroup - deleteNIC chan string - deleteBlob chan storage.Blob - logger logrus.FieldLogger -} - -func NewAzureInstanceSet(config map[string]interface{}, dispatcherID cloud.InstanceSetID, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) { - azcfg := AzureInstanceSetConfig{} - if err = mapstructure.Decode(config, &azcfg); err != nil { +type azureInstanceSet struct { + azconfig azureInstanceSetConfig + vmClient virtualMachinesClientWrapper + netClient interfacesClientWrapper + disksClient disksClientWrapper + imageResourceGroup string + blobcont containerWrapper + azureEnv azure.Environment + interfaces map[string]network.Interface + dispatcherID string + namePrefix string + ctx context.Context + stopFunc context.CancelFunc + stopWg sync.WaitGroup + deleteNIC chan string + deleteBlob chan storage.Blob + deleteDisk chan compute.Disk + logger logrus.FieldLogger +} + +func newAzureInstanceSet(config json.RawMessage, dispatcherID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) { + azcfg := azureInstanceSetConfig{} + err = json.Unmarshal(config, &azcfg) + if err != nil { return nil, err } - ap := AzureInstanceSet{logger: logger} - err = ap.setup(azcfg, string(dispatcherID)) + + az := azureInstanceSet{logger: logger} + az.ctx, az.stopFunc = context.WithCancel(context.Background()) + err = az.setup(azcfg, string(dispatcherID)) if err != nil { + az.stopFunc() return nil, err } - return &ap, nil + return &az, nil } -func (az *AzureInstanceSet) setup(azcfg AzureInstanceSetConfig, dispatcherID string) (err error) { +func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID string) (err error) { az.azconfig = azcfg vmClient := compute.NewVirtualMachinesClient(az.azconfig.SubscriptionID) netClient := network.NewInterfacesClient(az.azconfig.SubscriptionID) + disksClient := compute.NewDisksClient(az.azconfig.SubscriptionID) storageAcctClient := storageacct.NewAccountsClient(az.azconfig.SubscriptionID) az.azureEnv, err = azure.EnvironmentFromName(az.azconfig.CloudEnvironment) @@ -239,16 +280,42 @@ func (az *AzureInstanceSet) setup(azcfg AzureInstanceSetConfig, dispatcherID str vmClient.Authorizer = authorizer netClient.Authorizer = authorizer + disksClient.Authorizer = authorizer storageAcctClient.Authorizer = authorizer - az.vmClient = &VirtualMachinesClientImpl{vmClient} - az.netClient = &InterfacesClientImpl{netClient} - az.storageAcctClient = storageAcctClient + az.vmClient = &virtualMachinesClientImpl{vmClient} + az.netClient = &interfacesClientImpl{netClient} + az.disksClient = &disksClientImpl{disksClient} + + az.imageResourceGroup = az.azconfig.ImageResourceGroup + if az.imageResourceGroup == "" { + az.imageResourceGroup = az.azconfig.ResourceGroup + } + + var client storage.Client + if az.azconfig.StorageAccount != "" && az.azconfig.BlobContainer != "" { + result, err := storageAcctClient.ListKeys(az.ctx, az.azconfig.ResourceGroup, az.azconfig.StorageAccount) + if err != nil { + az.logger.WithError(err).Warn("Couldn't get account keys") + return err + } + + key1 := *(*result.Keys)[0].Value + client, err = storage.NewBasicClientOnSovereignCloud(az.azconfig.StorageAccount, key1, az.azureEnv) + if err != nil { + az.logger.WithError(err).Warn("Couldn't make client") + return err + } + + blobsvc := client.GetBlobService() + az.blobcont = blobsvc.GetContainerReference(az.azconfig.BlobContainer) + } else if az.azconfig.StorageAccount != "" || az.azconfig.BlobContainer != "" { + az.logger.Error("Invalid configuration: StorageAccount and BlobContainer must both be empty or both be set") + } az.dispatcherID = dispatcherID az.namePrefix = fmt.Sprintf("compute-%s-", az.dispatcherID) - az.ctx, az.stopFunc = context.WithCancel(context.Background()) go func() { az.stopWg.Add(1) defer az.stopWg.Done() @@ -260,22 +327,22 @@ func (az *AzureInstanceSet) setup(azcfg AzureInstanceSetConfig, dispatcherID str tk.Stop() return case <-tk.C: - az.ManageBlobs() + if az.blobcont != nil { + az.manageBlobs() + } + az.manageDisks() } } }() az.deleteNIC = make(chan string) az.deleteBlob = make(chan storage.Blob) + az.deleteDisk = make(chan compute.Disk) for i := 0; i < 4; i++ { go func() { - for { - nicname, ok := <-az.deleteNIC - if !ok { - return - } - _, delerr := az.netClient.Delete(context.Background(), az.azconfig.ResourceGroup, nicname) + for nicname := range az.deleteNIC { + _, delerr := az.netClient.delete(context.Background(), az.azconfig.ResourceGroup, nicname) if delerr != nil { az.logger.WithError(delerr).Warnf("Error deleting %v", nicname) } else { @@ -284,11 +351,7 @@ func (az *AzureInstanceSet) setup(azcfg AzureInstanceSetConfig, dispatcherID str } }() go func() { - for { - blob, ok := <-az.deleteBlob - if !ok { - return - } + for blob := range az.deleteBlob { err := blob.Delete(nil) if err != nil { az.logger.WithError(err).Warnf("Error deleting %v", blob.Name) @@ -297,22 +360,40 @@ func (az *AzureInstanceSet) setup(azcfg AzureInstanceSetConfig, dispatcherID str } } }() + go func() { + for disk := range az.deleteDisk { + _, err := az.disksClient.delete(az.ctx, az.imageResourceGroup, *disk.Name) + if err != nil { + az.logger.WithError(err).Warnf("Error deleting disk %+v", *disk.Name) + } else { + az.logger.Printf("Deleted disk %v", *disk.Name) + } + } + }() } return nil } -func (az *AzureInstanceSet) Create( +func (az *azureInstanceSet) cleanupNic(nic network.Interface) { + _, delerr := az.netClient.delete(context.Background(), az.azconfig.ResourceGroup, *nic.Name) + if delerr != nil { + az.logger.WithError(delerr).Warnf("Error cleaning up NIC after failed create") + } +} + +func (az *azureInstanceSet) Create( instanceType arvados.InstanceType, imageID cloud.ImageID, newTags cloud.InstanceTags, + initCommand cloud.InitCommand, publicKey ssh.PublicKey) (cloud.Instance, error) { az.stopWg.Add(1) defer az.stopWg.Done() - if len(newTags["node-token"]) == 0 { - return nil, fmt.Errorf("Must provide tag 'node-token'") + if instanceType.AddedScratch > 0 { + return nil, fmt.Errorf("cannot create instance type %q: driver does not implement non-zero AddedScratch (%d)", instanceType.Name, instanceType.AddedScratch) } name, err := randutil.String(15, "abcdefghijklmnopqrstuvwxyz0123456789") @@ -322,30 +403,30 @@ func (az *AzureInstanceSet) Create( name = az.namePrefix + name - timestamp := time.Now().Format(time.RFC3339Nano) - - tags := make(map[string]*string) - tags["created-at"] = ×tamp + tags := map[string]*string{} for k, v := range newTags { - newstr := v - tags["dispatch-"+k] = &newstr + tags[k] = to.StringPtr(v) } + tags["created-at"] = to.StringPtr(time.Now().Format(time.RFC3339Nano)) - tags["dispatch-instance-type"] = &instanceType.Name + networkResourceGroup := az.azconfig.NetworkResourceGroup + if networkResourceGroup == "" { + networkResourceGroup = az.azconfig.ResourceGroup + } nicParameters := network.Interface{ Location: &az.azconfig.Location, Tags: tags, InterfacePropertiesFormat: &network.InterfacePropertiesFormat{ IPConfigurations: &[]network.InterfaceIPConfiguration{ - network.InterfaceIPConfiguration{ + { Name: to.StringPtr("ip1"), InterfaceIPConfigurationPropertiesFormat: &network.InterfaceIPConfigurationPropertiesFormat{ Subnet: &network.Subnet{ ID: to.StringPtr(fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers"+ "/Microsoft.Network/virtualnetworks/%s/subnets/%s", az.azconfig.SubscriptionID, - az.azconfig.ResourceGroup, + networkResourceGroup, az.azconfig.Network, az.azconfig.Subnet)), }, @@ -355,19 +436,60 @@ func (az *AzureInstanceSet) Create( }, }, } - nic, err := az.netClient.CreateOrUpdate(az.ctx, az.azconfig.ResourceGroup, name+"-nic", nicParameters) + nic, err := az.netClient.createOrUpdate(az.ctx, az.azconfig.ResourceGroup, name+"-nic", nicParameters) if err != nil { - return nil, WrapAzureError(err) + return nil, wrapAzureError(err) } - instanceVhd := fmt.Sprintf("https://%s.blob.%s/%s/%s-os.vhd", - az.azconfig.StorageAccount, - az.azureEnv.StorageEndpointSuffix, - az.azconfig.BlobContainer, - name) + var blobname string + customData := base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n")) + var storageProfile *compute.StorageProfile - customData := base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf(`#!/bin/sh -echo '%s-%s' > /home/crunch/node-token`, name, newTags["node-token"]))) + re := regexp.MustCompile(`^http(s?)://`) + if re.MatchString(string(imageID)) { + if az.blobcont == nil { + az.cleanupNic(nic) + return nil, wrapAzureError(errors.New("Invalid configuration: can't configure unmanaged image URL without StorageAccount and BlobContainer")) + } + blobname = fmt.Sprintf("%s-os.vhd", name) + instanceVhd := fmt.Sprintf("https://%s.blob.%s/%s/%s", + az.azconfig.StorageAccount, + az.azureEnv.StorageEndpointSuffix, + az.azconfig.BlobContainer, + blobname) + az.logger.Warn("using deprecated unmanaged image, see https://doc.arvados.org/ to migrate to managed disks") + storageProfile = &compute.StorageProfile{ + OsDisk: &compute.OSDisk{ + OsType: compute.Linux, + Name: to.StringPtr(name + "-os"), + CreateOption: compute.DiskCreateOptionTypesFromImage, + Image: &compute.VirtualHardDisk{ + URI: to.StringPtr(string(imageID)), + }, + Vhd: &compute.VirtualHardDisk{ + URI: &instanceVhd, + }, + }, + } + } else { + id := to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/images/" + string(imageID)) + if az.azconfig.SharedImageGalleryName != "" && az.azconfig.SharedImageGalleryImageVersion != "" { + id = to.StringPtr("/subscriptions/" + az.azconfig.SubscriptionID + "/resourceGroups/" + az.imageResourceGroup + "/providers/Microsoft.Compute/galleries/" + az.azconfig.SharedImageGalleryName + "/images/" + string(imageID) + "/versions/" + az.azconfig.SharedImageGalleryImageVersion) + } else if az.azconfig.SharedImageGalleryName != "" || az.azconfig.SharedImageGalleryImageVersion != "" { + az.cleanupNic(nic) + return nil, wrapAzureError(errors.New("Invalid configuration: SharedImageGalleryName and SharedImageGalleryImageVersion must both be set or both be empty")) + } + storageProfile = &compute.StorageProfile{ + ImageReference: &compute.ImageReference{ + ID: id, + }, + OsDisk: &compute.OSDisk{ + OsType: compute.Linux, + Name: to.StringPtr(name + "-os"), + CreateOption: compute.DiskCreateOptionTypesFromImage, + }, + } + } vmParameters := compute.VirtualMachine{ Location: &az.azconfig.Location, @@ -376,22 +498,10 @@ echo '%s-%s' > /home/crunch/node-token`, name, newTags["node-token"]))) HardwareProfile: &compute.HardwareProfile{ VMSize: compute.VirtualMachineSizeTypes(instanceType.ProviderType), }, - StorageProfile: &compute.StorageProfile{ - OsDisk: &compute.OSDisk{ - OsType: compute.Linux, - Name: to.StringPtr(name + "-os"), - CreateOption: compute.FromImage, - Image: &compute.VirtualHardDisk{ - URI: to.StringPtr(string(imageID)), - }, - Vhd: &compute.VirtualHardDisk{ - URI: &instanceVhd, - }, - }, - }, + StorageProfile: storageProfile, NetworkProfile: &compute.NetworkProfile{ NetworkInterfaces: &[]compute.NetworkInterfaceReference{ - compute.NetworkInterfaceReference{ + { ID: nic.ID, NetworkInterfaceReferenceProperties: &compute.NetworkInterfaceReferenceProperties{ Primary: to.BoolPtr(true), @@ -401,13 +511,13 @@ echo '%s-%s' > /home/crunch/node-token`, name, newTags["node-token"]))) }, OsProfile: &compute.OSProfile{ ComputerName: &name, - AdminUsername: to.StringPtr("crunch"), + AdminUsername: to.StringPtr(az.azconfig.AdminUsername), LinuxConfiguration: &compute.LinuxConfiguration{ DisablePasswordAuthentication: to.BoolPtr(true), SSH: &compute.SSHConfiguration{ PublicKeys: &[]compute.SSHPublicKey{ - compute.SSHPublicKey{ - Path: to.StringPtr("/home/crunch/.ssh/authorized_keys"), + { + Path: to.StringPtr("/home/" + az.azconfig.AdminUsername + "/.ssh/authorized_keys"), KeyData: to.StringPtr(string(ssh.MarshalAuthorizedKey(publicKey))), }, }, @@ -418,60 +528,86 @@ echo '%s-%s' > /home/crunch/node-token`, name, newTags["node-token"]))) }, } - vm, err := az.vmClient.CreateOrUpdate(az.ctx, az.azconfig.ResourceGroup, name, vmParameters) + var maxPrice float64 + if instanceType.Preemptible { + // Setting maxPrice to -1 is the equivalent of paying spot price, up to the + // normal price. This means the node will not be pre-empted for price + // reasons. It may still be pre-empted for capacity reasons though. And + // Azure offers *no* SLA on spot instances. + maxPrice = -1 + vmParameters.VirtualMachineProperties.Priority = compute.Spot + vmParameters.VirtualMachineProperties.EvictionPolicy = compute.Delete + vmParameters.VirtualMachineProperties.BillingProfile = &compute.BillingProfile{MaxPrice: &maxPrice} + } + + vm, err := az.vmClient.createOrUpdate(az.ctx, az.azconfig.ResourceGroup, name, vmParameters) if err != nil { - return nil, WrapAzureError(err) + // Do some cleanup. Otherwise, an unbounded number of new unused nics and + // blobs can pile up during times when VMs can't be created and the + // dispatcher keeps retrying, because the garbage collection in manageBlobs + // and manageNics is only triggered periodically. This is most important + // for nics, because those are subject to a quota. + az.cleanupNic(nic) + + if blobname != "" { + _, delerr := az.blobcont.GetBlobReference(blobname).DeleteIfExists(nil) + if delerr != nil { + az.logger.WithError(delerr).Warnf("Error cleaning up vhd blob after failed create") + } + } + + // Leave cleaning up of managed disks to the garbage collection in manageDisks() + + return nil, wrapAzureError(err) } - return &AzureInstance{ + return &azureInstance{ provider: az, nic: nic, vm: vm, }, nil } -func (az *AzureInstanceSet) Instances(cloud.InstanceTags) ([]cloud.Instance, error) { +func (az *azureInstanceSet) Instances(cloud.InstanceTags) ([]cloud.Instance, error) { az.stopWg.Add(1) defer az.stopWg.Done() - interfaces, err := az.ManageNics() + interfaces, err := az.manageNics() if err != nil { return nil, err } - result, err := az.vmClient.ListComplete(az.ctx, az.azconfig.ResourceGroup) + result, err := az.vmClient.listComplete(az.ctx, az.azconfig.ResourceGroup) if err != nil { - return nil, WrapAzureError(err) + return nil, wrapAzureError(err) } - instances := make([]cloud.Instance, 0) - + var instances []cloud.Instance for ; result.NotDone(); err = result.Next() { if err != nil { - return nil, WrapAzureError(err) - } - if strings.HasPrefix(*result.Value().Name, az.namePrefix) { - instances = append(instances, &AzureInstance{ - provider: az, - vm: result.Value(), - nic: interfaces[*(*result.Value().NetworkProfile.NetworkInterfaces)[0].ID]}) + return nil, wrapAzureError(err) } + instances = append(instances, &azureInstance{ + provider: az, + vm: result.Value(), + nic: interfaces[*(*result.Value().NetworkProfile.NetworkInterfaces)[0].ID], + }) } return instances, nil } -// ManageNics returns a list of Azure network interface resources. -// Also performs garbage collection of NICs which have "namePrefix", are -// not associated with a virtual machine and have a "create-at" time -// more than DeleteDanglingResourcesAfter (to prevent racing and +// manageNics returns a list of Azure network interface resources. +// Also performs garbage collection of NICs which have "namePrefix", +// are not associated with a virtual machine and have a "created-at" +// time more than DeleteDanglingResourcesAfter (to prevent racing and // deleting newly created NICs) in the past are deleted. -func (az *AzureInstanceSet) ManageNics() (map[string]network.Interface, error) { +func (az *azureInstanceSet) manageNics() (map[string]network.Interface, error) { az.stopWg.Add(1) defer az.stopWg.Done() - result, err := az.netClient.ListComplete(az.ctx, az.azconfig.ResourceGroup) + result, err := az.netClient.listComplete(az.ctx, az.azconfig.ResourceGroup) if err != nil { - return nil, WrapAzureError(err) + return nil, wrapAzureError(err) } interfaces := make(map[string]network.Interface) @@ -489,8 +625,8 @@ func (az *AzureInstanceSet) ManageNics() (map[string]network.Interface, error) { if result.Value().Tags["created-at"] != nil { createdAt, err := time.Parse(time.RFC3339Nano, *result.Value().Tags["created-at"]) if err == nil { - if timestamp.Sub(createdAt).Seconds() > az.azconfig.DeleteDanglingResourcesAfter { - az.logger.Printf("Will delete %v because it is older than %v s", *result.Value().Name, az.azconfig.DeleteDanglingResourcesAfter) + if timestamp.Sub(createdAt) > az.azconfig.DeleteDanglingResourcesAfter.Duration() { + az.logger.Printf("Will delete %v because it is older than %s", *result.Value().Name, az.azconfig.DeleteDanglingResourcesAfter) az.deleteNIC <- *result.Value().Name } } @@ -501,33 +637,18 @@ func (az *AzureInstanceSet) ManageNics() (map[string]network.Interface, error) { return interfaces, nil } -// ManageBlobs garbage collects blobs (VM disk images) in the +// manageBlobs garbage collects blobs (VM disk images) in the // configured storage account container. It will delete blobs which // have "namePrefix", are "available" (which means they are not // leased to a VM) and haven't been modified for // DeleteDanglingResourcesAfter seconds. -func (az *AzureInstanceSet) ManageBlobs() { - result, err := az.storageAcctClient.ListKeys(az.ctx, az.azconfig.ResourceGroup, az.azconfig.StorageAccount) - if err != nil { - az.logger.WithError(err).Warn("Couldn't get account keys") - return - } - - key1 := *(*result.Keys)[0].Value - client, err := storage.NewBasicClientOnSovereignCloud(az.azconfig.StorageAccount, key1, az.azureEnv) - if err != nil { - az.logger.WithError(err).Warn("Couldn't make client") - return - } - - blobsvc := client.GetBlobService() - blobcont := blobsvc.GetContainerReference(az.azconfig.BlobContainer) +func (az *azureInstanceSet) manageBlobs() { page := storage.ListBlobsParameters{Prefix: az.namePrefix} timestamp := time.Now() for { - response, err := blobcont.ListBlobs(page) + response, err := az.blobcont.ListBlobs(page) if err != nil { az.logger.WithError(err).Warn("Error listing blobs") return @@ -537,7 +658,7 @@ func (az *AzureInstanceSet) ManageBlobs() { if b.Properties.BlobType == storage.BlobTypePage && b.Properties.LeaseState == "available" && b.Properties.LeaseStatus == "unlocked" && - age.Seconds() > az.azconfig.DeleteDanglingResourcesAfter { + age.Seconds() > az.azconfig.DeleteDanglingResourcesAfter.Duration().Seconds() { az.logger.Printf("Blob %v is unlocked and not modified for %v seconds, will delete", b.Name, age.Seconds()) az.deleteBlob <- b @@ -551,142 +672,124 @@ func (az *AzureInstanceSet) ManageBlobs() { } } -func (az *AzureInstanceSet) Stop() { +// manageDisks garbage collects managed compute disks (VM disk images) in the +// configured resource group. It will delete disks which have "namePrefix", +// are "unattached" (which means they are not leased to a VM) and were created +// more than DeleteDanglingResourcesAfter seconds ago. (Azure provides no +// modification timestamp on managed disks, there is only a creation timestamp) +func (az *azureInstanceSet) manageDisks() { + + re := regexp.MustCompile(`^` + regexp.QuoteMeta(az.namePrefix) + `.*-os$`) + threshold := time.Now().Add(-az.azconfig.DeleteDanglingResourcesAfter.Duration()) + + response, err := az.disksClient.listByResourceGroup(az.ctx, az.imageResourceGroup) + if err != nil { + az.logger.WithError(err).Warn("Error listing disks") + return + } + + for ; response.NotDone(); err = response.Next() { + if err != nil { + az.logger.WithError(err).Warn("Error getting next page of disks") + return + } + for _, d := range response.Values() { + if d.DiskProperties.DiskState == compute.Unattached && + d.Name != nil && re.MatchString(*d.Name) && + d.DiskProperties.TimeCreated.ToTime().Before(threshold) { + + az.logger.Printf("Disk %v is unlocked and was created at %+v, will delete", *d.Name, d.DiskProperties.TimeCreated.ToTime()) + az.deleteDisk <- d + } + } + } +} + +func (az *azureInstanceSet) Stop() { az.stopFunc() az.stopWg.Wait() close(az.deleteNIC) close(az.deleteBlob) + close(az.deleteDisk) } -type AzureInstance struct { - provider *AzureInstanceSet +type azureInstance struct { + provider *azureInstanceSet nic network.Interface vm compute.VirtualMachine } -func (ai *AzureInstance) ID() cloud.InstanceID { +func (ai *azureInstance) ID() cloud.InstanceID { return cloud.InstanceID(*ai.vm.ID) } -func (ai *AzureInstance) String() string { +func (ai *azureInstance) String() string { return *ai.vm.Name } -func (ai *AzureInstance) ProviderType() string { +func (ai *azureInstance) ProviderType() string { return string(ai.vm.VirtualMachineProperties.HardwareProfile.VMSize) } -func (ai *AzureInstance) SetTags(newTags cloud.InstanceTags) error { +func (ai *azureInstance) SetTags(newTags cloud.InstanceTags) error { ai.provider.stopWg.Add(1) defer ai.provider.stopWg.Done() - tags := make(map[string]*string) - + tags := map[string]*string{} for k, v := range ai.vm.Tags { - if !strings.HasPrefix(k, "dispatch-") { - tags[k] = v - } + tags[k] = v } for k, v := range newTags { - newstr := v - tags["dispatch-"+k] = &newstr + tags[k] = to.StringPtr(v) } vmParameters := compute.VirtualMachine{ Location: &ai.provider.azconfig.Location, Tags: tags, } - vm, err := ai.provider.vmClient.CreateOrUpdate(ai.provider.ctx, ai.provider.azconfig.ResourceGroup, *ai.vm.Name, vmParameters) + vm, err := ai.provider.vmClient.createOrUpdate(ai.provider.ctx, ai.provider.azconfig.ResourceGroup, *ai.vm.Name, vmParameters) if err != nil { - return WrapAzureError(err) + return wrapAzureError(err) } ai.vm = vm return nil } -func (ai *AzureInstance) Tags() cloud.InstanceTags { - tags := make(map[string]string) - +func (ai *azureInstance) Tags() cloud.InstanceTags { + tags := cloud.InstanceTags{} for k, v := range ai.vm.Tags { - if strings.HasPrefix(k, "dispatch-") { - tags[k[9:]] = *v - } + tags[k] = *v } - return tags } -func (ai *AzureInstance) Destroy() error { +func (ai *azureInstance) Destroy() error { ai.provider.stopWg.Add(1) defer ai.provider.stopWg.Done() - _, err := ai.provider.vmClient.Delete(ai.provider.ctx, ai.provider.azconfig.ResourceGroup, *ai.vm.Name) - return WrapAzureError(err) -} - -func (ai *AzureInstance) Address() string { - return *(*ai.nic.IPConfigurations)[0].PrivateIPAddress + _, err := ai.provider.vmClient.delete(ai.provider.ctx, ai.provider.azconfig.ResourceGroup, *ai.vm.Name) + return wrapAzureError(err) } -func (ai *AzureInstance) VerifyHostKey(receivedKey ssh.PublicKey, client *ssh.Client) error { - ai.provider.stopWg.Add(1) - defer ai.provider.stopWg.Done() - - remoteFingerprint := ssh.FingerprintSHA256(receivedKey) - - tags := ai.Tags() - - tg := tags["ssh-pubkey-fingerprint"] - if tg != "" { - if remoteFingerprint == tg { - return nil - } else { - return fmt.Errorf("Key fingerprint did not match, expected %q got %q", tg, remoteFingerprint) - } - } - - nodetokenTag := tags["node-token"] - if nodetokenTag == "" { - return fmt.Errorf("Missing node token tag") - } - - sess, err := client.NewSession() - if err != nil { - return err - } - - nodetokenbytes, err := sess.Output("cat /home/crunch/node-token") - if err != nil { - return err - } - - nodetoken := strings.TrimSpace(string(nodetokenbytes)) - - expectedToken := fmt.Sprintf("%s-%s", *ai.vm.Name, nodetokenTag) - - if strings.TrimSpace(nodetoken) != expectedToken { - return fmt.Errorf("Node token did not match, expected %q got %q", expectedToken, nodetoken) - } - - sess, err = client.NewSession() - if err != nil { - return err +func (ai *azureInstance) Address() string { + if iprops := ai.nic.InterfacePropertiesFormat; iprops == nil { + return "" + } else if ipconfs := iprops.IPConfigurations; ipconfs == nil || len(*ipconfs) == 0 { + return "" + } else if ipconfprops := (*ipconfs)[0].InterfaceIPConfigurationPropertiesFormat; ipconfprops == nil { + return "" + } else if addr := ipconfprops.PrivateIPAddress; addr == nil { + return "" + } else { + return *addr } +} - keyfingerprintbytes, err := sess.Output("ssh-keygen -E sha256 -l -f /etc/ssh/ssh_host_rsa_key.pub") - if err != nil { - return err - } - - sp := strings.Split(string(keyfingerprintbytes), " ") - - if remoteFingerprint != sp[1] { - return fmt.Errorf("Key fingerprint did not match, expected %q got %q", sp[1], remoteFingerprint) - } +func (ai *azureInstance) RemoteUser() string { + return ai.provider.azconfig.AdminUsername +} - tags["ssh-pubkey-fingerprint"] = sp[1] - delete(tags, "node-token") - ai.SetTags(tags) - return nil +func (ai *azureInstance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error { + return cloud.ErrNotImplemented }