13964: ssh key checking
[arvados.git] / lib / dispatchcloud / azure.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package dispatchcloud
6
7 import (
8         "context"
9         "encoding/base64"
10         "fmt"
11         "log"
12         "net/http"
13         "regexp"
14         "strconv"
15         "strings"
16         "sync"
17         "time"
18
19         "git.curoverse.com/arvados.git/sdk/go/arvados"
20         "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-06-01/compute"
21         "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2018-06-01/network"
22         storageacct "github.com/Azure/azure-sdk-for-go/services/storage/mgmt/2018-02-01/storage"
23         "github.com/Azure/azure-sdk-for-go/storage"
24         "github.com/Azure/go-autorest/autorest"
25         "github.com/Azure/go-autorest/autorest/azure"
26         "github.com/Azure/go-autorest/autorest/azure/auth"
27         "github.com/Azure/go-autorest/autorest/to"
28         "github.com/jmcvetta/randutil"
29         "golang.org/x/crypto/ssh"
30 )
31
// AzureProviderConfig holds the settings the Azure provider needs. Each field
// is populated from the JSON key named in its struct tag.
//
// SubscriptionID, ClientID, ClientSecret and TenantID are the credentials
// used to build the service-principal authorizer; CloudEnv is the Azure
// environment name passed to azure.EnvironmentFromName. ResourceGroup and
// Location are used for every NIC and VM that gets created. Network and
// Subnet are combined into the subnet resource ID of new NICs.
// StorageAccount and BlobContainer determine where each VM's OS disk VHD
// blob lives. DeleteDanglingResourcesAfter is an age in seconds: unattached
// NICs and unleased page blobs older than this are deleted.
32 type AzureProviderConfig struct {
33         SubscriptionID               string  `json:"subscription_id"`
34         ClientID                     string  `json:"key"`
35         ClientSecret                 string  `json:"secret"`
36         TenantID                     string  `json:"tenant_id"`
37         CloudEnv                     string  `json:"cloud_environment"`
38         ResourceGroup                string  `json:"resource_group"`
39         Location                     string  `json:"region"`
40         Network                      string  `json:"network"`
41         Subnet                       string  `json:"subnet"`
42         StorageAccount               string  `json:"storage_account"`
43         BlobContainer                string  `json:"blob_container"`
           // NOTE(review): Image is not read anywhere in this file; Create
           // takes the image as an argument instead -- confirm it is used.
44         Image                        string  `json:"image"`
45         DeleteDanglingResourcesAfter float64 `json:"delete_dangling_resources_after"`
46 }
47
48 type VirtualMachinesClientWrapper interface {
49         CreateOrUpdate(ctx context.Context,
50                 resourceGroupName string,
51                 VMName string,
52                 parameters compute.VirtualMachine) (result compute.VirtualMachine, err error)
53         Delete(ctx context.Context, resourceGroupName string, VMName string) (result *http.Response, err error)
54         ListComplete(ctx context.Context, resourceGroupName string) (result compute.VirtualMachineListResultIterator, err error)
55 }
56
57 type VirtualMachinesClientImpl struct {
58         inner compute.VirtualMachinesClient
59 }
60
61 func (cl *VirtualMachinesClientImpl) CreateOrUpdate(ctx context.Context,
62         resourceGroupName string,
63         VMName string,
64         parameters compute.VirtualMachine) (result compute.VirtualMachine, err error) {
65
66         future, err := cl.inner.CreateOrUpdate(ctx, resourceGroupName, VMName, parameters)
67         if err != nil {
68                 return compute.VirtualMachine{}, WrapAzureError(err)
69         }
70         future.WaitForCompletionRef(ctx, cl.inner.Client)
71         r, err := future.Result(cl.inner)
72         return r, WrapAzureError(err)
73 }
74
75 func (cl *VirtualMachinesClientImpl) Delete(ctx context.Context, resourceGroupName string, VMName string) (result *http.Response, err error) {
76         future, err := cl.inner.Delete(ctx, resourceGroupName, VMName)
77         if err != nil {
78                 return nil, WrapAzureError(err)
79         }
80         err = future.WaitForCompletionRef(ctx, cl.inner.Client)
81         return future.Response(), WrapAzureError(err)
82 }
83
84 func (cl *VirtualMachinesClientImpl) ListComplete(ctx context.Context, resourceGroupName string) (result compute.VirtualMachineListResultIterator, err error) {
85         r, err := cl.inner.ListComplete(ctx, resourceGroupName)
86         return r, WrapAzureError(err)
87 }
88
89 type InterfacesClientWrapper interface {
90         CreateOrUpdate(ctx context.Context,
91                 resourceGroupName string,
92                 networkInterfaceName string,
93                 parameters network.Interface) (result network.Interface, err error)
94         Delete(ctx context.Context, resourceGroupName string, networkInterfaceName string) (result *http.Response, err error)
95         ListComplete(ctx context.Context, resourceGroupName string) (result network.InterfaceListResultIterator, err error)
96 }
97
98 type InterfacesClientImpl struct {
99         inner network.InterfacesClient
100 }
101
102 func (cl *InterfacesClientImpl) Delete(ctx context.Context, resourceGroupName string, VMName string) (result *http.Response, err error) {
103         future, err := cl.inner.Delete(ctx, resourceGroupName, VMName)
104         if err != nil {
105                 return nil, WrapAzureError(err)
106         }
107         err = future.WaitForCompletionRef(ctx, cl.inner.Client)
108         return future.Response(), WrapAzureError(err)
109 }
110
111 func (cl *InterfacesClientImpl) CreateOrUpdate(ctx context.Context,
112         resourceGroupName string,
113         networkInterfaceName string,
114         parameters network.Interface) (result network.Interface, err error) {
115
116         future, err := cl.inner.CreateOrUpdate(ctx, resourceGroupName, networkInterfaceName, parameters)
117         if err != nil {
118                 return network.Interface{}, WrapAzureError(err)
119         }
120         future.WaitForCompletionRef(ctx, cl.inner.Client)
121         r, err := future.Result(cl.inner)
122         return r, WrapAzureError(err)
123 }
124
125 func (cl *InterfacesClientImpl) ListComplete(ctx context.Context, resourceGroupName string) (result network.InterfaceListResultIterator, err error) {
126         r, err := cl.inner.ListComplete(ctx, resourceGroupName)
127         return r, WrapAzureError(err)
128 }
129
// quotaRe matches, case-insensitively, the words "exceed", "quota" or
// "limit" anywhere in a string. WrapAzureError applies it to the code and
// message of an Azure service error to decide whether to report the
// failure as an AzureQuotaError.
130 var quotaRe = regexp.MustCompile(`(?i:exceed|quota|limit)`)
131
132 type AzureRateLimitError struct {
133         azure.RequestError
134         earliestRetry time.Time
135 }
136
137 func (ar *AzureRateLimitError) EarliestRetry() time.Time {
138         return ar.earliestRetry
139 }
140
141 type AzureQuotaError struct {
142         azure.RequestError
143 }
144
145 func (ar *AzureQuotaError) IsQuotaError() bool {
146         return true
147 }
148
149 func WrapAzureError(err error) error {
150         de, ok := err.(autorest.DetailedError)
151         if !ok {
152                 return err
153         }
154         rq, ok := de.Original.(*azure.RequestError)
155         if !ok {
156                 return err
157         }
158         if rq.Response == nil {
159                 return err
160         }
161         if rq.Response.StatusCode == 429 || len(rq.Response.Header["Retry-After"]) >= 1 {
162                 // API throttling
163                 ra := rq.Response.Header["Retry-After"][0]
164                 earliestRetry, parseErr := http.ParseTime(ra)
165                 if parseErr != nil {
166                         // Could not parse as a timestamp, must be number of seconds
167                         dur, parseErr := strconv.ParseInt(ra, 10, 64)
168                         if parseErr != nil {
169                                 earliestRetry = time.Now().Add(time.Duration(dur) * time.Second)
170                         }
171                 }
172                 if parseErr != nil {
173                         // Couldn't make sense of retry-after,
174                         // so set retry to 20 seconds
175                         earliestRetry = time.Now().Add(20 * time.Second)
176                 }
177                 return &AzureRateLimitError{*rq, earliestRetry}
178         }
179         if rq.ServiceError == nil {
180                 return err
181         }
182         if quotaRe.FindString(rq.ServiceError.Code) != "" || quotaRe.FindString(rq.ServiceError.Message) != "" {
183                 return &AzureQuotaError{*rq}
184         }
185         return err
186 }
187
188 type AzureProvider struct {
189         azconfig          AzureProviderConfig
190         vmClient          VirtualMachinesClientWrapper
191         netClient         InterfacesClientWrapper
192         storageAcctClient storageacct.AccountsClient
193         azureEnv          azure.Environment
194         interfaces        map[string]network.Interface
195         dispatcherID      string
196         namePrefix        string
197 }
198
199 func NewAzureProvider(azcfg AzureProviderConfig, dispatcherID string) (prv InstanceProvider, err error) {
200         ap := AzureProvider{}
201         err = ap.setup(azcfg, dispatcherID)
202         if err != nil {
203                 return nil, err
204         }
205         return &ap, nil
206 }
207
208 func (az *AzureProvider) setup(azcfg AzureProviderConfig, dispatcherID string) (err error) {
209         az.azconfig = azcfg
210         vmClient := compute.NewVirtualMachinesClient(az.azconfig.SubscriptionID)
211         netClient := network.NewInterfacesClient(az.azconfig.SubscriptionID)
212         storageAcctClient := storageacct.NewAccountsClient(az.azconfig.SubscriptionID)
213
214         az.azureEnv, err = azure.EnvironmentFromName(az.azconfig.CloudEnv)
215         if err != nil {
216                 return err
217         }
218
219         authorizer, err := auth.ClientCredentialsConfig{
220                 ClientID:     az.azconfig.ClientID,
221                 ClientSecret: az.azconfig.ClientSecret,
222                 TenantID:     az.azconfig.TenantID,
223                 Resource:     az.azureEnv.ResourceManagerEndpoint,
224                 AADEndpoint:  az.azureEnv.ActiveDirectoryEndpoint,
225         }.Authorizer()
226         if err != nil {
227                 return err
228         }
229
230         vmClient.Authorizer = authorizer
231         netClient.Authorizer = authorizer
232         storageAcctClient.Authorizer = authorizer
233
234         az.vmClient = &VirtualMachinesClientImpl{vmClient}
235         az.netClient = &InterfacesClientImpl{netClient}
236         az.storageAcctClient = storageAcctClient
237
238         az.dispatcherID = dispatcherID
239         az.namePrefix = fmt.Sprintf("compute-%s-", az.dispatcherID)
240
241         return nil
242 }
243
244 func (az *AzureProvider) Create(ctx context.Context,
245         instanceType arvados.InstanceType,
246         imageId ImageID,
247         newTags InstanceTags,
248         publicKey ssh.PublicKey) (Instance, error) {
249
250         if len(newTags["node-token"]) == 0 {
251                 return nil, fmt.Errorf("Must provide tag 'node-token'")
252         }
253
254         name, err := randutil.String(15, "abcdefghijklmnopqrstuvwxyz0123456789")
255         if err != nil {
256                 return nil, err
257         }
258
259         name = az.namePrefix + name
260         log.Printf("name is %v", name)
261
262         timestamp := time.Now().Format(time.RFC3339Nano)
263
264         tags := make(map[string]*string)
265         tags["created-at"] = &timestamp
266         for k, v := range newTags {
267                 tags["dispatch-"+k] = &v
268         }
269
270         tags["dispatch-instance-type"] = &instanceType.Name
271
272         nicParameters := network.Interface{
273                 Location: &az.azconfig.Location,
274                 Tags:     tags,
275                 InterfacePropertiesFormat: &network.InterfacePropertiesFormat{
276                         IPConfigurations: &[]network.InterfaceIPConfiguration{
277                                 network.InterfaceIPConfiguration{
278                                         Name: to.StringPtr("ip1"),
279                                         InterfaceIPConfigurationPropertiesFormat: &network.InterfaceIPConfigurationPropertiesFormat{
280                                                 Subnet: &network.Subnet{
281                                                         ID: to.StringPtr(fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers"+
282                                                                 "/Microsoft.Network/virtualnetworks/%s/subnets/%s",
283                                                                 az.azconfig.SubscriptionID,
284                                                                 az.azconfig.ResourceGroup,
285                                                                 az.azconfig.Network,
286                                                                 az.azconfig.Subnet)),
287                                                 },
288                                                 PrivateIPAllocationMethod: network.Dynamic,
289                                         },
290                                 },
291                         },
292                 },
293         }
294         nic, err := az.netClient.CreateOrUpdate(ctx, az.azconfig.ResourceGroup, name+"-nic", nicParameters)
295         if err != nil {
296                 return nil, WrapAzureError(err)
297         }
298
299         log.Printf("Created NIC %v", *nic.ID)
300
301         instance_vhd := fmt.Sprintf("https://%s.blob.%s/%s/%s-os.vhd",
302                 az.azconfig.StorageAccount,
303                 az.azureEnv.StorageEndpointSuffix,
304                 az.azconfig.BlobContainer,
305                 name)
306
307         log.Printf("URI instance vhd %v", instance_vhd)
308
309         customData := base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf(`#!/bin/sh
310 echo '%s-%s' > /home/crunch/node-token`, name, newTags["node-token"])))
311
312         vmParameters := compute.VirtualMachine{
313                 Location: &az.azconfig.Location,
314                 Tags:     tags,
315                 VirtualMachineProperties: &compute.VirtualMachineProperties{
316                         HardwareProfile: &compute.HardwareProfile{
317                                 VMSize: compute.VirtualMachineSizeTypes(instanceType.ProviderType),
318                         },
319                         StorageProfile: &compute.StorageProfile{
320                                 OsDisk: &compute.OSDisk{
321                                         OsType:       compute.Linux,
322                                         Name:         to.StringPtr(name + "-os"),
323                                         CreateOption: compute.FromImage,
324                                         Image: &compute.VirtualHardDisk{
325                                                 URI: to.StringPtr(string(imageId)),
326                                         },
327                                         Vhd: &compute.VirtualHardDisk{
328                                                 URI: &instance_vhd,
329                                         },
330                                 },
331                         },
332                         NetworkProfile: &compute.NetworkProfile{
333                                 NetworkInterfaces: &[]compute.NetworkInterfaceReference{
334                                         compute.NetworkInterfaceReference{
335                                                 ID: nic.ID,
336                                                 NetworkInterfaceReferenceProperties: &compute.NetworkInterfaceReferenceProperties{
337                                                         Primary: to.BoolPtr(true),
338                                                 },
339                                         },
340                                 },
341                         },
342                         OsProfile: &compute.OSProfile{
343                                 ComputerName:  &name,
344                                 AdminUsername: to.StringPtr("crunch"),
345                                 LinuxConfiguration: &compute.LinuxConfiguration{
346                                         DisablePasswordAuthentication: to.BoolPtr(true),
347                                         SSH: &compute.SSHConfiguration{
348                                                 PublicKeys: &[]compute.SSHPublicKey{
349                                                         compute.SSHPublicKey{
350                                                                 Path:    to.StringPtr("/home/crunch/.ssh/authorized_keys"),
351                                                                 KeyData: to.StringPtr(string(ssh.MarshalAuthorizedKey(publicKey))),
352                                                         },
353                                                 },
354                                         },
355                                 },
356                                 CustomData: &customData,
357                         },
358                 },
359         }
360
361         vm, err := az.vmClient.CreateOrUpdate(ctx, az.azconfig.ResourceGroup, name, vmParameters)
362         if err != nil {
363                 return nil, WrapAzureError(err)
364         }
365
366         return &AzureInstance{
367                 provider: az,
368                 nic:      nic,
369                 vm:       vm,
370         }, nil
371 }
372
373 func (az *AzureProvider) Instances(ctx context.Context) ([]Instance, error) {
374         interfaces, err := az.ManageNics(ctx)
375         if err != nil {
376                 return nil, err
377         }
378
379         result, err := az.vmClient.ListComplete(ctx, az.azconfig.ResourceGroup)
380         if err != nil {
381                 return nil, WrapAzureError(err)
382         }
383
384         instances := make([]Instance, 0)
385
386         for ; result.NotDone(); err = result.Next() {
387                 if err != nil {
388                         return nil, WrapAzureError(err)
389                 }
390                 if strings.HasPrefix(*result.Value().Name, az.namePrefix) {
391                         instances = append(instances, &AzureInstance{
392                                 provider: az,
393                                 vm:       result.Value(),
394                                 nic:      interfaces[*(*result.Value().NetworkProfile.NetworkInterfaces)[0].ID]})
395                 }
396         }
397         return instances, nil
398 }
399
400 func (az *AzureProvider) ManageNics(ctx context.Context) (map[string]network.Interface, error) {
401         result, err := az.netClient.ListComplete(ctx, az.azconfig.ResourceGroup)
402         if err != nil {
403                 return nil, WrapAzureError(err)
404         }
405
406         interfaces := make(map[string]network.Interface)
407
408         timestamp := time.Now()
409         wg := sync.WaitGroup{}
410         deletechannel := make(chan string, 20)
411         defer func() {
412                 wg.Wait()
413                 close(deletechannel)
414         }()
415         for i := 0; i < 4; i += 1 {
416                 go func() {
417                         for {
418                                 nicname, ok := <-deletechannel
419                                 if !ok {
420                                         return
421                                 }
422                                 _, delerr := az.netClient.Delete(context.Background(), az.azconfig.ResourceGroup, nicname)
423                                 if delerr != nil {
424                                         log.Printf("Error deleting %v: %v", nicname, delerr)
425                                 } else {
426                                         log.Printf("Deleted %v", nicname)
427                                 }
428                                 wg.Done()
429                         }
430                 }()
431         }
432
433         for ; result.NotDone(); err = result.Next() {
434                 if err != nil {
435                         log.Printf("Error listing nics: %v", err)
436                         return interfaces, nil
437                 }
438                 if strings.HasPrefix(*result.Value().Name, az.namePrefix) {
439                         if result.Value().VirtualMachine != nil {
440                                 interfaces[*result.Value().ID] = result.Value()
441                         } else {
442                                 if result.Value().Tags["created-at"] != nil {
443                                         created_at, err := time.Parse(time.RFC3339Nano, *result.Value().Tags["created-at"])
444                                         if err == nil {
445                                                 //log.Printf("found dangling NIC %v created %v seconds ago", *result.Value().Name, timestamp.Sub(created_at).Seconds())
446                                                 if timestamp.Sub(created_at).Seconds() > az.azconfig.DeleteDanglingResourcesAfter {
447                                                         log.Printf("Will delete %v because it is older than %v s", *result.Value().Name, az.azconfig.DeleteDanglingResourcesAfter)
448                                                         wg.Add(1)
449                                                         deletechannel <- *result.Value().Name
450                                                 }
451                                         }
452                                 }
453                         }
454                 }
455         }
456         return interfaces, nil
457 }
458
459 func (az *AzureProvider) ManageBlobs(ctx context.Context) {
460         result, err := az.storageAcctClient.ListKeys(ctx, az.azconfig.ResourceGroup, az.azconfig.StorageAccount)
461         if err != nil {
462                 log.Printf("Couldn't get account keys %v", err)
463                 return
464         }
465
466         key1 := *(*result.Keys)[0].Value
467         client, err := storage.NewBasicClientOnSovereignCloud(az.azconfig.StorageAccount, key1, az.azureEnv)
468         if err != nil {
469                 log.Printf("Couldn't make client %v", err)
470                 return
471         }
472
473         blobsvc := client.GetBlobService()
474         blobcont := blobsvc.GetContainerReference(az.azconfig.BlobContainer)
475
476         timestamp := time.Now()
477         wg := sync.WaitGroup{}
478         deletechannel := make(chan storage.Blob, 20)
479         defer func() {
480                 wg.Wait()
481                 close(deletechannel)
482         }()
483         for i := 0; i < 4; i += 1 {
484                 go func() {
485                         for {
486                                 blob, ok := <-deletechannel
487                                 if !ok {
488                                         return
489                                 }
490                                 err := blob.Delete(nil)
491                                 if err != nil {
492                                         log.Printf("error deleting %v: %v", blob.Name, err)
493                                 } else {
494                                         log.Printf("Deleted blob %v", blob.Name)
495                                 }
496                                 wg.Done()
497                         }
498                 }()
499         }
500
501         page := storage.ListBlobsParameters{Prefix: az.namePrefix}
502
503         for {
504                 response, err := blobcont.ListBlobs(page)
505                 if err != nil {
506                         log.Printf("Error listing blobs %v", err)
507                         return
508                 }
509                 for _, b := range response.Blobs {
510                         age := timestamp.Sub(time.Time(b.Properties.LastModified))
511                         if b.Properties.BlobType == storage.BlobTypePage &&
512                                 b.Properties.LeaseState == "available" &&
513                                 b.Properties.LeaseStatus == "unlocked" &&
514                                 age.Seconds() > az.azconfig.DeleteDanglingResourcesAfter {
515
516                                 log.Printf("Blob %v is unlocked and not modified for %v seconds, will delete", b.Name, age.Seconds())
517                                 wg.Add(1)
518                                 deletechannel <- b
519                         }
520                 }
521                 if response.NextMarker != "" {
522                         page.Marker = response.NextMarker
523                 } else {
524                         break
525                 }
526         }
527 }
528
529 func (az *AzureProvider) Stop() {
530 }
531
532 type AzureInstance struct {
533         provider *AzureProvider
534         nic      network.Interface
535         vm       compute.VirtualMachine
536 }
537
538 func (ai *AzureInstance) ID() InstanceID {
539         return InstanceID(*ai.vm.ID)
540 }
541
542 func (ai *AzureInstance) String() string {
543         return *ai.vm.Name
544 }
545
546 func (ai *AzureInstance) SetTags(ctx context.Context, newTags InstanceTags) error {
547         tags := make(map[string]*string)
548
549         for k, v := range ai.vm.Tags {
550                 if !strings.HasPrefix(k, "dispatch-") {
551                         tags[k] = v
552                 }
553         }
554         for k, v := range newTags {
555                 tags["dispatch-"+k] = &v
556         }
557
558         vmParameters := compute.VirtualMachine{
559                 Location: &ai.provider.azconfig.Location,
560                 Tags:     tags,
561         }
562         vm, err := ai.provider.vmClient.CreateOrUpdate(ctx, ai.provider.azconfig.ResourceGroup, *ai.vm.Name, vmParameters)
563         if err != nil {
564                 return WrapAzureError(err)
565         }
566         ai.vm = vm
567
568         return nil
569 }
570
571 func (ai *AzureInstance) Tags(ctx context.Context) (InstanceTags, error) {
572         tags := make(map[string]string)
573
574         for k, v := range ai.vm.Tags {
575                 if strings.HasPrefix(k, "dispatch-") {
576                         tags[k[9:]] = *v
577                 }
578         }
579
580         return tags, nil
581 }
582
583 func (ai *AzureInstance) Destroy(ctx context.Context) error {
584         _, err := ai.provider.vmClient.Delete(ctx, ai.provider.azconfig.ResourceGroup, *ai.vm.Name)
585         return WrapAzureError(err)
586 }
587
588 func (ai *AzureInstance) Address() string {
589         return *(*ai.nic.IPConfigurations)[0].PrivateIPAddress
590 }
591
592 func (ai *AzureInstance) VerifyPublicKey(ctx context.Context, receivedKey ssh.PublicKey, client *ssh.Client) error {
593         remoteFingerprint := ssh.FingerprintSHA256(receivedKey)
594
595         tags, _ := ai.Tags(ctx)
596
597         tg := tags["ssh-pubkey-fingerprint"]
598         if tg != "" {
599                 if remoteFingerprint == tg {
600                         return nil
601                 } else {
602                         return fmt.Errorf("Key fingerprint did not match")
603                 }
604         }
605
606         sess, err := client.NewSession()
607         if err != nil {
608                 return err
609         }
610
611         nodetoken, err := sess.Output("cat /home/crunch/node-token")
612         if err != nil {
613                 return err
614         }
615
616         expectedToken := fmt.Sprintf("%s-%s", *ai.vm.Name, tags["node-token"])
617         log.Printf("%q %q", string(nodetoken), expectedToken)
618
619         if string(nodetoken) == expectedToken {
620                 sess, err := client.NewSession()
621                 if err != nil {
622                         return err
623                 }
624
625                 keyfingerprintbytes, err := sess.Output("ssh-keygen -E sha256 -l -f /etc/ssh/ssh_host_rsa_key.pub")
626                 if err != nil {
627                         return err
628                 }
629
630                 sp := strings.Split(string(keyfingerprintbytes), " ")
631
632                 log.Printf("%q %q", remoteFingerprint, sp[1])
633
634                 if remoteFingerprint == sp[1] {
635                         tags["ssh-pubkey-fingerprint"] = sp[1]
636                         ai.SetTags(ctx, tags)
637                         return nil
638                 }
639         }
640
641         return fmt.Errorf("Key fingerprint did not match")
642 }