lib/cloud/ec2/ec2.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package ec2
   6
   7 import (
   8         "crypto/md5"
   9         "crypto/rsa"
  10         "crypto/sha1"
  11         "crypto/x509"
  12         "encoding/base64"
  13         "encoding/json"
  14         "fmt"
  15         "math/big"
  16         "strconv"
  17         "strings"
  18         "sync"
  19         "sync/atomic"
  20         "time"
  21
  22         "git.arvados.org/arvados.git/lib/cloud"
  23         "git.arvados.org/arvados.git/sdk/go/arvados"
  24         "github.com/aws/aws-sdk-go/aws"
  25         "github.com/aws/aws-sdk-go/aws/awserr"
  26         "github.com/aws/aws-sdk-go/aws/credentials"
  27         "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
  28         "github.com/aws/aws-sdk-go/aws/ec2metadata"
  29         "github.com/aws/aws-sdk-go/aws/request"
  30         "github.com/aws/aws-sdk-go/aws/session"
  31         "github.com/aws/aws-sdk-go/service/ec2"
  32         "github.com/prometheus/client_golang/prometheus"
  33         "github.com/sirupsen/logrus"
  34         "golang.org/x/crypto/ssh"
  35 )
  36
// Driver is the ec2 implementation of the cloud.Driver interface.
var Driver = cloud.DriverFunc(newEC2InstanceSet)

// Lower and upper bounds that wrapError applies to the backoff delay
// it computes after the AWS API returns a throttling error.
const (
	throttleDelayMin = time.Second
	throttleDelayMax = time.Minute
)
  44
// ec2InstanceSetConfig is the driver configuration decoded from JSON
// by newEC2InstanceSet.
//
// AccessKeyID and SecretAccessKey are tried first as static
// credentials, with the EC2 instance role as fallback. Region is
// passed to the AWS client configuration. SecurityGroupIDs and
// SubnetID are applied to the network interface of new instances;
// when several subnets are listed, Create tries them in turn.
// AdminUsername is reported by ec2Instance.RemoteUser. EBSVolumeType
// defaults to "gp2" when empty. EBSPrice is used by PriceHistory as
// a monthly price per GiB of added scratch space.
// IAMInstanceProfile, if non-empty, is the name of the instance
// profile attached to new instances. Spot price data is only
// collected when SpotPriceUpdateInterval is greater than zero.
type ec2InstanceSetConfig struct {
	AccessKeyID             string
	SecretAccessKey         string
	Region                  string
	SecurityGroupIDs        arvados.StringSet
	SubnetID                sliceOrSingleString
	AdminUsername           string
	EBSVolumeType           string
	EBSPrice                float64
	IAMInstanceProfile      string
	SpotPriceUpdateInterval arvados.Duration
}
  57
  58 type sliceOrSingleString []string
  59
  60 // UnmarshalJSON unmarshals an array of strings, and also accepts ""
  61 // as [], and "foo" as ["foo"].
  62 func (ss *sliceOrSingleString) UnmarshalJSON(data []byte) error {
  63         if len(data) == 0 {
  64                 *ss = nil
  65         } else if data[0] == '[' {
  66                 var slice []string
  67                 err := json.Unmarshal(data, &slice)
  68                 if err != nil {
  69                         return err
  70                 }
  71                 if len(slice) == 0 {
  72                         *ss = nil
  73                 } else {
  74                         *ss = slice
  75                 }
  76         } else {
  77                 var str string
  78                 err := json.Unmarshal(data, &str)
  79                 if err != nil {
  80                         return err
  81                 }
  82                 if str == "" {
  83                         *ss = nil
  84                 } else {
  85                         *ss = []string{str}
  86                 }
  87         }
  88         return nil
  89 }
  90
// ec2Interface lists the EC2 client methods this driver calls.
// newEC2InstanceSet stores the client returned by ec2.New in a field
// of this type (presumably so tests can substitute a stub -- confirm
// against the test suite).
type ec2Interface interface {
	DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error)
	ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error)
	RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error)
	DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error)
	DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error
	DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error
	CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error)
	TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error)
}
 101
// ec2InstanceSet is the cloud.InstanceSet returned by
// newEC2InstanceSet.
//
// currentSubnetIDIndex is the index into ec2config.SubnetID that
// Create tries first; it is read and written with sync/atomic.
// keysMtx guards keys, which maps the MD5 fingerprint of an SSH
// public key to the name of the corresponding EC2 key pair.
// throttleDelayCreate and throttleDelayInstances hold the
// time.Duration backoff state that wrapError maintains for Create
// and Instances respectively. pricesLock guards prices and
// pricesUpdated, which hold spot price history and the time of the
// last successful refresh for each priceKey. mInstances and
// mInstanceStarts are the per-subnet metrics.
type ec2InstanceSet struct {
	ec2config              ec2InstanceSetConfig
	currentSubnetIDIndex   int32
	instanceSetID          cloud.InstanceSetID
	logger                 logrus.FieldLogger
	client                 ec2Interface
	keysMtx                sync.Mutex
	keys                   map[string]string
	throttleDelayCreate    atomic.Value
	throttleDelayInstances atomic.Value

	prices        map[priceKey][]cloud.InstancePrice
	pricesLock    sync.Mutex
	pricesUpdated map[priceKey]time.Time

	mInstances      *prometheus.GaugeVec
	mInstanceStarts *prometheus.CounterVec
}
 120
 121 func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (prv cloud.InstanceSet, err error) {
 122         instanceSet := &ec2InstanceSet{
 123                 instanceSetID: instanceSetID,
 124                 logger:        logger,
 125         }
 126         err = json.Unmarshal(config, &instanceSet.ec2config)
 127         if err != nil {
 128                 return nil, err
 129         }
 130
 131         sess, err := session.NewSession()
 132         if err != nil {
 133                 return nil, err
 134         }
 135         // First try any static credentials, fall back to an IAM instance profile/role
 136         creds := credentials.NewChainCredentials(
 137                 []credentials.Provider{
 138                         &credentials.StaticProvider{Value: credentials.Value{AccessKeyID: instanceSet.ec2config.AccessKeyID, SecretAccessKey: instanceSet.ec2config.SecretAccessKey}},
 139                         &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(sess)},
 140                 })
 141
 142         awsConfig := aws.NewConfig().WithCredentials(creds).WithRegion(instanceSet.ec2config.Region)
 143         instanceSet.client = ec2.New(session.Must(session.NewSession(awsConfig)))
 144         instanceSet.keys = make(map[string]string)
 145         if instanceSet.ec2config.EBSVolumeType == "" {
 146                 instanceSet.ec2config.EBSVolumeType = "gp2"
 147         }
 148
 149         // Set up metrics
 150         instanceSet.mInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 151                 Namespace: "arvados",
 152                 Subsystem: "dispatchcloud",
 153                 Name:      "ec2_instances",
 154                 Help:      "Number of instances running",
 155         }, []string{"subnet_id"})
 156         instanceSet.mInstanceStarts = prometheus.NewCounterVec(prometheus.CounterOpts{
 157                 Namespace: "arvados",
 158                 Subsystem: "dispatchcloud",
 159                 Name:      "ec2_instance_starts_total",
 160                 Help:      "Number of attempts to start a new instance",
 161         }, []string{"subnet_id", "success"})
 162         // Initialize all of the series we'll be reporting.  Otherwise
 163         // the {subnet=A, success=0} series doesn't appear in metrics
 164         // at all until there's a failure in subnet A.
 165         for _, subnet := range instanceSet.ec2config.SubnetID {
 166                 instanceSet.mInstanceStarts.WithLabelValues(subnet, "0").Add(0)
 167                 instanceSet.mInstanceStarts.WithLabelValues(subnet, "1").Add(0)
 168         }
 169         if len(instanceSet.ec2config.SubnetID) == 0 {
 170                 instanceSet.mInstanceStarts.WithLabelValues("", "0").Add(0)
 171                 instanceSet.mInstanceStarts.WithLabelValues("", "1").Add(0)
 172         }
 173         if reg != nil {
 174                 reg.MustRegister(instanceSet.mInstances)
 175                 reg.MustRegister(instanceSet.mInstanceStarts)
 176         }
 177
 178         return instanceSet, nil
 179 }
 180
 181 func awsKeyFingerprint(pk ssh.PublicKey) (md5fp string, sha1fp string, err error) {
 182         // AWS key fingerprints don't use the usual key fingerprint
 183         // you get from ssh-keygen or ssh.FingerprintLegacyMD5()
 184         // (you can get that from md5.Sum(pk.Marshal())
 185         //
 186         // AWS uses the md5 or sha1 of the PKIX DER encoding of the
 187         // public key, so calculate those fingerprints here.
 188         var rsaPub struct {
 189                 Name string
 190                 E    *big.Int
 191                 N    *big.Int
 192         }
 193         if err := ssh.Unmarshal(pk.Marshal(), &rsaPub); err != nil {
 194                 return "", "", fmt.Errorf("agent: Unmarshal failed to parse public key: %v", err)
 195         }
 196         rsaPk := rsa.PublicKey{
 197                 E: int(rsaPub.E.Int64()),
 198                 N: rsaPub.N,
 199         }
 200         pkix, _ := x509.MarshalPKIXPublicKey(&rsaPk)
 201         md5pkix := md5.Sum([]byte(pkix))
 202         sha1pkix := sha1.Sum([]byte(pkix))
 203         md5fp = ""
 204         sha1fp = ""
 205         for i := 0; i < len(md5pkix); i++ {
 206                 md5fp += fmt.Sprintf(":%02x", md5pkix[i])
 207         }
 208         for i := 0; i < len(sha1pkix); i++ {
 209                 sha1fp += fmt.Sprintf(":%02x", sha1pkix[i])
 210         }
 211         return md5fp[1:], sha1fp[1:], nil
 212 }
 213
// Create starts one new EC2 instance of the given type from imageID,
// tagged with newTags, with initCommand embedded in its user data as
// a shell script.
//
// If publicKey is non-nil, the matching EC2 key pair is looked up (or
// imported) via getKeyName and attached to the instance. If several
// subnets are configured, they are tried in turn, starting with the
// one recorded in currentSubnetIDIndex, for as long as RunInstances
// fails with a subnet-specific error.
//
// On failure the returned error has been passed through wrapError.
func (instanceSet *ec2InstanceSet) Create(
	instanceType arvados.InstanceType,
	imageID cloud.ImageID,
	newTags cloud.InstanceTags,
	initCommand cloud.InitCommand,
	publicKey ssh.PublicKey) (cloud.Instance, error) {

	ec2tags := []*ec2.Tag{}
	for k, v := range newTags {
		ec2tags = append(ec2tags, &ec2.Tag{
			Key:   aws.String(k),
			Value: aws.String(v),
		})
	}

	var groups []string
	for sg := range instanceSet.ec2config.SecurityGroupIDs {
		groups = append(groups, sg)
	}

	rii := ec2.RunInstancesInput{
		ImageId:      aws.String(string(imageID)),
		InstanceType: &instanceType.ProviderType,
		MaxCount:     aws.Int64(1),
		MinCount:     aws.Int64(1),

		NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
			{
				AssociatePublicIpAddress: aws.Bool(false),
				DeleteOnTermination:      aws.Bool(true),
				DeviceIndex:              aws.Int64(0),
				Groups:                   aws.StringSlice(groups),
			}},
		DisableApiTermination:             aws.Bool(false),
		InstanceInitiatedShutdownBehavior: aws.String("terminate"),
		TagSpecifications: []*ec2.TagSpecification{
			{
				ResourceType: aws.String("instance"),
				Tags:         ec2tags,
			}},
		UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))),
	}

	if publicKey != nil {
		keyname, err := instanceSet.getKeyName(publicKey)
		if err != nil {
			return nil, err
		}
		rii.KeyName = &keyname
	}

	if instanceType.AddedScratch > 0 {
		// Attach an extra EBS volume; the size is AddedScratch
		// rounded up to a whole number of GiB.
		rii.BlockDeviceMappings = []*ec2.BlockDeviceMapping{{
			DeviceName: aws.String("/dev/xvdt"),
			Ebs: &ec2.EbsBlockDevice{
				DeleteOnTermination: aws.Bool(true),
				VolumeSize:          aws.Int64((int64(instanceType.AddedScratch) + (1<<30 - 1)) >> 30),
				VolumeType:          &instanceSet.ec2config.EBSVolumeType,
			}}}
	}

	if instanceType.Preemptible {
		// Request a spot instance, using the instance type's
		// configured price as the maximum price.
		rii.InstanceMarketOptions = &ec2.InstanceMarketOptionsRequest{
			MarketType: aws.String("spot"),
			SpotOptions: &ec2.SpotMarketOptions{
				InstanceInterruptionBehavior: aws.String("terminate"),
				MaxPrice:                     aws.String(fmt.Sprintf("%v", instanceType.Price)),
			}}
	}

	if instanceSet.ec2config.IAMInstanceProfile != "" {
		rii.IamInstanceProfile = &ec2.IamInstanceProfileSpecification{
			Name: aws.String(instanceSet.ec2config.IAMInstanceProfile),
		}
	}

	var rsv *ec2.Reservation
	var errToReturn error
	subnets := instanceSet.ec2config.SubnetID
	currentSubnetIDIndex := int(atomic.LoadInt32(&instanceSet.currentSubnetIDIndex))
	for tryOffset := 0; ; tryOffset++ {
		// With no subnets configured, a single attempt is made
		// without a SubnetId and counted under the empty
		// subnet label.
		tryIndex := 0
		trySubnet := ""
		if len(subnets) > 0 {
			tryIndex = (currentSubnetIDIndex + tryOffset) % len(subnets)
			trySubnet = subnets[tryIndex]
			rii.NetworkInterfaces[0].SubnetId = aws.String(trySubnet)
		}
		var err error
		rsv, err = instanceSet.client.RunInstances(&rii)
		instanceSet.mInstanceStarts.WithLabelValues(trySubnet, boolLabelValue[err == nil]).Add(1)
		if !isErrorCapacity(errToReturn) || isErrorCapacity(err) {
			// We want to return the last capacity error,
			// if any; otherwise the last non-capacity
			// error.
			errToReturn = err
		}
		if isErrorSubnetSpecific(err) &&
			tryOffset < len(subnets)-1 {
			instanceSet.logger.WithError(err).WithField("SubnetID", subnets[tryIndex]).
				Warn("RunInstances failed, trying next subnet")
			continue
		}
		// Succeeded, or exhausted all subnets, or got a
		// non-subnet-related error.
		//
		// We intentionally update currentSubnetIDIndex even
		// in the non-retryable-failure case here to avoid a
		// situation where successive calls to Create() keep
		// returning errors for the same subnet (perhaps
		// "subnet full") and never reveal the errors for the
		// other configured subnets (perhaps "subnet ID
		// invalid").
		atomic.StoreInt32(&instanceSet.currentSubnetIDIndex, int32(tryIndex))
		break
	}
	if rsv == nil || len(rsv.Instances) == 0 {
		return nil, wrapError(errToReturn, &instanceSet.throttleDelayCreate)
	}
	return &ec2Instance{
		provider: instanceSet,
		instance: rsv.Instances[0],
	}, nil
}
 338
 339 func (instanceSet *ec2InstanceSet) getKeyName(publicKey ssh.PublicKey) (string, error) {
 340         instanceSet.keysMtx.Lock()
 341         defer instanceSet.keysMtx.Unlock()
 342         md5keyFingerprint, sha1keyFingerprint, err := awsKeyFingerprint(publicKey)
 343         if err != nil {
 344                 return "", fmt.Errorf("Could not make key fingerprint: %v", err)
 345         }
 346         if keyname, ok := instanceSet.keys[md5keyFingerprint]; ok {
 347                 return keyname, nil
 348         }
 349         keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{
 350                 Filters: []*ec2.Filter{{
 351                         Name:   aws.String("fingerprint"),
 352                         Values: []*string{&md5keyFingerprint, &sha1keyFingerprint},
 353                 }},
 354         })
 355         if err != nil {
 356                 return "", fmt.Errorf("Could not search for keypair: %v", err)
 357         }
 358         if len(keyout.KeyPairs) > 0 {
 359                 return *(keyout.KeyPairs[0].KeyName), nil
 360         }
 361         keyname := "arvados-dispatch-keypair-" + md5keyFingerprint
 362         _, err = instanceSet.client.ImportKeyPair(&ec2.ImportKeyPairInput{
 363                 KeyName:           &keyname,
 364                 PublicKeyMaterial: ssh.MarshalAuthorizedKey(publicKey),
 365         })
 366         if err != nil {
 367                 return "", fmt.Errorf("Could not import keypair: %v", err)
 368         }
 369         instanceSet.keys[md5keyFingerprint] = keyname
 370         return keyname, nil
 371 }
 372
 373 func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances []cloud.Instance, err error) {
 374         var filters []*ec2.Filter
 375         for k, v := range tags {
 376                 filters = append(filters, &ec2.Filter{
 377                         Name:   aws.String("tag:" + k),
 378                         Values: []*string{aws.String(v)},
 379                 })
 380         }
 381         needAZs := false
 382         dii := &ec2.DescribeInstancesInput{Filters: filters}
 383         for {
 384                 dio, err := instanceSet.client.DescribeInstances(dii)
 385                 err = wrapError(err, &instanceSet.throttleDelayInstances)
 386                 if err != nil {
 387                         return nil, err
 388                 }
 389
 390                 for _, rsv := range dio.Reservations {
 391                         for _, inst := range rsv.Instances {
 392                                 if *inst.State.Name != "shutting-down" && *inst.State.Name != "terminated" {
 393                                         instances = append(instances, &ec2Instance{
 394                                                 provider: instanceSet,
 395                                                 instance: inst,
 396                                         })
 397                                         if aws.StringValue(inst.InstanceLifecycle) == "spot" {
 398                                                 needAZs = true
 399                                         }
 400                                 }
 401                         }
 402                 }
 403                 if dio.NextToken == nil {
 404                         break
 405                 }
 406                 dii.NextToken = dio.NextToken
 407         }
 408         if needAZs && instanceSet.ec2config.SpotPriceUpdateInterval > 0 {
 409                 az := map[string]string{}
 410                 err := instanceSet.client.DescribeInstanceStatusPages(&ec2.DescribeInstanceStatusInput{
 411                         IncludeAllInstances: aws.Bool(true),
 412                 }, func(page *ec2.DescribeInstanceStatusOutput, lastPage bool) bool {
 413                         for _, ent := range page.InstanceStatuses {
 414                                 az[*ent.InstanceId] = *ent.AvailabilityZone
 415                         }
 416                         return true
 417                 })
 418                 if err != nil {
 419                         instanceSet.logger.Warnf("error getting instance statuses: %s", err)
 420                 }
 421                 for _, inst := range instances {
 422                         inst := inst.(*ec2Instance)
 423                         inst.availabilityZone = az[*inst.instance.InstanceId]
 424                 }
 425                 instanceSet.updateSpotPrices(instances)
 426         }
 427
 428         // Count instances in each subnet, and report in metrics.
 429         subnetInstances := map[string]int{"": 0}
 430         for _, subnet := range instanceSet.ec2config.SubnetID {
 431                 subnetInstances[subnet] = 0
 432         }
 433         for _, inst := range instances {
 434                 subnet := inst.(*ec2Instance).instance.SubnetId
 435                 if subnet != nil {
 436                         subnetInstances[*subnet]++
 437                 } else {
 438                         subnetInstances[""]++
 439                 }
 440         }
 441         for subnet, count := range subnetInstances {
 442                 instanceSet.mInstances.WithLabelValues(subnet).Set(float64(count))
 443         }
 444
 445         return instances, err
 446 }
 447
// priceKey identifies one price history series in
// ec2InstanceSet.prices and ec2InstanceSet.pricesUpdated: an instance
// type, whether it is a spot instance, and an availability zone.
type priceKey struct {
	instanceType     string
	spot             bool
	availabilityZone string
}
 453
 454 // Refresh recent spot instance pricing data for the given instances,
 455 // unless we already have recent pricing data for all relevant types.
 456 func (instanceSet *ec2InstanceSet) updateSpotPrices(instances []cloud.Instance) {
 457         if len(instances) == 0 {
 458                 return
 459         }
 460
 461         instanceSet.pricesLock.Lock()
 462         defer instanceSet.pricesLock.Unlock()
 463         if instanceSet.prices == nil {
 464                 instanceSet.prices = map[priceKey][]cloud.InstancePrice{}
 465                 instanceSet.pricesUpdated = map[priceKey]time.Time{}
 466         }
 467
 468         updateTime := time.Now()
 469         staleTime := updateTime.Add(-instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
 470         needUpdate := false
 471         allTypes := map[string]bool{}
 472
 473         for _, inst := range instances {
 474                 ec2inst := inst.(*ec2Instance).instance
 475                 if aws.StringValue(ec2inst.InstanceLifecycle) == "spot" {
 476                         pk := priceKey{
 477                                 instanceType:     *ec2inst.InstanceType,
 478                                 spot:             true,
 479                                 availabilityZone: inst.(*ec2Instance).availabilityZone,
 480                         }
 481                         if instanceSet.pricesUpdated[pk].Before(staleTime) {
 482                                 needUpdate = true
 483                         }
 484                         allTypes[*ec2inst.InstanceType] = true
 485                 }
 486         }
 487         if !needUpdate {
 488                 return
 489         }
 490         var typeFilterValues []*string
 491         for instanceType := range allTypes {
 492                 typeFilterValues = append(typeFilterValues, aws.String(instanceType))
 493         }
 494         // Get 3x update interval worth of pricing data. (Ideally the
 495         // AWS API would tell us "we have shown you all of the price
 496         // changes up to time T", but it doesn't, so we'll just ask
 497         // for 3 intervals worth of data on each update, de-duplicate
 498         // the data points, and not worry too much about occasionally
 499         // missing some data points when our lookups fail twice in a
 500         // row.
 501         dsphi := &ec2.DescribeSpotPriceHistoryInput{
 502                 StartTime: aws.Time(updateTime.Add(-3 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())),
 503                 Filters: []*ec2.Filter{
 504                         &ec2.Filter{Name: aws.String("instance-type"), Values: typeFilterValues},
 505                         &ec2.Filter{Name: aws.String("product-description"), Values: []*string{aws.String("Linux/UNIX")}},
 506                 },
 507         }
 508         err := instanceSet.client.DescribeSpotPriceHistoryPages(dsphi, func(page *ec2.DescribeSpotPriceHistoryOutput, lastPage bool) bool {
 509                 for _, ent := range page.SpotPriceHistory {
 510                         if ent.InstanceType == nil || ent.SpotPrice == nil || ent.Timestamp == nil {
 511                                 // bogus record?
 512                                 continue
 513                         }
 514                         price, err := strconv.ParseFloat(*ent.SpotPrice, 64)
 515                         if err != nil {
 516                                 // bogus record?
 517                                 continue
 518                         }
 519                         pk := priceKey{
 520                                 instanceType:     *ent.InstanceType,
 521                                 spot:             true,
 522                                 availabilityZone: *ent.AvailabilityZone,
 523                         }
 524                         instanceSet.prices[pk] = append(instanceSet.prices[pk], cloud.InstancePrice{
 525                                 StartTime: *ent.Timestamp,
 526                                 Price:     price,
 527                         })
 528                         instanceSet.pricesUpdated[pk] = updateTime
 529                 }
 530                 return true
 531         })
 532         if err != nil {
 533                 instanceSet.logger.Warnf("error retrieving spot instance prices: %s", err)
 534         }
 535
 536         expiredTime := updateTime.Add(-64 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
 537         for pk, last := range instanceSet.pricesUpdated {
 538                 if last.Before(expiredTime) {
 539                         delete(instanceSet.pricesUpdated, pk)
 540                         delete(instanceSet.prices, pk)
 541                 }
 542         }
 543         for pk, prices := range instanceSet.prices {
 544                 instanceSet.prices[pk] = cloud.NormalizePriceHistory(prices)
 545         }
 546 }
 547
// Stop does nothing for the ec2 driver.
func (instanceSet *ec2InstanceSet) Stop() {
}
 550
// ec2Instance is the cloud.Instance returned by
// ec2InstanceSet.Create and ec2InstanceSet.Instances. It pairs an
// EC2 instance description with the instance set that produced it.
type ec2Instance struct {
	provider         *ec2InstanceSet
	instance         *ec2.Instance
	availabilityZone string // sometimes available for spot instances
}
 556
 557 func (inst *ec2Instance) ID() cloud.InstanceID {
 558         return cloud.InstanceID(*inst.instance.InstanceId)
 559 }
 560
 561 func (inst *ec2Instance) String() string {
 562         return *inst.instance.InstanceId
 563 }
 564
 565 func (inst *ec2Instance) ProviderType() string {
 566         return *inst.instance.InstanceType
 567 }
 568
 569 func (inst *ec2Instance) SetTags(newTags cloud.InstanceTags) error {
 570         var ec2tags []*ec2.Tag
 571         for k, v := range newTags {
 572                 ec2tags = append(ec2tags, &ec2.Tag{
 573                         Key:   aws.String(k),
 574                         Value: aws.String(v),
 575                 })
 576         }
 577
 578         _, err := inst.provider.client.CreateTags(&ec2.CreateTagsInput{
 579                 Resources: []*string{inst.instance.InstanceId},
 580                 Tags:      ec2tags,
 581         })
 582
 583         return err
 584 }
 585
 586 func (inst *ec2Instance) Tags() cloud.InstanceTags {
 587         tags := make(map[string]string)
 588
 589         for _, t := range inst.instance.Tags {
 590                 tags[*t.Key] = *t.Value
 591         }
 592
 593         return tags
 594 }
 595
 596 func (inst *ec2Instance) Destroy() error {
 597         _, err := inst.provider.client.TerminateInstances(&ec2.TerminateInstancesInput{
 598                 InstanceIds: []*string{inst.instance.InstanceId},
 599         })
 600         return err
 601 }
 602
 603 func (inst *ec2Instance) Address() string {
 604         if inst.instance.PrivateIpAddress != nil {
 605                 return *inst.instance.PrivateIpAddress
 606         }
 607         return ""
 608 }
 609
// RemoteUser returns the AdminUsername from the driver configuration.
func (inst *ec2Instance) RemoteUser() string {
	return inst.provider.ec2config.AdminUsername
}
 613
// VerifyHostKey is not supported by the ec2 driver; it always
// returns cloud.ErrNotImplemented.
func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
	return cloud.ErrNotImplemented
}
 617
 618 // PriceHistory returns the price history for this specific instance.
 619 //
 620 // AWS documentation is elusive about whether the hourly cost of a
 621 // given spot instance changes as the current spot price changes for
 622 // the corresponding instance type and availability zone. Our
 623 // implementation assumes the answer is yes, based on the following
 624 // hints.
 625 //
 626 // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-requests.html
 627 // says: "After your Spot Instance is running, if the Spot price rises
 628 // above your maximum price, Amazon EC2 interrupts your Spot
 629 // Instance." (This doesn't address what happens when the spot price
 630 // rises *without* exceeding your maximum price.)
 631 //
 632 // https://docs.aws.amazon.com/whitepapers/latest/cost-optimization-leveraging-ec2-spot-instances/how-spot-instances-work.html
 633 // says: "You pay the Spot price that's in effect, billed to the
 634 // nearest second." (But it's not explicitly stated whether "the price
 635 // in effect" changes over time for a given instance.)
 636 //
 637 // The same page also says, in a discussion about the effect of
 638 // specifying a maximum price: "Note that you never pay more than the
 639 // Spot price that is in effect when your Spot Instance is running."
 640 // (The use of the phrase "is running", as opposed to "was launched",
 641 // hints that pricing is dynamic.)
 642 func (inst *ec2Instance) PriceHistory(instType arvados.InstanceType) []cloud.InstancePrice {
 643         inst.provider.pricesLock.Lock()
 644         defer inst.provider.pricesLock.Unlock()
 645         // Note updateSpotPrices currently populates
 646         // inst.provider.prices only for spot instances, so if
 647         // spot==false here, we will return no data.
 648         pk := priceKey{
 649                 instanceType:     *inst.instance.InstanceType,
 650                 spot:             aws.StringValue(inst.instance.InstanceLifecycle) == "spot",
 651                 availabilityZone: inst.availabilityZone,
 652         }
 653         var prices []cloud.InstancePrice
 654         for _, price := range inst.provider.prices[pk] {
 655                 // ceil(added scratch space in GiB)
 656                 gib := (instType.AddedScratch + 1<<30 - 1) >> 30
 657                 monthly := inst.provider.ec2config.EBSPrice * float64(gib)
 658                 hourly := monthly / 30 / 24
 659                 price.Price += hourly
 660                 prices = append(prices, price)
 661         }
 662         return prices
 663 }
 664
// rateLimitError wraps an error together with the earliest time at
// which the failed operation should be retried. wrapError returns it
// for throttling errors.
type rateLimitError struct {
	error
	earliestRetry time.Time
}

// EarliestRetry returns the time before which the caller should not
// retry.
func (err rateLimitError) EarliestRetry() time.Time {
	return err.earliestRetry
}
 673
// capacityError wraps an error that indicates a lack of capacity.
// wrapError returns it (with isInstanceTypeSpecific set to true) for
// errors recognized by isErrorCapacity.
type capacityError struct {
	error
	isInstanceTypeSpecific bool
}

// IsCapacityError always returns true.
func (er *capacityError) IsCapacityError() bool {
	return true
}

// IsInstanceTypeSpecific returns the isInstanceTypeSpecific flag the
// error was created with.
func (er *capacityError) IsInstanceTypeSpecific() bool {
	return er.isInstanceTypeSpecific
}
 686
// isCodeQuota is the set of AWS error codes that isErrorQuota treats
// as quota errors. Only the presence of a key is checked there.
var isCodeQuota = map[string]bool{
	"InstanceLimitExceeded":             true,
	"InsufficientAddressCapacity":       true,
	"InsufficientFreeAddressesInSubnet": true,
	"InsufficientVolumeCapacity":        true,
	"MaxSpotInstanceCountExceeded":      true,
	"VcpuLimitExceeded":                 true,
}
 695
 696 // isErrorQuota returns whether the error indicates we have reached
 697 // some usage quota/limit -- i.e., immediately retrying with an equal
 698 // or larger instance type will probably not work.
 699 //
 700 // Returns false if error is nil.
 701 func isErrorQuota(err error) bool {
 702         if aerr, ok := err.(awserr.Error); ok && aerr != nil {
 703                 if _, ok := isCodeQuota[aerr.Code()]; ok {
 704                         return true
 705                 }
 706         }
 707         return false
 708 }
 709
 710 // isErrorSubnetSpecific returns true if the problem encountered by
 711 // RunInstances might be avoided by trying a different subnet.
 712 func isErrorSubnetSpecific(err error) bool {
 713         aerr, ok := err.(awserr.Error)
 714         if !ok {
 715                 return false
 716         }
 717         code := aerr.Code()
 718         return strings.Contains(code, "Subnet") ||
 719                 code == "InsufficientInstanceCapacity" ||
 720                 code == "InsufficientVolumeCapacity" ||
 721                 code == "Unsupported"
 722 }
 723
 724 // isErrorCapacity returns true if the error indicates lack of
 725 // capacity (either temporary or permanent) to run a specific instance
 726 // type -- i.e., retrying with a different instance type might
 727 // succeed.
 728 func isErrorCapacity(err error) bool {
 729         aerr, ok := err.(awserr.Error)
 730         if !ok {
 731                 return false
 732         }
 733         code := aerr.Code()
 734         return code == "InsufficientInstanceCapacity" ||
 735                 (code == "Unsupported" && strings.Contains(aerr.Message(), "requested instance type"))
 736 }
 737
// ec2QuotaError wraps an error recognized by isErrorQuota. wrapError
// returns it for such errors.
type ec2QuotaError struct {
	error
}

// IsQuotaError always returns true.
func (er *ec2QuotaError) IsQuotaError() bool {
	return true
}
 745
 746 func wrapError(err error, throttleValue *atomic.Value) error {
 747         if request.IsErrorThrottle(err) {
 748                 // Back off exponentially until an upstream call
 749                 // either succeeds or returns a non-throttle error.
 750                 d, _ := throttleValue.Load().(time.Duration)
 751                 d = d*3/2 + time.Second
 752                 if d < throttleDelayMin {
 753                         d = throttleDelayMin
 754                 } else if d > throttleDelayMax {
 755                         d = throttleDelayMax
 756                 }
 757                 throttleValue.Store(d)
 758                 return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
 759         } else if isErrorQuota(err) {
 760                 return &ec2QuotaError{err}
 761         } else if isErrorCapacity(err) {
 762                 return &capacityError{err, true}
 763         } else if err != nil {
 764                 throttleValue.Store(time.Duration(0))
 765                 return err
 766         }
 767         throttleValue.Store(time.Duration(0))
 768         return nil
 769 }
 770
// boolLabelValue maps a boolean to the "0"/"1" string used as a
// metric label value (the "success" label in Create).
var boolLabelValue = map[bool]string{false: "0", true: "1"}