lib/cloud/ec2/ec2.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package ec2
   6
   7 import (
   8         "crypto/md5"
   9         "crypto/rsa"
  10         "crypto/sha1"
  11         "crypto/x509"
  12         "encoding/base64"
  13         "encoding/json"
  14         "fmt"
  15         "math/big"
  16         "regexp"
  17         "strconv"
  18         "strings"
  19         "sync"
  20         "sync/atomic"
  21         "time"
  22
  23         "git.arvados.org/arvados.git/lib/cloud"
  24         "git.arvados.org/arvados.git/sdk/go/arvados"
  25         "github.com/aws/aws-sdk-go/aws"
  26         "github.com/aws/aws-sdk-go/aws/awserr"
  27         "github.com/aws/aws-sdk-go/aws/credentials"
  28         "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
  29         "github.com/aws/aws-sdk-go/aws/ec2metadata"
  30         "github.com/aws/aws-sdk-go/aws/request"
  31         "github.com/aws/aws-sdk-go/aws/session"
  32         "github.com/aws/aws-sdk-go/service/ec2"
  33         "github.com/prometheus/client_golang/prometheus"
  34         "github.com/sirupsen/logrus"
  35         "golang.org/x/crypto/ssh"
  36 )
  37
// Driver is the ec2 implementation of the cloud.Driver interface.
// It wraps newEC2InstanceSet with cloud.DriverFunc.
var Driver = cloud.DriverFunc(newEC2InstanceSet)

const (
	// throttleDelayMin and throttleDelayMax are the lower and
	// upper bounds wrapError applies to the retry delay it
	// reports after a throttling error.
	throttleDelayMin = time.Second
	throttleDelayMax = time.Minute
)
  45
// ec2InstanceSetConfig is the driver configuration.
// newEC2InstanceSet fills it in by unmarshalling the JSON config it
// is given.
type ec2InstanceSetConfig struct {
	// AccessKeyID and SecretAccessKey are static credentials.
	// They are tried first; the EC2 instance role credentials
	// are the fallback.
	AccessKeyID             string
	SecretAccessKey         string
	// Region is the AWS region the EC2 client is configured with.
	Region                  string
	// SecurityGroupIDs are attached to the network interface of
	// each instance started by Create.
	SecurityGroupIDs        arvados.StringSet
	// SubnetID lists the subnets Create may launch instances in.
	// It can be configured as a single string or as a list.
	SubnetID                sliceOrSingleString
	// AdminUsername is the value reported by RemoteUser.
	AdminUsername           string
	// EBSVolumeType is the volume type used for added scratch
	// space. newEC2InstanceSet sets it to "gp2" if it is empty.
	EBSVolumeType           string
	// EBSPrice is used by PriceHistory as the price of added
	// scratch space per GiB per 30-day month.
	EBSPrice                float64
	// IAMInstanceProfile, if not empty, is the name of the
	// instance profile requested for new instances.
	IAMInstanceProfile      string
	// SpotPriceUpdateInterval controls spot price tracking:
	// Instances only refreshes spot prices if this is greater
	// than zero, and price data older than this is considered
	// stale.
	SpotPriceUpdateInterval arvados.Duration
}
  58
  59 type sliceOrSingleString []string
  60
  61 // UnmarshalJSON unmarshals an array of strings, and also accepts ""
  62 // as [], and "foo" as ["foo"].
  63 func (ss *sliceOrSingleString) UnmarshalJSON(data []byte) error {
  64         if len(data) == 0 {
  65                 *ss = nil
  66         } else if data[0] == '[' {
  67                 var slice []string
  68                 err := json.Unmarshal(data, &slice)
  69                 if err != nil {
  70                         return err
  71                 }
  72                 if len(slice) == 0 {
  73                         *ss = nil
  74                 } else {
  75                         *ss = slice
  76                 }
  77         } else {
  78                 var str string
  79                 err := json.Unmarshal(data, &str)
  80                 if err != nil {
  81                         return err
  82                 }
  83                 if str == "" {
  84                         *ss = nil
  85                 } else {
  86                         *ss = []string{str}
  87                 }
  88         }
  89         return nil
  90 }
  91
// ec2Interface lists the EC2 API calls this driver makes through
// ec2InstanceSet.client. newEC2InstanceSet stores the client returned
// by ec2.New in that field.
type ec2Interface interface {
	DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error)
	ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error)
	RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error)
	DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error)
	DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error
	DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error
	CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error)
	TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error)
}
 102
// ec2InstanceSet is the cloud.InstanceSet returned by
// newEC2InstanceSet. It holds the driver configuration, the EC2
// client, and cached key pair names, spot prices and metrics.
type ec2InstanceSet struct {
	ec2config              ec2InstanceSetConfig
	// currentSubnetIDIndex is the index in ec2config.SubnetID of
	// the subnet Create tries first. It is read and written with
	// sync/atomic.
	currentSubnetIDIndex   int32
	instanceSetID          cloud.InstanceSetID
	logger                 logrus.FieldLogger
	client                 ec2Interface
	// keysMtx is held by getKeyName while it reads and updates
	// keys.
	keysMtx                sync.Mutex
	// keys maps an MD5 key fingerprint (as computed by
	// awsKeyFingerprint) to the name of the EC2 key pair.
	keys                   map[string]string
	// throttleDelayCreate and throttleDelayInstances hold the
	// current backoff delay (a time.Duration) maintained by
	// wrapError for Create and Instances respectively.
	throttleDelayCreate    atomic.Value
	throttleDelayInstances atomic.Value

	// prices holds the spot price history collected by
	// updateSpotPrices, and pricesUpdated the time each key was
	// last refreshed. Both are accessed with pricesLock held.
	prices        map[priceKey][]cloud.InstancePrice
	pricesLock    sync.Mutex
	pricesUpdated map[priceKey]time.Time

	// mInstances reports the number of instances per subnet;
	// mInstanceStarts counts RunInstances attempts per subnet
	// and outcome.
	mInstances      *prometheus.GaugeVec
	mInstanceStarts *prometheus.CounterVec
}
 121
 122 func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (prv cloud.InstanceSet, err error) {
 123         instanceSet := &ec2InstanceSet{
 124                 instanceSetID: instanceSetID,
 125                 logger:        logger,
 126         }
 127         err = json.Unmarshal(config, &instanceSet.ec2config)
 128         if err != nil {
 129                 return nil, err
 130         }
 131
 132         sess, err := session.NewSession()
 133         if err != nil {
 134                 return nil, err
 135         }
 136         // First try any static credentials, fall back to an IAM instance profile/role
 137         creds := credentials.NewChainCredentials(
 138                 []credentials.Provider{
 139                         &credentials.StaticProvider{Value: credentials.Value{AccessKeyID: instanceSet.ec2config.AccessKeyID, SecretAccessKey: instanceSet.ec2config.SecretAccessKey}},
 140                         &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(sess)},
 141                 })
 142
 143         awsConfig := aws.NewConfig().WithCredentials(creds).WithRegion(instanceSet.ec2config.Region)
 144         instanceSet.client = ec2.New(session.Must(session.NewSession(awsConfig)))
 145         instanceSet.keys = make(map[string]string)
 146         if instanceSet.ec2config.EBSVolumeType == "" {
 147                 instanceSet.ec2config.EBSVolumeType = "gp2"
 148         }
 149
 150         // Set up metrics
 151         instanceSet.mInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 152                 Namespace: "arvados",
 153                 Subsystem: "dispatchcloud",
 154                 Name:      "ec2_instances",
 155                 Help:      "Number of instances running",
 156         }, []string{"subnet_id"})
 157         instanceSet.mInstanceStarts = prometheus.NewCounterVec(prometheus.CounterOpts{
 158                 Namespace: "arvados",
 159                 Subsystem: "dispatchcloud",
 160                 Name:      "ec2_instance_starts_total",
 161                 Help:      "Number of attempts to start a new instance",
 162         }, []string{"subnet_id", "success"})
 163         // Initialize all of the series we'll be reporting.  Otherwise
 164         // the {subnet=A, success=0} series doesn't appear in metrics
 165         // at all until there's a failure in subnet A.
 166         for _, subnet := range instanceSet.ec2config.SubnetID {
 167                 instanceSet.mInstanceStarts.WithLabelValues(subnet, "0").Add(0)
 168                 instanceSet.mInstanceStarts.WithLabelValues(subnet, "1").Add(0)
 169         }
 170         if len(instanceSet.ec2config.SubnetID) == 0 {
 171                 instanceSet.mInstanceStarts.WithLabelValues("", "0").Add(0)
 172                 instanceSet.mInstanceStarts.WithLabelValues("", "1").Add(0)
 173         }
 174         if reg != nil {
 175                 reg.MustRegister(instanceSet.mInstances)
 176                 reg.MustRegister(instanceSet.mInstanceStarts)
 177         }
 178
 179         return instanceSet, nil
 180 }
 181
 182 func awsKeyFingerprint(pk ssh.PublicKey) (md5fp string, sha1fp string, err error) {
 183         // AWS key fingerprints don't use the usual key fingerprint
 184         // you get from ssh-keygen or ssh.FingerprintLegacyMD5()
 185         // (you can get that from md5.Sum(pk.Marshal())
 186         //
 187         // AWS uses the md5 or sha1 of the PKIX DER encoding of the
 188         // public key, so calculate those fingerprints here.
 189         var rsaPub struct {
 190                 Name string
 191                 E    *big.Int
 192                 N    *big.Int
 193         }
 194         if err := ssh.Unmarshal(pk.Marshal(), &rsaPub); err != nil {
 195                 return "", "", fmt.Errorf("agent: Unmarshal failed to parse public key: %v", err)
 196         }
 197         rsaPk := rsa.PublicKey{
 198                 E: int(rsaPub.E.Int64()),
 199                 N: rsaPub.N,
 200         }
 201         pkix, _ := x509.MarshalPKIXPublicKey(&rsaPk)
 202         md5pkix := md5.Sum([]byte(pkix))
 203         sha1pkix := sha1.Sum([]byte(pkix))
 204         md5fp = ""
 205         sha1fp = ""
 206         for i := 0; i < len(md5pkix); i++ {
 207                 md5fp += fmt.Sprintf(":%02x", md5pkix[i])
 208         }
 209         for i := 0; i < len(sha1pkix); i++ {
 210                 sha1fp += fmt.Sprintf(":%02x", sha1pkix[i])
 211         }
 212         return md5fp[1:], sha1fp[1:], nil
 213 }
 214
 215 func (instanceSet *ec2InstanceSet) Create(
 216         instanceType arvados.InstanceType,
 217         imageID cloud.ImageID,
 218         newTags cloud.InstanceTags,
 219         initCommand cloud.InitCommand,
 220         publicKey ssh.PublicKey) (cloud.Instance, error) {
 221
 222         ec2tags := []*ec2.Tag{}
 223         for k, v := range newTags {
 224                 ec2tags = append(ec2tags, &ec2.Tag{
 225                         Key:   aws.String(k),
 226                         Value: aws.String(v),
 227                 })
 228         }
 229
 230         var groups []string
 231         for sg := range instanceSet.ec2config.SecurityGroupIDs {
 232                 groups = append(groups, sg)
 233         }
 234
 235         rii := ec2.RunInstancesInput{
 236                 ImageId:      aws.String(string(imageID)),
 237                 InstanceType: &instanceType.ProviderType,
 238                 MaxCount:     aws.Int64(1),
 239                 MinCount:     aws.Int64(1),
 240
 241                 NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
 242                         {
 243                                 AssociatePublicIpAddress: aws.Bool(false),
 244                                 DeleteOnTermination:      aws.Bool(true),
 245                                 DeviceIndex:              aws.Int64(0),
 246                                 Groups:                   aws.StringSlice(groups),
 247                         }},
 248                 DisableApiTermination:             aws.Bool(false),
 249                 InstanceInitiatedShutdownBehavior: aws.String("terminate"),
 250                 TagSpecifications: []*ec2.TagSpecification{
 251                         {
 252                                 ResourceType: aws.String("instance"),
 253                                 Tags:         ec2tags,
 254                         }},
 255                 MetadataOptions: &ec2.InstanceMetadataOptionsRequest{
 256                         // Require IMDSv2, as described at
 257                         // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-IMDS-new-instances.html
 258                         HttpEndpoint: aws.String(ec2.InstanceMetadataEndpointStateEnabled),
 259                         HttpTokens:   aws.String(ec2.HttpTokensStateRequired),
 260                 },
 261                 UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))),
 262         }
 263
 264         if publicKey != nil {
 265                 keyname, err := instanceSet.getKeyName(publicKey)
 266                 if err != nil {
 267                         return nil, err
 268                 }
 269                 rii.KeyName = &keyname
 270         }
 271
 272         if instanceType.AddedScratch > 0 {
 273                 rii.BlockDeviceMappings = []*ec2.BlockDeviceMapping{{
 274                         DeviceName: aws.String("/dev/xvdt"),
 275                         Ebs: &ec2.EbsBlockDevice{
 276                                 DeleteOnTermination: aws.Bool(true),
 277                                 VolumeSize:          aws.Int64((int64(instanceType.AddedScratch) + (1<<30 - 1)) >> 30),
 278                                 VolumeType:          &instanceSet.ec2config.EBSVolumeType,
 279                         }}}
 280         }
 281
 282         if instanceType.Preemptible {
 283                 rii.InstanceMarketOptions = &ec2.InstanceMarketOptionsRequest{
 284                         MarketType: aws.String("spot"),
 285                         SpotOptions: &ec2.SpotMarketOptions{
 286                                 InstanceInterruptionBehavior: aws.String("terminate"),
 287                                 MaxPrice:                     aws.String(fmt.Sprintf("%v", instanceType.Price)),
 288                         }}
 289         }
 290
 291         if instanceSet.ec2config.IAMInstanceProfile != "" {
 292                 rii.IamInstanceProfile = &ec2.IamInstanceProfileSpecification{
 293                         Name: aws.String(instanceSet.ec2config.IAMInstanceProfile),
 294                 }
 295         }
 296
 297         var rsv *ec2.Reservation
 298         var errToReturn error
 299         subnets := instanceSet.ec2config.SubnetID
 300         currentSubnetIDIndex := int(atomic.LoadInt32(&instanceSet.currentSubnetIDIndex))
 301         for tryOffset := 0; ; tryOffset++ {
 302                 tryIndex := 0
 303                 trySubnet := ""
 304                 if len(subnets) > 0 {
 305                         tryIndex = (currentSubnetIDIndex + tryOffset) % len(subnets)
 306                         trySubnet = subnets[tryIndex]
 307                         rii.NetworkInterfaces[0].SubnetId = aws.String(trySubnet)
 308                 }
 309                 var err error
 310                 rsv, err = instanceSet.client.RunInstances(&rii)
 311                 instanceSet.mInstanceStarts.WithLabelValues(trySubnet, boolLabelValue[err == nil]).Add(1)
 312                 if !isErrorCapacity(errToReturn) || isErrorCapacity(err) {
 313                         // We want to return the last capacity error,
 314                         // if any; otherwise the last non-capacity
 315                         // error.
 316                         errToReturn = err
 317                 }
 318                 if isErrorSubnetSpecific(err) &&
 319                         tryOffset < len(subnets)-1 {
 320                         instanceSet.logger.WithError(err).WithField("SubnetID", subnets[tryIndex]).
 321                                 Warn("RunInstances failed, trying next subnet")
 322                         continue
 323                 }
 324                 // Succeeded, or exhausted all subnets, or got a
 325                 // non-subnet-related error.
 326                 //
 327                 // We intentionally update currentSubnetIDIndex even
 328                 // in the non-retryable-failure case here to avoid a
 329                 // situation where successive calls to Create() keep
 330                 // returning errors for the same subnet (perhaps
 331                 // "subnet full") and never reveal the errors for the
 332                 // other configured subnets (perhaps "subnet ID
 333                 // invalid").
 334                 atomic.StoreInt32(&instanceSet.currentSubnetIDIndex, int32(tryIndex))
 335                 break
 336         }
 337         if rsv == nil || len(rsv.Instances) == 0 {
 338                 return nil, wrapError(errToReturn, &instanceSet.throttleDelayCreate)
 339         }
 340         return &ec2Instance{
 341                 provider: instanceSet,
 342                 instance: rsv.Instances[0],
 343         }, nil
 344 }
 345
 346 func (instanceSet *ec2InstanceSet) getKeyName(publicKey ssh.PublicKey) (string, error) {
 347         instanceSet.keysMtx.Lock()
 348         defer instanceSet.keysMtx.Unlock()
 349         md5keyFingerprint, sha1keyFingerprint, err := awsKeyFingerprint(publicKey)
 350         if err != nil {
 351                 return "", fmt.Errorf("Could not make key fingerprint: %v", err)
 352         }
 353         if keyname, ok := instanceSet.keys[md5keyFingerprint]; ok {
 354                 return keyname, nil
 355         }
 356         keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{
 357                 Filters: []*ec2.Filter{{
 358                         Name:   aws.String("fingerprint"),
 359                         Values: []*string{&md5keyFingerprint, &sha1keyFingerprint},
 360                 }},
 361         })
 362         if err != nil {
 363                 return "", fmt.Errorf("Could not search for keypair: %v", err)
 364         }
 365         if len(keyout.KeyPairs) > 0 {
 366                 return *(keyout.KeyPairs[0].KeyName), nil
 367         }
 368         keyname := "arvados-dispatch-keypair-" + md5keyFingerprint
 369         _, err = instanceSet.client.ImportKeyPair(&ec2.ImportKeyPairInput{
 370                 KeyName:           &keyname,
 371                 PublicKeyMaterial: ssh.MarshalAuthorizedKey(publicKey),
 372         })
 373         if err != nil {
 374                 return "", fmt.Errorf("Could not import keypair: %v", err)
 375         }
 376         instanceSet.keys[md5keyFingerprint] = keyname
 377         return keyname, nil
 378 }
 379
 380 func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances []cloud.Instance, err error) {
 381         var filters []*ec2.Filter
 382         for k, v := range tags {
 383                 filters = append(filters, &ec2.Filter{
 384                         Name:   aws.String("tag:" + k),
 385                         Values: []*string{aws.String(v)},
 386                 })
 387         }
 388         needAZs := false
 389         dii := &ec2.DescribeInstancesInput{Filters: filters}
 390         for {
 391                 dio, err := instanceSet.client.DescribeInstances(dii)
 392                 err = wrapError(err, &instanceSet.throttleDelayInstances)
 393                 if err != nil {
 394                         return nil, err
 395                 }
 396
 397                 for _, rsv := range dio.Reservations {
 398                         for _, inst := range rsv.Instances {
 399                                 if *inst.State.Name != "shutting-down" && *inst.State.Name != "terminated" {
 400                                         instances = append(instances, &ec2Instance{
 401                                                 provider: instanceSet,
 402                                                 instance: inst,
 403                                         })
 404                                         if aws.StringValue(inst.InstanceLifecycle) == "spot" {
 405                                                 needAZs = true
 406                                         }
 407                                 }
 408                         }
 409                 }
 410                 if dio.NextToken == nil {
 411                         break
 412                 }
 413                 dii.NextToken = dio.NextToken
 414         }
 415         if needAZs && instanceSet.ec2config.SpotPriceUpdateInterval > 0 {
 416                 az := map[string]string{}
 417                 err := instanceSet.client.DescribeInstanceStatusPages(&ec2.DescribeInstanceStatusInput{
 418                         IncludeAllInstances: aws.Bool(true),
 419                 }, func(page *ec2.DescribeInstanceStatusOutput, lastPage bool) bool {
 420                         for _, ent := range page.InstanceStatuses {
 421                                 az[*ent.InstanceId] = *ent.AvailabilityZone
 422                         }
 423                         return true
 424                 })
 425                 if err != nil {
 426                         instanceSet.logger.Warnf("error getting instance statuses: %s", err)
 427                 }
 428                 for _, inst := range instances {
 429                         inst := inst.(*ec2Instance)
 430                         inst.availabilityZone = az[*inst.instance.InstanceId]
 431                 }
 432                 instanceSet.updateSpotPrices(instances)
 433         }
 434
 435         // Count instances in each subnet, and report in metrics.
 436         subnetInstances := map[string]int{"": 0}
 437         for _, subnet := range instanceSet.ec2config.SubnetID {
 438                 subnetInstances[subnet] = 0
 439         }
 440         for _, inst := range instances {
 441                 subnet := inst.(*ec2Instance).instance.SubnetId
 442                 if subnet != nil {
 443                         subnetInstances[*subnet]++
 444                 } else {
 445                         subnetInstances[""]++
 446                 }
 447         }
 448         for subnet, count := range subnetInstances {
 449                 instanceSet.mInstances.WithLabelValues(subnet).Set(float64(count))
 450         }
 451
 452         return instances, err
 453 }
 454
// priceKey identifies one series of price data in
// ec2InstanceSet.prices and ec2InstanceSet.pricesUpdated.
type priceKey struct {
	instanceType     string
	// spot is true for spot pricing; updateSpotPrices only
	// stores entries with spot set to true.
	spot             bool
	availabilityZone string
}
 460
 461 // Refresh recent spot instance pricing data for the given instances,
 462 // unless we already have recent pricing data for all relevant types.
 463 func (instanceSet *ec2InstanceSet) updateSpotPrices(instances []cloud.Instance) {
 464         if len(instances) == 0 {
 465                 return
 466         }
 467
 468         instanceSet.pricesLock.Lock()
 469         defer instanceSet.pricesLock.Unlock()
 470         if instanceSet.prices == nil {
 471                 instanceSet.prices = map[priceKey][]cloud.InstancePrice{}
 472                 instanceSet.pricesUpdated = map[priceKey]time.Time{}
 473         }
 474
 475         updateTime := time.Now()
 476         staleTime := updateTime.Add(-instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
 477         needUpdate := false
 478         allTypes := map[string]bool{}
 479
 480         for _, inst := range instances {
 481                 ec2inst := inst.(*ec2Instance).instance
 482                 if aws.StringValue(ec2inst.InstanceLifecycle) == "spot" {
 483                         pk := priceKey{
 484                                 instanceType:     *ec2inst.InstanceType,
 485                                 spot:             true,
 486                                 availabilityZone: inst.(*ec2Instance).availabilityZone,
 487                         }
 488                         if instanceSet.pricesUpdated[pk].Before(staleTime) {
 489                                 needUpdate = true
 490                         }
 491                         allTypes[*ec2inst.InstanceType] = true
 492                 }
 493         }
 494         if !needUpdate {
 495                 return
 496         }
 497         var typeFilterValues []*string
 498         for instanceType := range allTypes {
 499                 typeFilterValues = append(typeFilterValues, aws.String(instanceType))
 500         }
 501         // Get 3x update interval worth of pricing data. (Ideally the
 502         // AWS API would tell us "we have shown you all of the price
 503         // changes up to time T", but it doesn't, so we'll just ask
 504         // for 3 intervals worth of data on each update, de-duplicate
 505         // the data points, and not worry too much about occasionally
 506         // missing some data points when our lookups fail twice in a
 507         // row.
 508         dsphi := &ec2.DescribeSpotPriceHistoryInput{
 509                 StartTime: aws.Time(updateTime.Add(-3 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())),
 510                 Filters: []*ec2.Filter{
 511                         &ec2.Filter{Name: aws.String("instance-type"), Values: typeFilterValues},
 512                         &ec2.Filter{Name: aws.String("product-description"), Values: []*string{aws.String("Linux/UNIX")}},
 513                 },
 514         }
 515         err := instanceSet.client.DescribeSpotPriceHistoryPages(dsphi, func(page *ec2.DescribeSpotPriceHistoryOutput, lastPage bool) bool {
 516                 for _, ent := range page.SpotPriceHistory {
 517                         if ent.InstanceType == nil || ent.SpotPrice == nil || ent.Timestamp == nil {
 518                                 // bogus record?
 519                                 continue
 520                         }
 521                         price, err := strconv.ParseFloat(*ent.SpotPrice, 64)
 522                         if err != nil {
 523                                 // bogus record?
 524                                 continue
 525                         }
 526                         pk := priceKey{
 527                                 instanceType:     *ent.InstanceType,
 528                                 spot:             true,
 529                                 availabilityZone: *ent.AvailabilityZone,
 530                         }
 531                         instanceSet.prices[pk] = append(instanceSet.prices[pk], cloud.InstancePrice{
 532                                 StartTime: *ent.Timestamp,
 533                                 Price:     price,
 534                         })
 535                         instanceSet.pricesUpdated[pk] = updateTime
 536                 }
 537                 return true
 538         })
 539         if err != nil {
 540                 instanceSet.logger.Warnf("error retrieving spot instance prices: %s", err)
 541         }
 542
 543         expiredTime := updateTime.Add(-64 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
 544         for pk, last := range instanceSet.pricesUpdated {
 545                 if last.Before(expiredTime) {
 546                         delete(instanceSet.pricesUpdated, pk)
 547                         delete(instanceSet.prices, pk)
 548                 }
 549         }
 550         for pk, prices := range instanceSet.prices {
 551                 instanceSet.prices[pk] = cloud.NormalizePriceHistory(prices)
 552         }
 553 }
 554
// Stop does nothing in this driver.
func (instanceSet *ec2InstanceSet) Stop() {
}
 557
// ec2Instance is the cloud.Instance implementation returned by
// ec2InstanceSet's Create and Instances methods.
type ec2Instance struct {
	// provider is the instance set that created this value.
	provider         *ec2InstanceSet
	// instance is the instance description received from EC2.
	instance         *ec2.Instance
	availabilityZone string // sometimes available for spot instances
}
 563
// ID returns the EC2 instance ID.
func (inst *ec2Instance) ID() cloud.InstanceID {
	return cloud.InstanceID(*inst.instance.InstanceId)
}
 567
// String returns the EC2 instance ID.
func (inst *ec2Instance) String() string {
	return *inst.instance.InstanceId
}
 571
// ProviderType returns the EC2 instance type as reported by EC2.
func (inst *ec2Instance) ProviderType() string {
	return *inst.instance.InstanceType
}
 575
 576 func (inst *ec2Instance) SetTags(newTags cloud.InstanceTags) error {
 577         var ec2tags []*ec2.Tag
 578         for k, v := range newTags {
 579                 ec2tags = append(ec2tags, &ec2.Tag{
 580                         Key:   aws.String(k),
 581                         Value: aws.String(v),
 582                 })
 583         }
 584
 585         _, err := inst.provider.client.CreateTags(&ec2.CreateTagsInput{
 586                 Resources: []*string{inst.instance.InstanceId},
 587                 Tags:      ec2tags,
 588         })
 589
 590         return err
 591 }
 592
 593 func (inst *ec2Instance) Tags() cloud.InstanceTags {
 594         tags := make(map[string]string)
 595
 596         for _, t := range inst.instance.Tags {
 597                 tags[*t.Key] = *t.Value
 598         }
 599
 600         return tags
 601 }
 602
 603 func (inst *ec2Instance) Destroy() error {
 604         _, err := inst.provider.client.TerminateInstances(&ec2.TerminateInstancesInput{
 605                 InstanceIds: []*string{inst.instance.InstanceId},
 606         })
 607         return err
 608 }
 609
 610 func (inst *ec2Instance) Address() string {
 611         if inst.instance.PrivateIpAddress != nil {
 612                 return *inst.instance.PrivateIpAddress
 613         }
 614         return ""
 615 }
 616
// RemoteUser returns the configured AdminUsername.
func (inst *ec2Instance) RemoteUser() string {
	return inst.provider.ec2config.AdminUsername
}
 620
// VerifyHostKey is not supported by this driver: it always returns
// cloud.ErrNotImplemented.
func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
	return cloud.ErrNotImplemented
}
 624
 625 // PriceHistory returns the price history for this specific instance.
 626 //
 627 // AWS documentation is elusive about whether the hourly cost of a
 628 // given spot instance changes as the current spot price changes for
 629 // the corresponding instance type and availability zone. Our
 630 // implementation assumes the answer is yes, based on the following
 631 // hints.
 632 //
 633 // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-requests.html
 634 // says: "After your Spot Instance is running, if the Spot price rises
 635 // above your maximum price, Amazon EC2 interrupts your Spot
 636 // Instance." (This doesn't address what happens when the spot price
 637 // rises *without* exceeding your maximum price.)
 638 //
 639 // https://docs.aws.amazon.com/whitepapers/latest/cost-optimization-leveraging-ec2-spot-instances/how-spot-instances-work.html
 640 // says: "You pay the Spot price that's in effect, billed to the
 641 // nearest second." (But it's not explicitly stated whether "the price
 642 // in effect" changes over time for a given instance.)
 643 //
 644 // The same page also says, in a discussion about the effect of
 645 // specifying a maximum price: "Note that you never pay more than the
 646 // Spot price that is in effect when your Spot Instance is running."
 647 // (The use of the phrase "is running", as opposed to "was launched",
 648 // hints that pricing is dynamic.)
 649 func (inst *ec2Instance) PriceHistory(instType arvados.InstanceType) []cloud.InstancePrice {
 650         inst.provider.pricesLock.Lock()
 651         defer inst.provider.pricesLock.Unlock()
 652         // Note updateSpotPrices currently populates
 653         // inst.provider.prices only for spot instances, so if
 654         // spot==false here, we will return no data.
 655         pk := priceKey{
 656                 instanceType:     *inst.instance.InstanceType,
 657                 spot:             aws.StringValue(inst.instance.InstanceLifecycle) == "spot",
 658                 availabilityZone: inst.availabilityZone,
 659         }
 660         var prices []cloud.InstancePrice
 661         for _, price := range inst.provider.prices[pk] {
 662                 // ceil(added scratch space in GiB)
 663                 gib := (instType.AddedScratch + 1<<30 - 1) >> 30
 664                 monthly := inst.provider.ec2config.EBSPrice * float64(gib)
 665                 hourly := monthly / 30 / 24
 666                 price.Price += hourly
 667                 prices = append(prices, price)
 668         }
 669         return prices
 670 }
 671
 672 type rateLimitError struct {
 673         error
 674         earliestRetry time.Time
 675 }
 676
 677 func (err rateLimitError) EarliestRetry() time.Time {
 678         return err.earliestRetry
 679 }
 680
 681 type capacityError struct {
 682         error
 683         isInstanceTypeSpecific bool
 684 }
 685
 686 func (er *capacityError) IsCapacityError() bool {
 687         return true
 688 }
 689
 690 func (er *capacityError) IsInstanceTypeSpecific() bool {
 691         return er.isInstanceTypeSpecific
 692 }
 693
 694 var isCodeQuota = map[string]bool{
 695         "InstanceLimitExceeded":             true,
 696         "InsufficientAddressCapacity":       true,
 697         "InsufficientFreeAddressesInSubnet": true,
 698         "InsufficientVolumeCapacity":        true,
 699         "MaxSpotInstanceCountExceeded":      true,
 700         "VcpuLimitExceeded":                 true,
 701 }
 702
 703 // isErrorQuota returns whether the error indicates we have reached
 704 // some usage quota/limit -- i.e., immediately retrying with an equal
 705 // or larger instance type will probably not work.
 706 //
 707 // Returns false if error is nil.
 708 func isErrorQuota(err error) bool {
 709         if aerr, ok := err.(awserr.Error); ok && aerr != nil {
 710                 if _, ok := isCodeQuota[aerr.Code()]; ok {
 711                         return true
 712                 }
 713         }
 714         return false
 715 }
 716
 717 var reSubnetSpecificInvalidParameterMessage = regexp.MustCompile(`(?ms).*( subnet |sufficient free [Ii]pv[46] addresses).*`)
 718
 719 // isErrorSubnetSpecific returns true if the problem encountered by
 720 // RunInstances might be avoided by trying a different subnet.
 721 func isErrorSubnetSpecific(err error) bool {
 722         aerr, ok := err.(awserr.Error)
 723         if !ok {
 724                 return false
 725         }
 726         code := aerr.Code()
 727         return strings.Contains(code, "Subnet") ||
 728                 code == "InsufficientInstanceCapacity" ||
 729                 code == "InsufficientVolumeCapacity" ||
 730                 code == "Unsupported" ||
 731                 // See TestIsErrorSubnetSpecific for examples of why
 732                 // we look for substrings in code/message instead of
 733                 // only using specific codes here.
 734                 (strings.Contains(code, "InvalidParameter") &&
 735                         reSubnetSpecificInvalidParameterMessage.MatchString(aerr.Message()))
 736 }
 737
 738 // isErrorCapacity returns true if the error indicates lack of
 739 // capacity (either temporary or permanent) to run a specific instance
 740 // type -- i.e., retrying with a different instance type might
 741 // succeed.
 742 func isErrorCapacity(err error) bool {
 743         aerr, ok := err.(awserr.Error)
 744         if !ok {
 745                 return false
 746         }
 747         code := aerr.Code()
 748         return code == "InsufficientInstanceCapacity" ||
 749                 (code == "Unsupported" && strings.Contains(aerr.Message(), "requested instance type"))
 750 }
 751
 752 type ec2QuotaError struct {
 753         error
 754 }
 755
 756 func (er *ec2QuotaError) IsQuotaError() bool {
 757         return true
 758 }
 759
 760 func wrapError(err error, throttleValue *atomic.Value) error {
 761         if request.IsErrorThrottle(err) {
 762                 // Back off exponentially until an upstream call
 763                 // either succeeds or returns a non-throttle error.
 764                 d, _ := throttleValue.Load().(time.Duration)
 765                 d = d*3/2 + time.Second
 766                 if d < throttleDelayMin {
 767                         d = throttleDelayMin
 768                 } else if d > throttleDelayMax {
 769                         d = throttleDelayMax
 770                 }
 771                 throttleValue.Store(d)
 772                 return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
 773         } else if isErrorQuota(err) {
 774                 return &ec2QuotaError{err}
 775         } else if isErrorCapacity(err) {
 776                 return &capacityError{err, true}
 777         } else if err != nil {
 778                 throttleValue.Store(time.Duration(0))
 779                 return err
 780         }
 781         throttleValue.Store(time.Duration(0))
 782         return nil
 783 }
 784
// boolLabelValue maps a bool to the "0"/"1" string that Create uses
// as the "success" label value of the mInstanceStarts metric.
var boolLabelValue = map[bool]string{false: "0", true: "1"}