X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/e7d06f9da1396e2ba69318b3bc8d3de690c6437a..756b80504e55ff7d9b9ec3f221bd11e231e9c1c6:/lib/cloud/ec2/ec2.go diff --git a/lib/cloud/ec2/ec2.go b/lib/cloud/ec2/ec2.go index 29062c491e..e2cf5e0f1c 100644 --- a/lib/cloud/ec2/ec2.go +++ b/lib/cloud/ec2/ec2.go @@ -13,12 +13,19 @@ import ( "encoding/json" "fmt" "math/big" + "strconv" "sync" + "sync/atomic" + "time" "git.arvados.org/arvados.git/lib/cloud" "git.arvados.org/arvados.git/sdk/go/arvados" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds" + "github.com/aws/aws-sdk-go/aws/ec2metadata" + "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" "github.com/sirupsen/logrus" @@ -28,14 +35,22 @@ import ( // Driver is the ec2 implementation of the cloud.Driver interface. var Driver = cloud.DriverFunc(newEC2InstanceSet) +const ( + throttleDelayMin = time.Second + throttleDelayMax = time.Minute +) + type ec2InstanceSetConfig struct { - AccessKeyID string - SecretAccessKey string - Region string - SecurityGroupIDs arvados.StringSet - SubnetID string - AdminUsername string - EBSVolumeType string + AccessKeyID string + SecretAccessKey string + Region string + SecurityGroupIDs arvados.StringSet + SubnetID string + AdminUsername string + EBSVolumeType string + EBSPrice float64 + IAMInstanceProfile string + SpotPriceUpdateInterval arvados.Duration } type ec2Interface interface { @@ -43,17 +58,25 @@ type ec2Interface interface { ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error) RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error) DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error) + DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error + DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) } type ec2InstanceSet struct { - ec2config ec2InstanceSetConfig - instanceSetID cloud.InstanceSetID - logger logrus.FieldLogger - client ec2Interface - keysMtx sync.Mutex - keys map[string]string + ec2config ec2InstanceSetConfig + instanceSetID cloud.InstanceSetID + logger logrus.FieldLogger + client ec2Interface + keysMtx sync.Mutex + keys map[string]string + throttleDelayCreate atomic.Value + throttleDelayInstances atomic.Value + + prices map[priceKey][]cloud.InstancePrice + pricesLock sync.Mutex + pricesUpdated map[priceKey]time.Time } func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) { @@ -65,12 +88,19 @@ func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID if err != nil { return nil, err } - awsConfig := aws.NewConfig(). - WithCredentials(credentials.NewStaticCredentials( - instanceSet.ec2config.AccessKeyID, - instanceSet.ec2config.SecretAccessKey, - "")). - WithRegion(instanceSet.ec2config.Region) + + sess, err := session.NewSession() + if err != nil { + return nil, err + } + // First try any static credentials, fall back to an IAM instance profile/role + creds := credentials.NewChainCredentials( + []credentials.Provider{ + &credentials.StaticProvider{Value: credentials.Value{AccessKeyID: instanceSet.ec2config.AccessKeyID, SecretAccessKey: instanceSet.ec2config.SecretAccessKey}}, + &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(sess)}, + }) + + awsConfig := aws.NewConfig().WithCredentials(creds).WithRegion(instanceSet.ec2config.Region) instanceSet.client = ec2.New(session.Must(session.NewSession(awsConfig))) instanceSet.keys = make(map[string]string) if instanceSet.ec2config.EBSVolumeType == "" { @@ -103,10 +133,10 @@ func awsKeyFingerprint(pk ssh.PublicKey) (md5fp string, sha1fp string, err error sha1pkix := sha1.Sum([]byte(pkix)) md5fp = "" sha1fp = "" - for i := 0; i < len(md5pkix); i += 1 { + for i := 0; i < len(md5pkix); i++ { md5fp += fmt.Sprintf(":%02x", md5pkix[i]) } - for i := 0; i < len(sha1pkix); i += 1 { + for i := 0; i < len(sha1pkix); i++ { sha1fp += fmt.Sprintf(":%02x", sha1pkix[i]) } return md5fp[1:], sha1fp[1:], nil @@ -119,40 +149,6 @@ func (instanceSet *ec2InstanceSet) Create( initCommand cloud.InitCommand, publicKey ssh.PublicKey) (cloud.Instance, error) { - md5keyFingerprint, sha1keyFingerprint, err := awsKeyFingerprint(publicKey) - if err != nil { - return nil, fmt.Errorf("Could not make key fingerprint: %v", err) - } - instanceSet.keysMtx.Lock() - var keyname string - var ok bool - if keyname, ok = instanceSet.keys[md5keyFingerprint]; !ok { - keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{ - Filters: []*ec2.Filter{{ - Name: aws.String("fingerprint"), - Values: []*string{&md5keyFingerprint, &sha1keyFingerprint}, - }}, - }) - if err != nil { - return nil, fmt.Errorf("Could not search for keypair: %v", err) - } - - if len(keyout.KeyPairs) > 0 { - keyname = *(keyout.KeyPairs[0].KeyName) - } else { - keyname = "arvados-dispatch-keypair-" + md5keyFingerprint - _, err := instanceSet.client.ImportKeyPair(&ec2.ImportKeyPairInput{ - KeyName: &keyname, - PublicKeyMaterial: ssh.MarshalAuthorizedKey(publicKey), - }) - if err != nil { - return nil, fmt.Errorf("Could not import keypair: %v", err) - } - } - instanceSet.keys[md5keyFingerprint] = keyname - } - instanceSet.keysMtx.Unlock() - ec2tags := []*ec2.Tag{} for k, v := range newTags { ec2tags = append(ec2tags, &ec2.Tag{ @@ -171,7 +167,6 @@ func (instanceSet *ec2InstanceSet) Create( InstanceType: &instanceType.ProviderType, MaxCount: aws.Int64(1), MinCount: aws.Int64(1), - KeyName: &keyname, NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{ { @@ -191,6 +186,14 @@ func (instanceSet *ec2InstanceSet) Create( UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))), } + if publicKey != nil { + keyname, err := instanceSet.getKeyName(publicKey) + if err != nil { + return nil, err + } + rii.KeyName = &keyname + } + if instanceType.AddedScratch > 0 { rii.BlockDeviceMappings = []*ec2.BlockDeviceMapping{{ DeviceName: aws.String("/dev/xvdt"), @@ -210,18 +213,57 @@ func (instanceSet *ec2InstanceSet) Create( }} } - rsv, err := instanceSet.client.RunInstances(&rii) + if instanceSet.ec2config.IAMInstanceProfile != "" { + rii.IamInstanceProfile = &ec2.IamInstanceProfileSpecification{ + Name: aws.String(instanceSet.ec2config.IAMInstanceProfile), + } + } + rsv, err := instanceSet.client.RunInstances(&rii) + err = wrapError(err, &instanceSet.throttleDelayCreate) if err != nil { return nil, err } - return &ec2Instance{ provider: instanceSet, instance: rsv.Instances[0], }, nil } +func (instanceSet *ec2InstanceSet) getKeyName(publicKey ssh.PublicKey) (string, error) { + instanceSet.keysMtx.Lock() + defer instanceSet.keysMtx.Unlock() + md5keyFingerprint, sha1keyFingerprint, err := awsKeyFingerprint(publicKey) + if err != nil { + return "", fmt.Errorf("Could not make key fingerprint: %v", err) + } + if keyname, ok := instanceSet.keys[md5keyFingerprint]; ok { + return keyname, nil + } + keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{ + Filters: []*ec2.Filter{{ + Name: aws.String("fingerprint"), + Values: []*string{&md5keyFingerprint, &sha1keyFingerprint}, + }}, + }) + if err != nil { + return "", fmt.Errorf("Could not search for keypair: %v", err) + } + if len(keyout.KeyPairs) > 0 { + return *(keyout.KeyPairs[0].KeyName), nil + } + keyname := "arvados-dispatch-keypair-" + md5keyFingerprint + _, err = instanceSet.client.ImportKeyPair(&ec2.ImportKeyPairInput{ + KeyName: &keyname, + PublicKeyMaterial: ssh.MarshalAuthorizedKey(publicKey), + }) + if err != nil { + return "", fmt.Errorf("Could not import keypair: %v", err) + } + instanceSet.keys[md5keyFingerprint] = keyname + return keyname, nil +} + func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances []cloud.Instance, err error) { var filters []*ec2.Filter for k, v := range tags { @@ -230,9 +272,11 @@ func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances Values: []*string{aws.String(v)}, }) } + needAZs := false dii := &ec2.DescribeInstancesInput{Filters: filters} for { dio, err := instanceSet.client.DescribeInstances(dii) + err = wrapError(err, &instanceSet.throttleDelayInstances) if err != nil { return nil, err } @@ -240,23 +284,150 @@ func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances for _, rsv := range dio.Reservations { for _, inst := range rsv.Instances { if *inst.State.Name != "shutting-down" && *inst.State.Name != "terminated" { - instances = append(instances, &ec2Instance{instanceSet, inst}) + instances = append(instances, &ec2Instance{ + provider: instanceSet, + instance: inst, + }) + if aws.StringValue(inst.InstanceLifecycle) == "spot" { + needAZs = true + } } } } if dio.NextToken == nil { - return instances, err + break } dii.NextToken = dio.NextToken } + if needAZs && instanceSet.ec2config.SpotPriceUpdateInterval > 0 { + az := map[string]string{} + err := instanceSet.client.DescribeInstanceStatusPages(&ec2.DescribeInstanceStatusInput{ + IncludeAllInstances: aws.Bool(true), + }, func(page *ec2.DescribeInstanceStatusOutput, lastPage bool) bool { + for _, ent := range page.InstanceStatuses { + az[*ent.InstanceId] = *ent.AvailabilityZone + } + return true + }) + if err != nil { + instanceSet.logger.Warnf("error getting instance statuses: %s", err) + } + for _, inst := range instances { + inst := inst.(*ec2Instance) + inst.availabilityZone = az[*inst.instance.InstanceId] + } + instanceSet.updateSpotPrices(instances) + } + return instances, err } -func (az *ec2InstanceSet) Stop() { +type priceKey struct { + instanceType string + spot bool + availabilityZone string +} + +// Refresh recent spot instance pricing data for the given instances, +// unless we already have recent pricing data for all relevant types. +func (instanceSet *ec2InstanceSet) updateSpotPrices(instances []cloud.Instance) { + if len(instances) == 0 { + return + } + + instanceSet.pricesLock.Lock() + defer instanceSet.pricesLock.Unlock() + if instanceSet.prices == nil { + instanceSet.prices = map[priceKey][]cloud.InstancePrice{} + instanceSet.pricesUpdated = map[priceKey]time.Time{} + } + + updateTime := time.Now() + staleTime := updateTime.Add(-instanceSet.ec2config.SpotPriceUpdateInterval.Duration()) + needUpdate := false + allTypes := map[string]bool{} + + for _, inst := range instances { + ec2inst := inst.(*ec2Instance).instance + if aws.StringValue(ec2inst.InstanceLifecycle) == "spot" { + pk := priceKey{ + instanceType: *ec2inst.InstanceType, + spot: true, + availabilityZone: inst.(*ec2Instance).availabilityZone, + } + if instanceSet.pricesUpdated[pk].Before(staleTime) { + needUpdate = true + } + allTypes[*ec2inst.InstanceType] = true + } + } + if !needUpdate { + return + } + var typeFilterValues []*string + for instanceType := range allTypes { + typeFilterValues = append(typeFilterValues, aws.String(instanceType)) + } + // Get 3x update interval worth of pricing data. (Ideally the + // AWS API would tell us "we have shown you all of the price + // changes up to time T", but it doesn't, so we'll just ask + // for 3 intervals worth of data on each update, de-duplicate + // the data points, and not worry too much about occasionally + // missing some data points when our lookups fail twice in a + // row. + dsphi := &ec2.DescribeSpotPriceHistoryInput{ + StartTime: aws.Time(updateTime.Add(-3 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())), + Filters: []*ec2.Filter{ + &ec2.Filter{Name: aws.String("instance-type"), Values: typeFilterValues}, + &ec2.Filter{Name: aws.String("product-description"), Values: []*string{aws.String("Linux/UNIX")}}, + }, + } + err := instanceSet.client.DescribeSpotPriceHistoryPages(dsphi, func(page *ec2.DescribeSpotPriceHistoryOutput, lastPage bool) bool { + for _, ent := range page.SpotPriceHistory { + if ent.InstanceType == nil || ent.SpotPrice == nil || ent.Timestamp == nil { + // bogus record? + continue + } + price, err := strconv.ParseFloat(*ent.SpotPrice, 64) + if err != nil { + // bogus record? + continue + } + pk := priceKey{ + instanceType: *ent.InstanceType, + spot: true, + availabilityZone: *ent.AvailabilityZone, + } + instanceSet.prices[pk] = append(instanceSet.prices[pk], cloud.InstancePrice{ + StartTime: *ent.Timestamp, + Price: price, + }) + instanceSet.pricesUpdated[pk] = updateTime + } + return true + }) + if err != nil { + instanceSet.logger.Warnf("error retrieving spot instance prices: %s", err) + } + + expiredTime := updateTime.Add(-64 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration()) + for pk, last := range instanceSet.pricesUpdated { + if last.Before(expiredTime) { + delete(instanceSet.pricesUpdated, pk) + delete(instanceSet.prices, pk) + } + } + for pk, prices := range instanceSet.prices { + instanceSet.prices[pk] = cloud.NormalizePriceHistory(prices) + } +} + +func (instanceSet *ec2InstanceSet) Stop() { } type ec2Instance struct { - provider *ec2InstanceSet - instance *ec2.Instance + provider *ec2InstanceSet + instance *ec2.Instance + availabilityZone string // sometimes available for spot instances } func (inst *ec2Instance) ID() cloud.InstanceID { @@ -319,3 +490,109 @@ func (inst *ec2Instance) RemoteUser() string { func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error { return cloud.ErrNotImplemented } + +// PriceHistory returns the price history for this specific instance. +// +// AWS documentation is elusive about whether the hourly cost of a +// given spot instance changes as the current spot price changes for +// the corresponding instance type and availability zone. Our +// implementation assumes the answer is yes, based on the following +// hints. +// +// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-requests.html +// says: "After your Spot Instance is running, if the Spot price rises +// above your maximum price, Amazon EC2 interrupts your Spot +// Instance." (This doesn't address what happens when the spot price +// rises *without* exceeding your maximum price.) +// +// https://docs.aws.amazon.com/whitepapers/latest/cost-optimization-leveraging-ec2-spot-instances/how-spot-instances-work.html +// says: "You pay the Spot price that's in effect, billed to the +// nearest second." (But it's not explicitly stated whether "the price +// in effect" changes over time for a given instance.) +// +// The same page also says, in a discussion about the effect of +// specifying a maximum price: "Note that you never pay more than the +// Spot price that is in effect when your Spot Instance is running." +// (The use of the phrase "is running", as opposed to "was launched", +// hints that pricing is dynamic.) +func (inst *ec2Instance) PriceHistory(instType arvados.InstanceType) []cloud.InstancePrice { + inst.provider.pricesLock.Lock() + defer inst.provider.pricesLock.Unlock() + // Note updateSpotPrices currently populates + // inst.provider.prices only for spot instances, so if + // spot==false here, we will return no data. + pk := priceKey{ + instanceType: *inst.instance.InstanceType, + spot: aws.StringValue(inst.instance.InstanceLifecycle) == "spot", + availabilityZone: inst.availabilityZone, + } + var prices []cloud.InstancePrice + for _, price := range inst.provider.prices[pk] { + // ceil(added scratch space in GiB) + gib := (instType.AddedScratch + 1<<30 - 1) >> 30 + monthly := inst.provider.ec2config.EBSPrice * float64(gib) + hourly := monthly / 30 / 24 + price.Price += hourly + prices = append(prices, price) + } + return prices +} + +type rateLimitError struct { + error + earliestRetry time.Time +} + +func (err rateLimitError) EarliestRetry() time.Time { + return err.earliestRetry +} + +var isCodeCapacity = map[string]bool{ + "InsufficientFreeAddressesInSubnet": true, + "InsufficientInstanceCapacity": true, + "InsufficientVolumeCapacity": true, + "MaxSpotInstanceCountExceeded": true, + "VcpuLimitExceeded": true, +} + +// isErrorCapacity returns whether the error is to be throttled based on its code. +// Returns false if error is nil. +func isErrorCapacity(err error) bool { + if aerr, ok := err.(awserr.Error); ok && aerr != nil { + if _, ok := isCodeCapacity[aerr.Code()]; ok { + return true + } + } + return false +} + +type ec2QuotaError struct { + error +} + +func (er *ec2QuotaError) IsQuotaError() bool { + return true +} + +func wrapError(err error, throttleValue *atomic.Value) error { + if request.IsErrorThrottle(err) { + // Back off exponentially until an upstream call + // either succeeds or returns a non-throttle error. + d, _ := throttleValue.Load().(time.Duration) + d = d*3/2 + time.Second + if d < throttleDelayMin { + d = throttleDelayMin + } else if d > throttleDelayMax { + d = throttleDelayMax + } + throttleValue.Store(d) + return rateLimitError{error: err, earliestRetry: time.Now().Add(d)} + } else if isErrorCapacity(err) { + return &ec2QuotaError{err} + } else if err != nil { + throttleValue.Store(time.Duration(0)) + return err + } + throttleValue.Store(time.Duration(0)) + return nil +}