X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/e7d06f9da1396e2ba69318b3bc8d3de690c6437a..756b80504e55ff7d9b9ec3f221bd11e231e9c1c6:/lib/cloud/ec2/ec2.go

diff --git a/lib/cloud/ec2/ec2.go b/lib/cloud/ec2/ec2.go
index 29062c491e..e2cf5e0f1c 100644
--- a/lib/cloud/ec2/ec2.go
+++ b/lib/cloud/ec2/ec2.go
@@ -13,12 +13,19 @@ import (
 	"encoding/json"
 	"fmt"
 	"math/big"
+	"strconv"
 	"sync"
+	"sync/atomic"
+	"time"
 
 	"git.arvados.org/arvados.git/lib/cloud"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/aws/awserr"
 	"github.com/aws/aws-sdk-go/aws/credentials"
+	"github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
+	"github.com/aws/aws-sdk-go/aws/ec2metadata"
+	"github.com/aws/aws-sdk-go/aws/request"
 	"github.com/aws/aws-sdk-go/aws/session"
 	"github.com/aws/aws-sdk-go/service/ec2"
 	"github.com/sirupsen/logrus"
@@ -28,14 +35,22 @@ import (
 // Driver is the ec2 implementation of the cloud.Driver interface.
 var Driver = cloud.DriverFunc(newEC2InstanceSet)
 
+const (
+	throttleDelayMin = time.Second
+	throttleDelayMax = time.Minute
+)
+
 type ec2InstanceSetConfig struct {
-	AccessKeyID      string
-	SecretAccessKey  string
-	Region           string
-	SecurityGroupIDs arvados.StringSet
-	SubnetID         string
-	AdminUsername    string
-	EBSVolumeType    string
+	AccessKeyID             string
+	SecretAccessKey         string
+	Region                  string
+	SecurityGroupIDs        arvados.StringSet
+	SubnetID                string
+	AdminUsername           string
+	EBSVolumeType           string
+	EBSPrice                float64
+	IAMInstanceProfile      string
+	SpotPriceUpdateInterval arvados.Duration
 }
 
 type ec2Interface interface {
@@ -43,17 +58,25 @@ type ec2Interface interface {
 	ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error)
 	RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error)
 	DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error)
+	DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error
+	DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error
 	CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error)
 	TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error)
 }
 
 type ec2InstanceSet struct {
-	ec2config     ec2InstanceSetConfig
-	instanceSetID cloud.InstanceSetID
-	logger        logrus.FieldLogger
-	client        ec2Interface
-	keysMtx       sync.Mutex
-	keys          map[string]string
+	ec2config              ec2InstanceSetConfig
+	instanceSetID          cloud.InstanceSetID
+	logger                 logrus.FieldLogger
+	client                 ec2Interface
+	keysMtx                sync.Mutex
+	keys                   map[string]string
+	throttleDelayCreate    atomic.Value
+	throttleDelayInstances atomic.Value
+
+	prices        map[priceKey][]cloud.InstancePrice
+	pricesLock    sync.Mutex
+	pricesUpdated map[priceKey]time.Time
 }
 
 func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) {
@@ -65,12 +88,19 @@ func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID
 	if err != nil {
 		return nil, err
 	}
-	awsConfig := aws.NewConfig().
-		WithCredentials(credentials.NewStaticCredentials(
-			instanceSet.ec2config.AccessKeyID,
-			instanceSet.ec2config.SecretAccessKey,
-			"")).
-		WithRegion(instanceSet.ec2config.Region)
+
+	sess, err := session.NewSession()
+	if err != nil {
+		return nil, err
+	}
+	// First try any static credentials, fall back to an IAM instance profile/role
+	creds := credentials.NewChainCredentials(
+		[]credentials.Provider{
+			&credentials.StaticProvider{Value: credentials.Value{AccessKeyID: instanceSet.ec2config.AccessKeyID, SecretAccessKey: instanceSet.ec2config.SecretAccessKey}},
+			&ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(sess)},
+		})
+
+	awsConfig := aws.NewConfig().WithCredentials(creds).WithRegion(instanceSet.ec2config.Region)
 	instanceSet.client = ec2.New(session.Must(session.NewSession(awsConfig)))
 	instanceSet.keys = make(map[string]string)
 	if instanceSet.ec2config.EBSVolumeType == "" {
@@ -103,10 +133,10 @@ func awsKeyFingerprint(pk ssh.PublicKey) (md5fp string, sha1fp string, err error
 	sha1pkix := sha1.Sum([]byte(pkix))
 	md5fp = ""
 	sha1fp = ""
-	for i := 0; i < len(md5pkix); i += 1 {
+	for i := 0; i < len(md5pkix); i++ {
 		md5fp += fmt.Sprintf(":%02x", md5pkix[i])
 	}
-	for i := 0; i < len(sha1pkix); i += 1 {
+	for i := 0; i < len(sha1pkix); i++ {
 		sha1fp += fmt.Sprintf(":%02x", sha1pkix[i])
 	}
 	return md5fp[1:], sha1fp[1:], nil
@@ -119,40 +149,6 @@ func (instanceSet *ec2InstanceSet) Create(
 	initCommand cloud.InitCommand,
 	publicKey ssh.PublicKey) (cloud.Instance, error) {
 
-	md5keyFingerprint, sha1keyFingerprint, err := awsKeyFingerprint(publicKey)
-	if err != nil {
-		return nil, fmt.Errorf("Could not make key fingerprint: %v", err)
-	}
-	instanceSet.keysMtx.Lock()
-	var keyname string
-	var ok bool
-	if keyname, ok = instanceSet.keys[md5keyFingerprint]; !ok {
-		keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{
-			Filters: []*ec2.Filter{{
-				Name:   aws.String("fingerprint"),
-				Values: []*string{&md5keyFingerprint, &sha1keyFingerprint},
-			}},
-		})
-		if err != nil {
-			return nil, fmt.Errorf("Could not search for keypair: %v", err)
-		}
-
-		if len(keyout.KeyPairs) > 0 {
-			keyname = *(keyout.KeyPairs[0].KeyName)
-		} else {
-			keyname = "arvados-dispatch-keypair-" + md5keyFingerprint
-			_, err := instanceSet.client.ImportKeyPair(&ec2.ImportKeyPairInput{
-				KeyName:           &keyname,
-				PublicKeyMaterial: ssh.MarshalAuthorizedKey(publicKey),
-			})
-			if err != nil {
-				return nil, fmt.Errorf("Could not import keypair: %v", err)
-			}
-		}
-		instanceSet.keys[md5keyFingerprint] = keyname
-	}
-	instanceSet.keysMtx.Unlock()
-
 	ec2tags := []*ec2.Tag{}
 	for k, v := range newTags {
 		ec2tags = append(ec2tags, &ec2.Tag{
@@ -171,7 +167,6 @@ func (instanceSet *ec2InstanceSet) Create(
 		InstanceType: &instanceType.ProviderType,
 		MaxCount:     aws.Int64(1),
 		MinCount:     aws.Int64(1),
-		KeyName:      &keyname,
 
 		NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
 			{
@@ -191,6 +186,14 @@ func (instanceSet *ec2InstanceSet) Create(
 		UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))),
 	}
 
+	if publicKey != nil {
+		keyname, err := instanceSet.getKeyName(publicKey)
+		if err != nil {
+			return nil, err
+		}
+		rii.KeyName = &keyname
+	}
+
 	if instanceType.AddedScratch > 0 {
 		rii.BlockDeviceMappings = []*ec2.BlockDeviceMapping{{
 			DeviceName: aws.String("/dev/xvdt"),
@@ -210,18 +213,57 @@ func (instanceSet *ec2InstanceSet) Create(
 			}}
 	}
 
-	rsv, err := instanceSet.client.RunInstances(&rii)
+	if instanceSet.ec2config.IAMInstanceProfile != "" {
+		rii.IamInstanceProfile = &ec2.IamInstanceProfileSpecification{
+			Name: aws.String(instanceSet.ec2config.IAMInstanceProfile),
+		}
+	}
 
+	rsv, err := instanceSet.client.RunInstances(&rii)
+	err = wrapError(err, &instanceSet.throttleDelayCreate)
 	if err != nil {
 		return nil, err
 	}
-
 	return &ec2Instance{
 		provider: instanceSet,
 		instance: rsv.Instances[0],
 	}, nil
 }
 
+// getKeyName returns the EC2 key pair name for publicKey, importing the key
+// if EC2 has none; every successful lookup is cached by MD5 fingerprint.
+func (instanceSet *ec2InstanceSet) getKeyName(publicKey ssh.PublicKey) (string, error) {
+	instanceSet.keysMtx.Lock()
+	defer instanceSet.keysMtx.Unlock()
+	md5keyFingerprint, sha1keyFingerprint, err := awsKeyFingerprint(publicKey)
+	if err != nil {
+		return "", fmt.Errorf("Could not make key fingerprint: %v", err)
+	}
+	if keyname, ok := instanceSet.keys[md5keyFingerprint]; ok {
+		return keyname, nil
+	}
+	keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{
+		Filters: []*ec2.Filter{{
+			Name:   aws.String("fingerprint"),
+			Values: []*string{&md5keyFingerprint, &sha1keyFingerprint},
+		}},
+	})
+	if err != nil {
+		return "", fmt.Errorf("Could not search for keypair: %v", err)
+	}
+	keyname := "arvados-dispatch-keypair-" + md5keyFingerprint
+	if len(keyout.KeyPairs) > 0 {
+		keyname = *(keyout.KeyPairs[0].KeyName)
+	} else if _, err = instanceSet.client.ImportKeyPair(&ec2.ImportKeyPairInput{
+		KeyName:           &keyname,
+		PublicKeyMaterial: ssh.MarshalAuthorizedKey(publicKey),
+	}); err != nil {
+		return "", fmt.Errorf("Could not import keypair: %v", err)
+	}
+	instanceSet.keys[md5keyFingerprint] = keyname
+	return keyname, nil
+}
+
 func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances []cloud.Instance, err error) {
 	var filters []*ec2.Filter
 	for k, v := range tags {
@@ -230,9 +272,11 @@ func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances
 			Values: []*string{aws.String(v)},
 		})
 	}
+	needAZs := false
 	dii := &ec2.DescribeInstancesInput{Filters: filters}
 	for {
 		dio, err := instanceSet.client.DescribeInstances(dii)
+		err = wrapError(err, &instanceSet.throttleDelayInstances)
 		if err != nil {
 			return nil, err
 		}
@@ -240,23 +284,150 @@ func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances
 		for _, rsv := range dio.Reservations {
 			for _, inst := range rsv.Instances {
 				if *inst.State.Name != "shutting-down" && *inst.State.Name != "terminated" {
-					instances = append(instances, &ec2Instance{instanceSet, inst})
+					instances = append(instances, &ec2Instance{
+						provider: instanceSet,
+						instance: inst,
+					})
+					if aws.StringValue(inst.InstanceLifecycle) == "spot" {
+						needAZs = true
+					}
 				}
 			}
 		}
 		if dio.NextToken == nil {
-			return instances, err
+			break
 		}
 		dii.NextToken = dio.NextToken
 	}
+	if needAZs && instanceSet.ec2config.SpotPriceUpdateInterval > 0 {
+		az := map[string]string{}
+		err := instanceSet.client.DescribeInstanceStatusPages(&ec2.DescribeInstanceStatusInput{
+			IncludeAllInstances: aws.Bool(true),
+		}, func(page *ec2.DescribeInstanceStatusOutput, lastPage bool) bool {
+			for _, ent := range page.InstanceStatuses {
+				az[*ent.InstanceId] = *ent.AvailabilityZone
+			}
+			return true
+		})
+		if err != nil {
+			instanceSet.logger.Warnf("error getting instance statuses: %s", err)
+		}
+		for _, inst := range instances {
+			inst := inst.(*ec2Instance)
+			inst.availabilityZone = az[*inst.instance.InstanceId]
+		}
+		instanceSet.updateSpotPrices(instances)
+	}
+	return instances, err
 }
 
-func (az *ec2InstanceSet) Stop() {
+type priceKey struct {
+	instanceType     string // EC2 instance type name
+	spot             bool   // true for spot pricing entries
+	availabilityZone string // availability zone the prices apply to
+}
+
+// updateSpotPrices refreshes cached spot pricing data unless fresh data is
+// already cached for every spot instance's type and availability zone.
+func (instanceSet *ec2InstanceSet) updateSpotPrices(instances []cloud.Instance) {
+	if len(instances) == 0 {
+		return
+	}
+
+	instanceSet.pricesLock.Lock()
+	defer instanceSet.pricesLock.Unlock()
+	if instanceSet.prices == nil {
+		instanceSet.prices = map[priceKey][]cloud.InstancePrice{}
+		instanceSet.pricesUpdated = map[priceKey]time.Time{}
+	}
+
+	updateTime := time.Now()
+	staleTime := updateTime.Add(-instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
+	needUpdate := false
+	allTypes := map[string]bool{}
+
+	for _, inst := range instances {
+		ec2inst := inst.(*ec2Instance).instance
+		if aws.StringValue(ec2inst.InstanceLifecycle) == "spot" {
+			pk := priceKey{
+				instanceType:     *ec2inst.InstanceType,
+				spot:             true,
+				availabilityZone: inst.(*ec2Instance).availabilityZone,
+			}
+			if instanceSet.pricesUpdated[pk].Before(staleTime) {
+				needUpdate = true
+			}
+			allTypes[*ec2inst.InstanceType] = true
+		}
+	}
+	if !needUpdate {
+		return
+	}
+	var typeFilterValues []*string
+	for instanceType := range allTypes {
+		typeFilterValues = append(typeFilterValues, aws.String(instanceType))
+	}
+	// Get 3x update interval worth of pricing data. (Ideally the
+	// AWS API would tell us "we have shown you all of the price
+	// changes up to time T", but it doesn't, so we'll just ask
+	// for 3 intervals worth of data on each update, de-duplicate
+	// the data points, and not worry too much about occasionally
+	// missing some data points when our lookups fail twice in a
+	// row.)
+	dsphi := &ec2.DescribeSpotPriceHistoryInput{
+		StartTime: aws.Time(updateTime.Add(-3 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())),
+		Filters: []*ec2.Filter{
+			&ec2.Filter{Name: aws.String("instance-type"), Values: typeFilterValues},
+			&ec2.Filter{Name: aws.String("product-description"), Values: []*string{aws.String("Linux/UNIX")}},
+		},
+	}
+	err := instanceSet.client.DescribeSpotPriceHistoryPages(dsphi, func(page *ec2.DescribeSpotPriceHistoryOutput, lastPage bool) bool {
+		for _, ent := range page.SpotPriceHistory {
+			if ent.InstanceType == nil || ent.SpotPrice == nil || ent.Timestamp == nil || ent.AvailabilityZone == nil {
+				// bogus record? (every pointer dereferenced below must be non-nil)
+				continue
+			}
+			price, err := strconv.ParseFloat(*ent.SpotPrice, 64)
+			if err != nil {
+				// bogus record?
+				continue
+			}
+			pk := priceKey{
+				instanceType:     *ent.InstanceType,
+				spot:             true,
+				availabilityZone: *ent.AvailabilityZone,
+			}
+			instanceSet.prices[pk] = append(instanceSet.prices[pk], cloud.InstancePrice{
+				StartTime: *ent.Timestamp,
+				Price:     price,
+			})
+			instanceSet.pricesUpdated[pk] = updateTime
+		}
+		return true
+	})
+	if err != nil {
+		instanceSet.logger.Warnf("error retrieving spot instance prices: %s", err)
+	}
+
+	expiredTime := updateTime.Add(-64 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
+	for pk, last := range instanceSet.pricesUpdated {
+		if last.Before(expiredTime) {
+			delete(instanceSet.pricesUpdated, pk)
+			delete(instanceSet.prices, pk)
+		}
+	}
+	for pk, prices := range instanceSet.prices {
+		instanceSet.prices[pk] = cloud.NormalizePriceHistory(prices)
+	}
+}
+
+func (instanceSet *ec2InstanceSet) Stop() {
 }
 
 type ec2Instance struct {
-	provider *ec2InstanceSet
-	instance *ec2.Instance
+	provider         *ec2InstanceSet
+	instance         *ec2.Instance
+	availabilityZone string // sometimes available for spot instances
 }
 
 func (inst *ec2Instance) ID() cloud.InstanceID {
@@ -319,3 +490,109 @@ func (inst *ec2Instance) RemoteUser() string {
 func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
 	return cloud.ErrNotImplemented
 }
+
+// PriceHistory returns the price history for this specific instance.
+//
+// AWS documentation is elusive about whether the hourly cost of a
+// given spot instance changes as the current spot price changes for
+// the corresponding instance type and availability zone. Our
+// implementation assumes the answer is yes, based on the following
+// hints.
+//
+// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-requests.html
+// says: "After your Spot Instance is running, if the Spot price rises
+// above your maximum price, Amazon EC2 interrupts your Spot
+// Instance." (This doesn't address what happens when the spot price
+// rises *without* exceeding your maximum price.)
+//
+// https://docs.aws.amazon.com/whitepapers/latest/cost-optimization-leveraging-ec2-spot-instances/how-spot-instances-work.html
+// says: "You pay the Spot price that's in effect, billed to the
+// nearest second." (But it's not explicitly stated whether "the price
+// in effect" changes over time for a given instance.)
+//
+// The same page also says, in a discussion about the effect of
+// specifying a maximum price: "Note that you never pay more than the
+// Spot price that is in effect when your Spot Instance is running."
+// (The use of the phrase "is running", as opposed to "was launched",
+// hints that pricing is dynamic.)
+func (inst *ec2Instance) PriceHistory(instType arvados.InstanceType) []cloud.InstancePrice {
+	inst.provider.pricesLock.Lock()
+	defer inst.provider.pricesLock.Unlock()
+	// Note updateSpotPrices currently populates
+	// inst.provider.prices only for spot instances, so if
+	// spot==false here, we will return no data.
+	pk := priceKey{
+		instanceType:     *inst.instance.InstanceType,
+		spot:             aws.StringValue(inst.instance.InstanceLifecycle) == "spot",
+		availabilityZone: inst.availabilityZone,
+	}
+	// ceil(added scratch space in GiB); same surcharge for every data point
+	gib := (instType.AddedScratch + 1<<30 - 1) >> 30
+	monthly := inst.provider.ec2config.EBSPrice * float64(gib)
+	hourly := monthly / 30 / 24
+	var prices []cloud.InstancePrice
+	for _, price := range inst.provider.prices[pk] {
+		price.Price += hourly
+		prices = append(prices, price)
+	}
+	return prices
+}
+
+type rateLimitError struct {
+	error                   // embedded underlying error
+	earliestRetry time.Time // value returned by EarliestRetry
+}
+
+func (err rateLimitError) EarliestRetry() time.Time {
+	return err.earliestRetry
+}
+
+var isCodeCapacity = map[string]bool{
+	"InsufficientFreeAddressesInSubnet": true,
+	"InsufficientInstanceCapacity":      true,
+	"InsufficientVolumeCapacity":        true,
+	"MaxSpotInstanceCountExceeded":      true,
+	"VcpuLimitExceeded":                 true,
+}
+
+// isErrorCapacity reports whether err is an awserr.Error whose code is
+// listed in isCodeCapacity. It returns false if err is nil.
+func isErrorCapacity(err error) bool {
+	aerr, ok := err.(awserr.Error)
+	if !ok || aerr == nil {
+		return false
+	}
+	_, listed := isCodeCapacity[aerr.Code()]
+	return listed
+}
+
+type ec2QuotaError struct {
+	error
+}
+
+func (er *ec2QuotaError) IsQuotaError() bool {
+	return true
+}
+
+// wrapError classifies err and updates the backoff delay held in throttleValue.
+func wrapError(err error, throttleValue *atomic.Value) error {
+	switch {
+	case request.IsErrorThrottle(err):
+		// Back off exponentially until an upstream call
+		// either succeeds or returns a non-throttle error.
+		delay, _ := throttleValue.Load().(time.Duration)
+		delay = delay*3/2 + time.Second
+		if delay < throttleDelayMin {
+			delay = throttleDelayMin
+		} else if delay > throttleDelayMax {
+			delay = throttleDelayMax
+		}
+		throttleValue.Store(delay)
+		return rateLimitError{error: err, earliestRetry: time.Now().Add(delay)}
+	case isErrorCapacity(err):
+		return &ec2QuotaError{err}
+	default:
+		throttleValue.Store(time.Duration(0))
+		return err
+	}
+}