X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/349a2a7fdd230456f8a8ccddf1b9932c824ca4f3..2195844ba309db0ec552aa8b14a7f02cf74e9b7b:/lib/cloud/ec2/ec2.go diff --git a/lib/cloud/ec2/ec2.go b/lib/cloud/ec2/ec2.go index 526fc1307d..816df48d90 100644 --- a/lib/cloud/ec2/ec2.go +++ b/lib/cloud/ec2/ec2.go @@ -29,6 +29,7 @@ import ( "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "golang.org/x/crypto/ssh" ) @@ -112,9 +113,12 @@ type ec2InstanceSet struct { prices map[priceKey][]cloud.InstancePrice pricesLock sync.Mutex pricesUpdated map[priceKey]time.Time + + mInstances *prometheus.GaugeVec + mInstanceStarts *prometheus.CounterVec } -func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) { +func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (prv cloud.InstanceSet, err error) { instanceSet := &ec2InstanceSet{ instanceSetID: instanceSetID, logger: logger, @@ -141,6 +145,36 @@ func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID if instanceSet.ec2config.EBSVolumeType == "" { instanceSet.ec2config.EBSVolumeType = "gp2" } + + // Set up metrics + instanceSet.mInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "ec2_instances", + Help: "Number of instances running", + }, []string{"subnet_id"}) + instanceSet.mInstanceStarts = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "ec2_instance_starts_total", + Help: "Number of attempts to start a new instance", + }, []string{"subnet_id", "success"}) + // Initialize all of the series we'll be reporting. Otherwise + // the {subnet=A, success=0} series doesn't appear in metrics + // at all until there's a failure in subnet A. + for _, subnet := range instanceSet.ec2config.SubnetID { + instanceSet.mInstanceStarts.WithLabelValues(subnet, "0").Add(0) + instanceSet.mInstanceStarts.WithLabelValues(subnet, "1").Add(0) + } + if len(instanceSet.ec2config.SubnetID) == 0 { + instanceSet.mInstanceStarts.WithLabelValues("", "0").Add(0) + instanceSet.mInstanceStarts.WithLabelValues("", "1").Add(0) + } + if reg != nil { + reg.MustRegister(instanceSet.mInstances) + reg.MustRegister(instanceSet.mInstanceStarts) + } + return instanceSet, nil } @@ -259,11 +293,14 @@ func (instanceSet *ec2InstanceSet) Create( currentSubnetIDIndex := int(atomic.LoadInt32(&instanceSet.currentSubnetIDIndex)) for tryOffset := 0; ; tryOffset++ { tryIndex := 0 + trySubnet := "" if len(subnets) > 0 { tryIndex = (currentSubnetIDIndex + tryOffset) % len(subnets) - rii.NetworkInterfaces[0].SubnetId = aws.String(subnets[tryIndex]) + trySubnet = subnets[tryIndex] + rii.NetworkInterfaces[0].SubnetId = aws.String(trySubnet) } rsv, err = instanceSet.client.RunInstances(&rii) + instanceSet.mInstanceStarts.WithLabelValues(trySubnet, boolLabelValue[err == nil]).Add(1) if isErrorSubnetSpecific(err) && tryOffset < len(subnets)-1 { instanceSet.logger.WithError(err).WithField("SubnetID", subnets[tryIndex]). @@ -381,6 +418,24 @@ func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances } instanceSet.updateSpotPrices(instances) } + + // Count instances in each subnet, and report in metrics. + subnetInstances := map[string]int{"": 0} + for _, subnet := range instanceSet.ec2config.SubnetID { + subnetInstances[subnet] = 0 + } + for _, inst := range instances { + subnet := inst.(*ec2Instance).instance.SubnetId + if subnet != nil { + subnetInstances[*subnet]++ + } else { + subnetInstances[""]++ + } + } + for subnet, count := range subnetInstances { + instanceSet.mInstances.WithLabelValues(subnet).Set(float64(count)) + } + return instances, err } @@ -610,21 +665,36 @@ func (err rateLimitError) EarliestRetry() time.Time { return err.earliestRetry } -var isCodeCapacity = map[string]bool{ +type capacityError struct { + error + isInstanceTypeSpecific bool +} + +func (er *capacityError) IsCapacityError() bool { + return true +} + +func (er *capacityError) IsInstanceTypeSpecific() bool { + return er.isInstanceTypeSpecific +} + +var isCodeQuota = map[string]bool{ "InstanceLimitExceeded": true, "InsufficientAddressCapacity": true, "InsufficientFreeAddressesInSubnet": true, - "InsufficientInstanceCapacity": true, "InsufficientVolumeCapacity": true, "MaxSpotInstanceCountExceeded": true, "VcpuLimitExceeded": true, } -// isErrorCapacity returns whether the error is to be throttled based on its code. +// isErrorQuota returns whether the error indicates we have reached +// some usage quota/limit -- i.e., immediately retrying with an equal +// or larger instance type will probably not work. +// // Returns false if error is nil. -func isErrorCapacity(err error) bool { +func isErrorQuota(err error) bool { if aerr, ok := err.(awserr.Error); ok && aerr != nil { - if _, ok := isCodeCapacity[aerr.Code()]; ok { + if _, ok := isCodeQuota[aerr.Code()]; ok { return true } } @@ -665,8 +735,10 @@ func wrapError(err error, throttleValue *atomic.Value) error { } throttleValue.Store(d) return rateLimitError{error: err, earliestRetry: time.Now().Add(d)} - } else if isErrorCapacity(err) { + } else if isErrorQuota(err) { return &ec2QuotaError{err} + } else if aerr, ok := err.(awserr.Error); ok && aerr != nil && aerr.Code() == "InsufficientInstanceCapacity" { + return &capacityError{err, true} } else if err != nil { throttleValue.Store(time.Duration(0)) return err @@ -674,3 +746,5 @@ func wrapError(err error, throttleValue *atomic.Value) error { throttleValue.Store(time.Duration(0)) return nil } + +var boolLabelValue = map[bool]string{false: "0", true: "1"}