20978: Treat "unsupported instance type" as capacity=0.
[arvados.git] / lib / cloud / ec2 / ec2_test.go
index 2f3d319e0be3b706e4fbdaef0000bdeba8972fc4..6ce5aa3cf9aa6899610c937d9dad1ae70e9cb695 100644 (file)
@@ -34,12 +34,14 @@ import (
        "git.arvados.org/arvados.git/lib/cloud"
        "git.arvados.org/arvados.git/lib/dispatchcloud/test"
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadostest"
        "git.arvados.org/arvados.git/sdk/go/config"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "github.com/aws/aws-sdk-go/aws"
        "github.com/aws/aws-sdk-go/aws/awserr"
        "github.com/aws/aws-sdk-go/service/ec2"
        "github.com/ghodss/yaml"
+       "github.com/prometheus/client_golang/prometheus"
        "github.com/sirupsen/logrus"
        check "gopkg.in/check.v1"
 )
@@ -210,7 +212,8 @@ func (err *ec2stubError) OrigErr() error  { return errors.New("stub OrigErr") }
 // Ensure ec2stubError satisfies the aws.Error interface
 var _ = awserr.Error(&ec2stubError{})
 
-func GetInstanceSet(c *check.C) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster) {
+func GetInstanceSet(c *check.C, conf string) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster, *prometheus.Registry) {
+       reg := prometheus.NewRegistry()
        cluster := arvados.Cluster{
                InstanceTypes: arvados.InstanceTypeMap(map[string]arvados.InstanceType{
                        "tiny": {
@@ -246,21 +249,19 @@ func GetInstanceSet(c *check.C) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster
                err := config.LoadFile(&exampleCfg, *live)
                c.Assert(err, check.IsNil)
 
-               ap, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger())
+               is, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger(), reg)
                c.Assert(err, check.IsNil)
-               return ap.(*ec2InstanceSet), cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster
-       }
-       ap := ec2InstanceSet{
-               instanceSetID: "test123",
-               logger:        ctxlog.TestLogger(c),
-               client:        &ec2stub{c: c, reftime: time.Now().UTC()},
-               keys:          make(map[string]string),
+               return is.(*ec2InstanceSet), cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster, reg
+       } else {
+               is, err := newEC2InstanceSet(json.RawMessage(conf), "test123", nil, ctxlog.TestLogger(c), reg)
+               c.Assert(err, check.IsNil)
+               is.(*ec2InstanceSet).client = &ec2stub{c: c, reftime: time.Now().UTC()}
+               return is.(*ec2InstanceSet), cloud.ImageID("blob"), cluster, reg
        }
-       return &ap, cloud.ImageID("blob"), cluster
 }
 
 func (*EC2InstanceSetSuite) TestCreate(c *check.C) {
-       ap, img, cluster := GetInstanceSet(c)
+       ap, img, cluster, _ := GetInstanceSet(c, "{}")
        pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
 
        inst, err := ap.Create(cluster.InstanceTypes["tiny"],
@@ -280,7 +281,7 @@ func (*EC2InstanceSetSuite) TestCreate(c *check.C) {
 }
 
 func (*EC2InstanceSetSuite) TestCreateWithExtraScratch(c *check.C) {
-       ap, img, cluster := GetInstanceSet(c)
+       ap, img, cluster, _ := GetInstanceSet(c, "{}")
        inst, err := ap.Create(cluster.InstanceTypes["tiny-with-extra-scratch"],
                img, map[string]string{
                        "TestTagName": "test tag value",
@@ -301,7 +302,7 @@ func (*EC2InstanceSetSuite) TestCreateWithExtraScratch(c *check.C) {
 }
 
 func (*EC2InstanceSetSuite) TestCreatePreemptible(c *check.C) {
-       ap, img, cluster := GetInstanceSet(c)
+       ap, img, cluster, _ := GetInstanceSet(c, "{}")
        pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
 
        inst, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"],
@@ -323,8 +324,7 @@ func (*EC2InstanceSetSuite) TestCreateFailoverSecondSubnet(c *check.C) {
                return
        }
 
-       ap, img, cluster := GetInstanceSet(c)
-       ap.ec2config.SubnetID = sliceOrSingleString{"subnet-full", "subnet-good"}
+       ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-good"]}`)
        ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
                "subnet-full": &ec2stubError{
                        code:    "InsufficientFreeAddressesInSubnet",
@@ -335,12 +335,26 @@ func (*EC2InstanceSetSuite) TestCreateFailoverSecondSubnet(c *check.C) {
        c.Check(err, check.IsNil)
        c.Check(inst, check.NotNil)
        c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
+       metrics := arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="0"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="1"} 1\n`+
+               `.*`)
 
        // Next RunInstances call should try the working subnet first
        inst, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
        c.Check(err, check.IsNil)
        c.Check(inst, check.NotNil)
        c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 3)
+       metrics = arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="0"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="1"} 2\n`+
+               `.*`)
 }
 
 func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
@@ -349,8 +363,7 @@ func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
                return
        }
 
-       ap, img, cluster := GetInstanceSet(c)
-       ap.ec2config.SubnetID = sliceOrSingleString{"subnet-full", "subnet-broken"}
+       ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-broken"]}`)
        ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
                "subnet-full": &ec2stubError{
                        code:    "InsufficientFreeAddressesInSubnet",
@@ -365,15 +378,29 @@ func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
        c.Check(err, check.NotNil)
        c.Check(err, check.ErrorMatches, `.*InvalidSubnetId\.NotFound.*`)
        c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
+       metrics := arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 1\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+               `.*`)
 
        _, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
        c.Check(err, check.NotNil)
        c.Check(err, check.ErrorMatches, `.*InsufficientFreeAddressesInSubnet.*`)
        c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 4)
+       metrics = arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 2\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 2\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+               `.*`)
 }
 
 func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
-       ap, _, _ := GetInstanceSet(c)
+       ap, _, _, _ := GetInstanceSet(c, "{}")
        l, err := ap.Instances(nil)
        c.Assert(err, check.IsNil)
 
@@ -385,7 +412,7 @@ func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
 }
 
 func (*EC2InstanceSetSuite) TestListInstances(c *check.C) {
-       ap, _, _ := GetInstanceSet(c)
+       ap, _, _, reg := GetInstanceSet(c, "{}")
        l, err := ap.Instances(nil)
        c.Assert(err, check.IsNil)
 
@@ -393,10 +420,15 @@ func (*EC2InstanceSetSuite) TestListInstances(c *check.C) {
                tg := i.Tags()
                c.Logf("%v %v %v", i.String(), i.Address(), tg)
        }
+
+       metrics := arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instances{subnet_id="[^"]*"} \d+\n`+
+               `.*`)
 }
 
 func (*EC2InstanceSetSuite) TestDestroyInstances(c *check.C) {
-       ap, _, _ := GetInstanceSet(c)
+       ap, _, _, _ := GetInstanceSet(c, "{}")
        l, err := ap.Instances(nil)
        c.Assert(err, check.IsNil)
 
@@ -406,7 +438,7 @@ func (*EC2InstanceSetSuite) TestDestroyInstances(c *check.C) {
 }
 
 func (*EC2InstanceSetSuite) TestInstancePriceHistory(c *check.C) {
-       ap, img, cluster := GetInstanceSet(c)
+       ap, img, cluster, _ := GetInstanceSet(c, "{}")
        pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
        tags := cloud.InstanceTags{"arvados-ec2-driver": "test"}
 
@@ -476,8 +508,23 @@ func (*EC2InstanceSetSuite) TestWrapError(c *check.C) {
        _, ok := wrapped.(cloud.RateLimitError)
        c.Check(ok, check.Equals, true)
 
-       quotaError := awserr.New("InsufficientInstanceCapacity", "", nil)
+       quotaError := awserr.New("InstanceLimitExceeded", "", nil)
        wrapped = wrapError(quotaError, nil)
        _, ok = wrapped.(cloud.QuotaError)
        c.Check(ok, check.Equals, true)
+
+       for _, trial := range []struct {
+               code string
+               msg  string
+       }{
+               {"InsufficientInstanceCapacity", ""},
+               {"Unsupported", "Your requested instance type (t3.micro) is not supported in your requested Availability Zone (us-east-1e). Please retry your request by not specifying an Availability Zone or choosing us-east-1a, us-east-1b, us-east-1c, us-east-1d, us-east-1f."},
+       } {
+               capacityError := awserr.New(trial.code, trial.msg, nil)
+               wrapped = wrapError(capacityError, nil)
+               caperr, ok := wrapped.(cloud.CapacityError)
+               c.Check(ok, check.Equals, true)
+               c.Check(caperr.IsCapacityError(), check.Equals, true)
+               c.Check(caperr.IsInstanceTypeSpecific(), check.Equals, true)
+       }
 }