21603: Recognize subnet error despite generic error code.
[arvados.git] / lib / cloud / ec2 / ec2_test.go
index 3cd238ded5a0035adaec66ad4d5c32b9c3fd816a..5e6cf2c82b5caee81de1255936ee30a2edeb1024 100644 (file)
@@ -9,7 +9,7 @@
 //
 // Tests should be run individually and in the order they are listed in the file:
 //
-// Example azconfig.yml:
+// Example ec2config.yml:
 //
 // ImageIDForTestSuite: ami-xxxxxxxxxxxxxxxxx
 // DriverParameters:
@@ -24,17 +24,24 @@ package ec2
 
 import (
        "encoding/json"
+       "errors"
        "flag"
+       "fmt"
        "sync/atomic"
        "testing"
+       "time"
 
        "git.arvados.org/arvados.git/lib/cloud"
        "git.arvados.org/arvados.git/lib/dispatchcloud/test"
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadostest"
        "git.arvados.org/arvados.git/sdk/go/config"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "github.com/aws/aws-sdk-go/aws"
        "github.com/aws/aws-sdk-go/aws/awserr"
        "github.com/aws/aws-sdk-go/service/ec2"
+       "github.com/ghodss/yaml"
+       "github.com/prometheus/client_golang/prometheus"
        "github.com/sirupsen/logrus"
        check "gopkg.in/check.v1"
 )
@@ -46,6 +53,34 @@ func Test(t *testing.T) {
        check.TestingT(t)
 }
 
+type sliceOrStringSuite struct{}
+
+var _ = check.Suite(&sliceOrStringSuite{})
+
+func (s *sliceOrStringSuite) TestUnmarshal(c *check.C) {
+       var conf ec2InstanceSetConfig
+       for _, trial := range []struct {
+               input  string
+               output sliceOrSingleString
+       }{
+               {``, nil},
+               {`""`, nil},
+               {`[]`, nil},
+               {`"foo"`, sliceOrSingleString{"foo"}},
+               {`["foo"]`, sliceOrSingleString{"foo"}},
+               {`[foo]`, sliceOrSingleString{"foo"}},
+               {`["foo", "bar"]`, sliceOrSingleString{"foo", "bar"}},
+               {`[foo-bar, baz]`, sliceOrSingleString{"foo-bar", "baz"}},
+       } {
+               c.Logf("trial: %+v", trial)
+               err := yaml.Unmarshal([]byte("SubnetID: "+trial.input+"\n"), &conf)
+               if !c.Check(err, check.IsNil) {
+                       continue
+               }
+               c.Check(conf.SubnetID, check.DeepEquals, trial.output)
+       }
+}
+
 type EC2InstanceSetSuite struct{}
 
 var _ = check.Suite(&EC2InstanceSetSuite{})
@@ -56,25 +91,104 @@ type testConfig struct {
 }
 
 type ec2stub struct {
+       c                     *check.C
+       reftime               time.Time
+       importKeyPairCalls    []*ec2.ImportKeyPairInput
+       describeKeyPairsCalls []*ec2.DescribeKeyPairsInput
+       runInstancesCalls     []*ec2.RunInstancesInput
+       // {subnetID => error}: RunInstances returns error if subnetID
+       // matches.
+       subnetErrorOnRunInstances map[string]error
 }
 
 func (e *ec2stub) ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error) {
+       e.importKeyPairCalls = append(e.importKeyPairCalls, input)
        return nil, nil
 }
 
 func (e *ec2stub) DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error) {
+       e.describeKeyPairsCalls = append(e.describeKeyPairsCalls, input)
        return &ec2.DescribeKeyPairsOutput{}, nil
 }
 
 func (e *ec2stub) RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error) {
+       e.runInstancesCalls = append(e.runInstancesCalls, input)
+       if len(input.NetworkInterfaces) > 0 && input.NetworkInterfaces[0].SubnetId != nil {
+               err := e.subnetErrorOnRunInstances[*input.NetworkInterfaces[0].SubnetId]
+               if err != nil {
+                       return nil, err
+               }
+       }
        return &ec2.Reservation{Instances: []*ec2.Instance{{
-               InstanceId: aws.String("i-123"),
-               Tags:       input.TagSpecifications[0].Tags,
+               InstanceId:   aws.String("i-123"),
+               InstanceType: aws.String("t2.micro"),
+               Tags:         input.TagSpecifications[0].Tags,
        }}}, nil
 }
 
 func (e *ec2stub) DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error) {
-       return &ec2.DescribeInstancesOutput{}, nil
+       return &ec2.DescribeInstancesOutput{
+               Reservations: []*ec2.Reservation{{
+                       Instances: []*ec2.Instance{{
+                               InstanceId:        aws.String("i-123"),
+                               InstanceLifecycle: aws.String("spot"),
+                               InstanceType:      aws.String("t2.micro"),
+                               PrivateIpAddress:  aws.String("10.1.2.3"),
+                               State:             &ec2.InstanceState{Name: aws.String("running"), Code: aws.Int64(16)},
+                       }, {
+                               InstanceId:        aws.String("i-124"),
+                               InstanceLifecycle: aws.String("spot"),
+                               InstanceType:      aws.String("t2.micro"),
+                               PrivateIpAddress:  aws.String("10.1.2.4"),
+                               State:             &ec2.InstanceState{Name: aws.String("running"), Code: aws.Int64(16)},
+                       }},
+               }},
+       }, nil
+}
+
+func (e *ec2stub) DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error {
+       fn(&ec2.DescribeInstanceStatusOutput{
+               InstanceStatuses: []*ec2.InstanceStatus{{
+                       InstanceId:       aws.String("i-123"),
+                       AvailabilityZone: aws.String("aa-east-1a"),
+               }, {
+                       InstanceId:       aws.String("i-124"),
+                       AvailabilityZone: aws.String("aa-east-1a"),
+               }},
+       }, true)
+       return nil
+}
+
+func (e *ec2stub) DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error {
+       if !fn(&ec2.DescribeSpotPriceHistoryOutput{
+               SpotPriceHistory: []*ec2.SpotPrice{
+                       &ec2.SpotPrice{
+                               InstanceType:     aws.String("t2.micro"),
+                               AvailabilityZone: aws.String("aa-east-1a"),
+                               SpotPrice:        aws.String("0.005"),
+                               Timestamp:        aws.Time(e.reftime.Add(-9 * time.Minute)),
+                       },
+                       &ec2.SpotPrice{
+                               InstanceType:     aws.String("t2.micro"),
+                               AvailabilityZone: aws.String("aa-east-1a"),
+                               SpotPrice:        aws.String("0.015"),
+                               Timestamp:        aws.Time(e.reftime.Add(-5 * time.Minute)),
+                       },
+               },
+       }, false) {
+               return nil
+       }
+       fn(&ec2.DescribeSpotPriceHistoryOutput{
+               SpotPriceHistory: []*ec2.SpotPrice{
+                       &ec2.SpotPrice{
+                               InstanceType:     aws.String("t2.micro"),
+                               AvailabilityZone: aws.String("aa-east-1a"),
+                               SpotPrice:        aws.String("0.01"),
+                               Timestamp:        aws.Time(e.reftime.Add(-2 * time.Minute)),
+                       },
+               },
+       }, true)
+       return nil
 }
 
 func (e *ec2stub) CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error) {
@@ -85,7 +199,21 @@ func (e *ec2stub) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.T
        return nil, nil
 }
 
-func GetInstanceSet() (cloud.InstanceSet, cloud.ImageID, arvados.Cluster, error) {
+type ec2stubError struct {
+       code    string
+       message string
+}
+
+func (err *ec2stubError) Code() string    { return err.code }
+func (err *ec2stubError) Message() string { return err.message }
+func (err *ec2stubError) Error() string   { return fmt.Sprintf("%s: %s", err.code, err.message) }
+func (err *ec2stubError) OrigErr() error  { return errors.New("stub OrigErr") }
+
+// Ensure ec2stubError satisfies the awserr.Error interface
+var _ = awserr.Error(&ec2stubError{})
+
+func GetInstanceSet(c *check.C, conf string) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster, *prometheus.Registry) {
+       reg := prometheus.NewRegistry()
        cluster := arvados.Cluster{
                InstanceTypes: arvados.InstanceTypeMap(map[string]arvados.InstanceType{
                        "tiny": {
@@ -98,7 +226,7 @@ func GetInstanceSet() (cloud.InstanceSet, cloud.ImageID, arvados.Cluster, error)
                                Preemptible:  false,
                        },
                        "tiny-with-extra-scratch": {
-                               Name:         "tiny",
+                               Name:         "tiny-with-extra-scratch",
                                ProviderType: "t2.micro",
                                VCPUs:        1,
                                RAM:          4000000000,
@@ -107,7 +235,7 @@ func GetInstanceSet() (cloud.InstanceSet, cloud.ImageID, arvados.Cluster, error)
                                AddedScratch: 20000000000,
                        },
                        "tiny-preemptible": {
-                               Name:         "tiny",
+                               Name:         "tiny-preemptible",
                                ProviderType: "t2.micro",
                                VCPUs:        1,
                                RAM:          4000000000,
@@ -119,58 +247,51 @@ func GetInstanceSet() (cloud.InstanceSet, cloud.ImageID, arvados.Cluster, error)
        if *live != "" {
                var exampleCfg testConfig
                err := config.LoadFile(&exampleCfg, *live)
-               if err != nil {
-                       return nil, cloud.ImageID(""), cluster, err
-               }
-
-               ap, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger())
-               return ap, cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster, err
-       }
-       ap := ec2InstanceSet{
-               ec2config:     ec2InstanceSetConfig{},
-               instanceSetID: "test123",
-               logger:        logrus.StandardLogger(),
-               client:        &ec2stub{},
-               keys:          make(map[string]string),
+               c.Assert(err, check.IsNil)
+
+               is, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger(), reg)
+               c.Assert(err, check.IsNil)
+               return is.(*ec2InstanceSet), cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster, reg
+       } else {
+               is, err := newEC2InstanceSet(json.RawMessage(conf), "test123", nil, ctxlog.TestLogger(c), reg)
+               c.Assert(err, check.IsNil)
+               is.(*ec2InstanceSet).client = &ec2stub{c: c, reftime: time.Now().UTC()}
+               return is.(*ec2InstanceSet), cloud.ImageID("blob"), cluster, reg
        }
-       return &ap, cloud.ImageID("blob"), cluster, nil
 }
 
 func (*EC2InstanceSetSuite) TestCreate(c *check.C) {
-       ap, img, cluster, err := GetInstanceSet()
-       if err != nil {
-               c.Fatal("Error making provider", err)
-       }
-
+       ap, img, cluster, _ := GetInstanceSet(c, "{}")
        pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
-       c.Assert(err, check.IsNil)
 
        inst, err := ap.Create(cluster.InstanceTypes["tiny"],
                img, map[string]string{
                        "TestTagName": "test tag value",
                }, "umask 0600; echo -n test-file-data >/var/run/test-file", pk)
-
        c.Assert(err, check.IsNil)
 
        tags := inst.Tags()
        c.Check(tags["TestTagName"], check.Equals, "test tag value")
        c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
 
-}
+       if *live == "" {
+               c.Check(ap.client.(*ec2stub).describeKeyPairsCalls, check.HasLen, 1)
+               c.Check(ap.client.(*ec2stub).importKeyPairCalls, check.HasLen, 1)
 
-func (*EC2InstanceSetSuite) TestCreateWithExtraScratch(c *check.C) {
-       ap, img, cluster, err := GetInstanceSet()
-       if err != nil {
-               c.Fatal("Error making provider", err)
+               runcalls := ap.client.(*ec2stub).runInstancesCalls
+               if c.Check(runcalls, check.HasLen, 1) {
+                       c.Check(runcalls[0].MetadataOptions.HttpEndpoint, check.DeepEquals, aws.String("enabled"))
+                       c.Check(runcalls[0].MetadataOptions.HttpTokens, check.DeepEquals, aws.String("required"))
+               }
        }
+}
 
-       pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
-       c.Assert(err, check.IsNil)
-
+func (*EC2InstanceSetSuite) TestCreateWithExtraScratch(c *check.C) {
+       ap, img, cluster, _ := GetInstanceSet(c, "{}")
        inst, err := ap.Create(cluster.InstanceTypes["tiny-with-extra-scratch"],
                img, map[string]string{
                        "TestTagName": "test tag value",
-               }, "umask 0600; echo -n test-file-data >/var/run/test-file", pk)
+               }, "umask 0600; echo -n test-file-data >/var/run/test-file", nil)
 
        c.Assert(err, check.IsNil)
 
@@ -178,16 +299,17 @@ func (*EC2InstanceSetSuite) TestCreateWithExtraScratch(c *check.C) {
        c.Check(tags["TestTagName"], check.Equals, "test tag value")
        c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
 
+       if *live == "" {
+               // Should not have called key pair APIs, because
+               // publickey arg was nil
+               c.Check(ap.client.(*ec2stub).describeKeyPairsCalls, check.HasLen, 0)
+               c.Check(ap.client.(*ec2stub).importKeyPairCalls, check.HasLen, 0)
+       }
 }
 
 func (*EC2InstanceSetSuite) TestCreatePreemptible(c *check.C) {
-       ap, img, cluster, err := GetInstanceSet()
-       if err != nil {
-               c.Fatal("Error making provider", err)
-       }
-
+       ap, img, cluster, _ := GetInstanceSet(c, "{}")
        pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
-       c.Assert(err, check.IsNil)
 
        inst, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"],
                img, map[string]string{
@@ -202,12 +324,171 @@ func (*EC2InstanceSetSuite) TestCreatePreemptible(c *check.C) {
 
 }
 
-func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
-       ap, _, _, err := GetInstanceSet()
-       if err != nil {
-               c.Fatal("Error making provider", err)
+func (*EC2InstanceSetSuite) TestCreateFailoverSecondSubnet(c *check.C) {
+       if *live != "" {
+               c.Skip("not applicable in live mode")
+               return
+       }
+
+       ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-good"]}`)
+       ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
+               "subnet-full": &ec2stubError{
+                       code:    "InsufficientFreeAddressesInSubnet",
+                       message: "subnet is full",
+               },
+       }
+       inst, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+       c.Check(err, check.IsNil)
+       c.Check(inst, check.NotNil)
+       c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
+       metrics := arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="0"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="1"} 1\n`+
+               `.*`)
+
+       // Next RunInstances call should try the working subnet first
+       inst, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+       c.Check(err, check.IsNil)
+       c.Check(inst, check.NotNil)
+       c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 3)
+       metrics = arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="0"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="1"} 2\n`+
+               `.*`)
+}
+
+func (*EC2InstanceSetSuite) TestIsErrorSubnetSpecific(c *check.C) {
+       c.Check(isErrorSubnetSpecific(nil), check.Equals, false)
+       c.Check(isErrorSubnetSpecific(errors.New("misc error")), check.Equals, false)
+
+       c.Check(isErrorSubnetSpecific(&ec2stubError{
+               code: "InsufficientInstanceCapacity",
+       }), check.Equals, true)
+
+       c.Check(isErrorSubnetSpecific(&ec2stubError{
+               code: "InsufficientVolumeCapacity",
+       }), check.Equals, true)
+
+       c.Check(isErrorSubnetSpecific(&ec2stubError{
+               code:    "InsufficientFreeAddressesInSubnet",
+               message: "Not enough free addresses in subnet subnet-abcdefg\n\tstatus code: 400, request id: abcdef01-2345-6789-abcd-ef0123456789",
+       }), check.Equals, true)
+
+       // #21603: (Sometimes?) EC2 returns code InvalidParameterValue
+       // even though the code "InsufficientFreeAddressesInSubnet"
+       // seems like it must be meant for exactly this error.
+       c.Check(isErrorSubnetSpecific(&ec2stubError{
+               code:    "InvalidParameterValue",
+               message: "Not enough free addresses in subnet subnet-abcdefg\n\tstatus code: 400, request id: abcdef01-2345-6789-abcd-ef0123456789",
+       }), check.Equals, true)
+
+       // Similarly, AWS docs
+       // (https://repost.aws/knowledge-center/vpc-insufficient-ip-errors)
+       // suggest the following code/message combinations also exist.
+       c.Check(isErrorSubnetSpecific(&ec2stubError{
+               code:    "Client.InvalidParameterValue",
+               message: "There aren't sufficient free Ipv4 addresses or prefixes",
+       }), check.Equals, true)
+       c.Check(isErrorSubnetSpecific(&ec2stubError{
+               code:    "InvalidParameterValue",
+               message: "There aren't sufficient free Ipv4 addresses or prefixes",
+       }), check.Equals, true)
+       // Meanwhile, other AWS docs
+       // (https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html)
+       // suggest Client.InvalidParameterValue is not a real code but
+       // ClientInvalidParameterValue is.
+       c.Check(isErrorSubnetSpecific(&ec2stubError{
+               code:    "ClientInvalidParameterValue",
+               message: "There aren't sufficient free Ipv4 addresses or prefixes",
+       }), check.Equals, true)
+
+       c.Check(isErrorSubnetSpecific(&ec2stubError{
+               code:    "InvalidParameterValue",
+               message: "Some other invalid parameter error",
+       }), check.Equals, false)
+}
+
+func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
+       if *live != "" {
+               c.Skip("not applicable in live mode")
+               return
+       }
+
+       ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-broken"]}`)
+       ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
+               "subnet-full": &ec2stubError{
+                       code:    "InsufficientFreeAddressesInSubnet",
+                       message: "subnet is full",
+               },
+               "subnet-broken": &ec2stubError{
+                       code:    "InvalidSubnetId.NotFound",
+                       message: "bogus subnet id",
+               },
        }
+       _, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+       c.Check(err, check.NotNil)
+       c.Check(err, check.ErrorMatches, `.*InvalidSubnetId\.NotFound.*`)
+       c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
+       metrics := arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 1\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+               `.*`)
+
+       _, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+       c.Check(err, check.NotNil)
+       c.Check(err, check.ErrorMatches, `.*InsufficientFreeAddressesInSubnet.*`)
+       c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 4)
+       metrics = arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 2\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 2\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+               `.*`)
+}
 
+func (*EC2InstanceSetSuite) TestCreateOneSubnetFailingCapacity(c *check.C) {
+       if *live != "" {
+               c.Skip("not applicable in live mode")
+               return
+       }
+       ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-broken"]}`)
+       ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
+               "subnet-full": &ec2stubError{
+                       code:    "InsufficientFreeAddressesInSubnet",
+                       message: "subnet is full",
+               },
+               "subnet-broken": &ec2stubError{
+                       code:    "InsufficientInstanceCapacity",
+                       message: "insufficient capacity",
+               },
+       }
+       for i := 0; i < 3; i++ {
+               _, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+               c.Check(err, check.NotNil)
+               c.Check(err, check.ErrorMatches, `.*InsufficientInstanceCapacity.*`)
+       }
+       c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 6)
+       metrics := arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 3\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 3\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+               `.*`)
+}
+
+func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
+       ap, _, _, _ := GetInstanceSet(c, "{}")
        l, err := ap.Instances(nil)
        c.Assert(err, check.IsNil)
 
@@ -219,27 +500,23 @@ func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
 }
 
 func (*EC2InstanceSetSuite) TestListInstances(c *check.C) {
-       ap, _, _, err := GetInstanceSet()
-       if err != nil {
-               c.Fatal("Error making provider: ", err)
-       }
-
+       ap, _, _, reg := GetInstanceSet(c, "{}")
        l, err := ap.Instances(nil)
-
        c.Assert(err, check.IsNil)
 
        for _, i := range l {
                tg := i.Tags()
                c.Logf("%v %v %v", i.String(), i.Address(), tg)
        }
+
+       metrics := arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instances{subnet_id="[^"]*"} \d+\n`+
+               `.*`)
 }
 
 func (*EC2InstanceSetSuite) TestDestroyInstances(c *check.C) {
-       ap, _, _, err := GetInstanceSet()
-       if err != nil {
-               c.Fatal("Error making provider", err)
-       }
-
+       ap, _, _, _ := GetInstanceSet(c, "{}")
        l, err := ap.Instances(nil)
        c.Assert(err, check.IsNil)
 
@@ -248,14 +525,94 @@ func (*EC2InstanceSetSuite) TestDestroyInstances(c *check.C) {
        }
 }
 
+func (*EC2InstanceSetSuite) TestInstancePriceHistory(c *check.C) {
+       ap, img, cluster, _ := GetInstanceSet(c, "{}")
+       pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
+       tags := cloud.InstanceTags{"arvados-ec2-driver": "test"}
+
+       defer func() {
+               instances, err := ap.Instances(tags)
+               c.Assert(err, check.IsNil)
+               for _, inst := range instances {
+                       c.Logf("cleanup: destroy instance %s", inst)
+                       c.Check(inst.Destroy(), check.IsNil)
+               }
+       }()
+
+       ap.ec2config.SpotPriceUpdateInterval = arvados.Duration(time.Hour)
+       ap.ec2config.EBSPrice = 0.1 // $/GiB/month
+       inst1, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"], img, tags, "true", pk)
+       c.Assert(err, check.IsNil)
+       defer inst1.Destroy()
+       inst2, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"], img, tags, "true", pk)
+       c.Assert(err, check.IsNil)
+       defer inst2.Destroy()
+
+       // in live mode, we need to wait for the instances to reach
+       // running state before we can discover their availability
+       // zones and look up the appropriate prices.
+       var instances []cloud.Instance
+       for deadline := time.Now().Add(5 * time.Minute); ; {
+               if deadline.Before(time.Now()) {
+                       c.Fatal("timed out")
+               }
+               instances, err = ap.Instances(tags)
+               running := 0
+               for _, inst := range instances {
+                       ec2i := inst.(*ec2Instance).instance
+                       if *ec2i.InstanceLifecycle == "spot" && *ec2i.State.Code&16 != 0 {
+                               running++
+                       }
+               }
+               if running >= 2 {
+                       c.Logf("instances are running, and identifiable as spot instances")
+                       break
+               }
+               c.Logf("waiting for instances to reach running state so their availability zone becomes visible...")
+               time.Sleep(10 * time.Second)
+       }
+
+       for _, inst := range instances {
+               hist := inst.PriceHistory(arvados.InstanceType{})
+               c.Logf("%s price history: %v", inst.ID(), hist)
+               c.Check(len(hist) > 0, check.Equals, true)
+
+               histWithScratch := inst.PriceHistory(arvados.InstanceType{AddedScratch: 640 << 30})
+               c.Logf("%s price history with 640 GiB scratch: %v", inst.ID(), histWithScratch)
+
+               for i, ip := range hist {
+                       c.Check(ip.Price, check.Not(check.Equals), 0.0)
+                       if i > 0 {
+                               c.Check(ip.StartTime.Before(hist[i-1].StartTime), check.Equals, true)
+                       }
+                       c.Check(ip.Price < histWithScratch[i].Price, check.Equals, true)
+               }
+       }
+}
+
 func (*EC2InstanceSetSuite) TestWrapError(c *check.C) {
        retryError := awserr.New("Throttling", "", nil)
        wrapped := wrapError(retryError, &atomic.Value{})
        _, ok := wrapped.(cloud.RateLimitError)
        c.Check(ok, check.Equals, true)
 
-       quotaError := awserr.New("InsufficientInstanceCapacity", "", nil)
+       quotaError := awserr.New("InstanceLimitExceeded", "", nil)
        wrapped = wrapError(quotaError, nil)
        _, ok = wrapped.(cloud.QuotaError)
        c.Check(ok, check.Equals, true)
+
+       for _, trial := range []struct {
+               code string
+               msg  string
+       }{
+               {"InsufficientInstanceCapacity", ""},
+               {"Unsupported", "Your requested instance type (t3.micro) is not supported in your requested Availability Zone (us-east-1e). Please retry your request by not specifying an Availability Zone or choosing us-east-1a, us-east-1b, us-east-1c, us-east-1d, us-east-1f."},
+       } {
+               capacityError := awserr.New(trial.code, trial.msg, nil)
+               wrapped = wrapError(capacityError, nil)
+               caperr, ok := wrapped.(cloud.CapacityError)
+               c.Check(ok, check.Equals, true)
+               c.Check(caperr.IsCapacityError(), check.Equals, true)
+               c.Check(caperr.IsInstanceTypeSpecific(), check.Equals, true)
+       }
 }