// Copyright (C) The Arvados Authors. All rights reserved.
//
// SPDX-License-Identifier: AGPL-3.0
import (
	"crypto/md5"
	"crypto/rsa"
	"crypto/sha1"
	"crypto/x509"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"math/big"
	"sync"
	"sync/atomic"
	"time"

	"git.arvados.org/arvados.git/lib/cloud"
	"git.arvados.org/arvados.git/sdk/go/arvados"
	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/awserr"
	"github.com/aws/aws-sdk-go/aws/credentials"
	"github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
	"github.com/aws/aws-sdk-go/aws/ec2metadata"
	"github.com/aws/aws-sdk-go/aws/request"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/ec2"
	"github.com/sirupsen/logrus"
	"golang.org/x/crypto/ssh"
)
34 // Driver is the ec2 implementation of the cloud.Driver interface.
35 var Driver = cloud.DriverFunc(newEC2InstanceSet)
// Lower and upper bounds that wrapError applies to the retry delay it
// reports after a throttled EC2 API call.
const (
	throttleDelayMin = time.Second
	throttleDelayMax = time.Minute
)
42 type ec2InstanceSetConfig struct {
44 SecretAccessKey string
46 SecurityGroupIDs arvados.StringSet
50 IAMInstanceProfile string
53 type ec2Interface interface {
54 DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error)
55 ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error)
56 RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error)
57 DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error)
58 CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error)
59 TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error)
62 type ec2InstanceSet struct {
63 ec2config ec2InstanceSetConfig
64 instanceSetID cloud.InstanceSetID
65 logger logrus.FieldLogger
68 keys map[string]string
69 throttleDelayCreate atomic.Value
70 throttleDelayInstances atomic.Value
73 func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) {
74 instanceSet := &ec2InstanceSet{
75 instanceSetID: instanceSetID,
78 err = json.Unmarshal(config, &instanceSet.ec2config)
83 sess, err := session.NewSession()
87 // First try any static credentials, fall back to an IAM instance profile/role
88 creds := credentials.NewChainCredentials(
89 []credentials.Provider{
90 &credentials.StaticProvider{Value: credentials.Value{AccessKeyID: instanceSet.ec2config.AccessKeyID, SecretAccessKey: instanceSet.ec2config.SecretAccessKey}},
91 &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(sess)},
94 awsConfig := aws.NewConfig().WithCredentials(creds).WithRegion(instanceSet.ec2config.Region)
95 instanceSet.client = ec2.New(session.Must(session.NewSession(awsConfig)))
96 instanceSet.keys = make(map[string]string)
97 if instanceSet.ec2config.EBSVolumeType == "" {
98 instanceSet.ec2config.EBSVolumeType = "gp2"
100 return instanceSet, nil
103 func awsKeyFingerprint(pk ssh.PublicKey) (md5fp string, sha1fp string, err error) {
104 // AWS key fingerprints don't use the usual key fingerprint
105 // you get from ssh-keygen or ssh.FingerprintLegacyMD5()
106 // (you can get that from md5.Sum(pk.Marshal())
108 // AWS uses the md5 or sha1 of the PKIX DER encoding of the
109 // public key, so calculate those fingerprints here.
115 if err := ssh.Unmarshal(pk.Marshal(), &rsaPub); err != nil {
116 return "", "", fmt.Errorf("agent: Unmarshal failed to parse public key: %v", err)
118 rsaPk := rsa.PublicKey{
119 E: int(rsaPub.E.Int64()),
122 pkix, _ := x509.MarshalPKIXPublicKey(&rsaPk)
123 md5pkix := md5.Sum([]byte(pkix))
124 sha1pkix := sha1.Sum([]byte(pkix))
127 for i := 0; i < len(md5pkix); i++ {
128 md5fp += fmt.Sprintf(":%02x", md5pkix[i])
130 for i := 0; i < len(sha1pkix); i++ {
131 sha1fp += fmt.Sprintf(":%02x", sha1pkix[i])
133 return md5fp[1:], sha1fp[1:], nil
136 func (instanceSet *ec2InstanceSet) Create(
137 instanceType arvados.InstanceType,
138 imageID cloud.ImageID,
139 newTags cloud.InstanceTags,
140 initCommand cloud.InitCommand,
141 publicKey ssh.PublicKey) (cloud.Instance, error) {
143 md5keyFingerprint, sha1keyFingerprint, err := awsKeyFingerprint(publicKey)
145 return nil, fmt.Errorf("Could not make key fingerprint: %v", err)
147 instanceSet.keysMtx.Lock()
150 if keyname, ok = instanceSet.keys[md5keyFingerprint]; !ok {
151 keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{
152 Filters: []*ec2.Filter{{
153 Name: aws.String("fingerprint"),
154 Values: []*string{&md5keyFingerprint, &sha1keyFingerprint},
158 return nil, fmt.Errorf("Could not search for keypair: %v", err)
161 if len(keyout.KeyPairs) > 0 {
162 keyname = *(keyout.KeyPairs[0].KeyName)
164 keyname = "arvados-dispatch-keypair-" + md5keyFingerprint
165 _, err := instanceSet.client.ImportKeyPair(&ec2.ImportKeyPairInput{
167 PublicKeyMaterial: ssh.MarshalAuthorizedKey(publicKey),
170 return nil, fmt.Errorf("Could not import keypair: %v", err)
173 instanceSet.keys[md5keyFingerprint] = keyname
175 instanceSet.keysMtx.Unlock()
177 ec2tags := []*ec2.Tag{}
178 for k, v := range newTags {
179 ec2tags = append(ec2tags, &ec2.Tag{
181 Value: aws.String(v),
186 for sg := range instanceSet.ec2config.SecurityGroupIDs {
187 groups = append(groups, sg)
190 rii := ec2.RunInstancesInput{
191 ImageId: aws.String(string(imageID)),
192 InstanceType: &instanceType.ProviderType,
193 MaxCount: aws.Int64(1),
194 MinCount: aws.Int64(1),
197 NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
199 AssociatePublicIpAddress: aws.Bool(false),
200 DeleteOnTermination: aws.Bool(true),
201 DeviceIndex: aws.Int64(0),
202 Groups: aws.StringSlice(groups),
203 SubnetId: &instanceSet.ec2config.SubnetID,
205 DisableApiTermination: aws.Bool(false),
206 InstanceInitiatedShutdownBehavior: aws.String("terminate"),
207 TagSpecifications: []*ec2.TagSpecification{
209 ResourceType: aws.String("instance"),
212 UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))),
215 if instanceType.AddedScratch > 0 {
216 rii.BlockDeviceMappings = []*ec2.BlockDeviceMapping{{
217 DeviceName: aws.String("/dev/xvdt"),
218 Ebs: &ec2.EbsBlockDevice{
219 DeleteOnTermination: aws.Bool(true),
220 VolumeSize: aws.Int64((int64(instanceType.AddedScratch) + (1<<30 - 1)) >> 30),
221 VolumeType: &instanceSet.ec2config.EBSVolumeType,
225 if instanceType.Preemptible {
226 rii.InstanceMarketOptions = &ec2.InstanceMarketOptionsRequest{
227 MarketType: aws.String("spot"),
228 SpotOptions: &ec2.SpotMarketOptions{
229 InstanceInterruptionBehavior: aws.String("terminate"),
230 MaxPrice: aws.String(fmt.Sprintf("%v", instanceType.Price)),
234 if instanceSet.ec2config.IAMInstanceProfile != "" {
235 rii.IamInstanceProfile = &ec2.IamInstanceProfileSpecification{
236 Name: aws.String(instanceSet.ec2config.IAMInstanceProfile),
240 rsv, err := instanceSet.client.RunInstances(&rii)
241 err = wrapError(err, &instanceSet.throttleDelayCreate)
247 provider: instanceSet,
248 instance: rsv.Instances[0],
252 func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances []cloud.Instance, err error) {
253 var filters []*ec2.Filter
254 for k, v := range tags {
255 filters = append(filters, &ec2.Filter{
256 Name: aws.String("tag:" + k),
257 Values: []*string{aws.String(v)},
260 dii := &ec2.DescribeInstancesInput{Filters: filters}
262 dio, err := instanceSet.client.DescribeInstances(dii)
263 err = wrapError(err, &instanceSet.throttleDelayInstances)
268 for _, rsv := range dio.Reservations {
269 for _, inst := range rsv.Instances {
270 if *inst.State.Name != "shutting-down" && *inst.State.Name != "terminated" {
271 instances = append(instances, &ec2Instance{instanceSet, inst})
275 if dio.NextToken == nil {
276 return instances, err
278 dii.NextToken = dio.NextToken
282 func (instanceSet *ec2InstanceSet) Stop() {
285 type ec2Instance struct {
286 provider *ec2InstanceSet
287 instance *ec2.Instance
290 func (inst *ec2Instance) ID() cloud.InstanceID {
291 return cloud.InstanceID(*inst.instance.InstanceId)
294 func (inst *ec2Instance) String() string {
295 return *inst.instance.InstanceId
298 func (inst *ec2Instance) ProviderType() string {
299 return *inst.instance.InstanceType
302 func (inst *ec2Instance) SetTags(newTags cloud.InstanceTags) error {
303 var ec2tags []*ec2.Tag
304 for k, v := range newTags {
305 ec2tags = append(ec2tags, &ec2.Tag{
307 Value: aws.String(v),
311 _, err := inst.provider.client.CreateTags(&ec2.CreateTagsInput{
312 Resources: []*string{inst.instance.InstanceId},
319 func (inst *ec2Instance) Tags() cloud.InstanceTags {
320 tags := make(map[string]string)
322 for _, t := range inst.instance.Tags {
323 tags[*t.Key] = *t.Value
329 func (inst *ec2Instance) Destroy() error {
330 _, err := inst.provider.client.TerminateInstances(&ec2.TerminateInstancesInput{
331 InstanceIds: []*string{inst.instance.InstanceId},
336 func (inst *ec2Instance) Address() string {
337 if inst.instance.PrivateIpAddress != nil {
338 return *inst.instance.PrivateIpAddress
343 func (inst *ec2Instance) RemoteUser() string {
344 return inst.provider.ec2config.AdminUsername
347 func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
348 return cloud.ErrNotImplemented
// rateLimitError wraps an error from a throttled API call together
// with the earliest time at which the call should be retried.
type rateLimitError struct {
	error
	earliestRetry time.Time
}

// EarliestRetry returns the time before which the failed call should
// not be retried.
func (err rateLimitError) EarliestRetry() time.Time {
	return err.earliestRetry
}
// isCodeCapacity lists the AWS error codes that isErrorCapacity
// treats as capacity errors.
var isCodeCapacity = map[string]bool{
	"InsufficientInstanceCapacity": true,
	"VcpuLimitExceeded":            true,
	"MaxSpotInstanceCountExceeded": true,
}
366 // isErrorCapacity returns whether the error is to be throttled based on its code.
367 // Returns false if error is nil.
368 func isErrorCapacity(err error) bool {
369 if aerr, ok := err.(awserr.Error); ok && aerr != nil {
370 if _, ok := isCodeCapacity[aerr.Code()]; ok {
// ec2QuotaError wraps an error that wrapError classified as a
// capacity error.
type ec2QuotaError struct {
	error
}

// IsQuotaError always returns true.
func (er *ec2QuotaError) IsQuotaError() bool {
	return true
}
385 func wrapError(err error, throttleValue *atomic.Value) error {
386 if request.IsErrorThrottle(err) {
387 // Back off exponentially until an upstream call
388 // either succeeds or returns a non-throttle error.
389 d, _ := throttleValue.Load().(time.Duration)
390 d = d*3/2 + time.Second
391 if d < throttleDelayMin {
393 } else if d > throttleDelayMax {
396 throttleValue.Store(d)
397 return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
398 } else if isErrorCapacity(err) {
399 return &ec2QuotaError{err}
400 } else if err != nil {
401 throttleValue.Store(time.Duration(0))
404 throttleValue.Store(time.Duration(0))