Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima@veritasgenetics.com>
#distribution(s)|name|version|iteration|type|architecture|extra fpm arguments
debian8,debian9,centos7|python-gflags|2.0|2|python|all
debian8,debian9,ubuntu1404,ubuntu1604,centos7|google-api-python-client|1.6.2|2|python|all
-debian8,debian9,ubuntu1404,ubuntu1604,centos7|apache-libcloud|2.3.0|3|python|all|--depends 'python-requests >= 2.4.3'
debian8,debian9,ubuntu1404,centos7|oauth2client|1.5.2|2|python|all
debian8,debian9,ubuntu1404,centos7|pyasn1|0.1.7|2|python|all
debian8,debian9,ubuntu1404,centos7|pyasn1-modules|0.0.5|2|python|all
#
# SPDX-License-Identifier: AGPL-3.0
-LIBCLOUD_PIN=2.3.0
+LIBCLOUD_PIN=2.3.1.dev1
-using_fork=false
+using_fork=true
if [[ $using_fork = true ]]; then
LIBCLOUD_PIN_SRC="https://github.com/curoverse/libcloud/archive/apache-libcloud-$LIBCLOUD_PIN.zip"
else
fpm_build $WORKSPACE/tools/crunchstat-summary ${PYTHON2_PKG_PREFIX}-crunchstat-summary 'Curoverse, Inc.' 'python' "$crunchstat_summary_version" "--url=https://arvados.org" "--description=Crunchstat-summary reads Arvados Crunch log files and summarize resource usage" --iteration "$iteration"
fi
-## if libcloud becomes our own fork see
-## https://dev.arvados.org/issues/12268#note-27
+# Forked libcloud
+if test_package_presence "$PYTHON2_PKG_PREFIX"-apache-libcloud "$LIBCLOUD_PIN" python 2
+then
+  # Build the pinned curoverse fork of libcloud from a throwaway checkout.
+  LIBCLOUD_DIR=$(mktemp -d)
+  (
+      # Subshell keeps the cd and the DASHQ_UNLESS_DEBUG override local.
+      cd $LIBCLOUD_DIR
+      git clone $DASHQ_UNLESS_DEBUG https://github.com/curoverse/libcloud.git .
+      git checkout $DASHQ_UNLESS_DEBUG apache-libcloud-$LIBCLOUD_PIN
+      # libcloud is absurdly noisy without -q, so force -q here
+      OLD_DASHQ_UNLESS_DEBUG=$DASHQ_UNLESS_DEBUG
+      DASHQ_UNLESS_DEBUG=-q
+      handle_python_package
+      DASHQ_UNLESS_DEBUG=$OLD_DASHQ_UNLESS_DEBUG
+  )
+
+  # libcloud >= 2.3.0 now requires python-requests 2.4.3 or higher, otherwise
+  # it throws
+  # ImportError: No module named packages.urllib3.poolmanager
+  # when loaded. We only see this problem on ubuntu1404, because that is our
+  # only supported distribution that ships with a python-requests older than
+  # 2.4.3.
+  fpm_build $LIBCLOUD_DIR "$PYTHON2_PKG_PREFIX"-apache-libcloud "" python "" --iteration 2 --depends 'python-requests >= 2.4.3'
+  # Clean up the temporary checkout once the package is built.
+  rm -rf $LIBCLOUD_DIR
+fi
# Python 2 dependencies
declare -a PIP_DOWNLOAD_SWITCHES=(--no-deps)
case it.Scratch < needScratch:
case it.RAM < needRAM:
case it.VCPUs < needVCPUs:
+ case it.Preemptable != ctr.SchedulingParameters.Preemptable:
case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
// Equal price, but worse specs
default:
c.Check(best.Scratch >= 2*GiB, check.Equals, true)
}
}
+
+// TestChoosePreemptable checks that when a container's scheduling
+// parameters request a preemptable instance, ChooseInstanceType picks the
+// cheapest preemptable type satisfying the runtime constraints — the
+// equally priced, equally sized non-preemptable "almost best" entry must
+// lose to the preemptable "best" entry.
+func (*NodeSizeSuite) TestChoosePreemptable(c *check.C) {
+	menu := []arvados.InstanceType{
+		{Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Preemptable: true, Name: "costly"},
+		{Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "almost best"},
+		{Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Preemptable: true, Name: "best"},
+		{Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Preemptable: true, Name: "small"},
+	}
+	best, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
+		Mounts: map[string]arvados.Mount{
+			"/tmp": {Kind: "tmp", Capacity: 2 * GiB},
+		},
+		RuntimeConstraints: arvados.RuntimeConstraints{
+			VCPUs:        2,
+			RAM:          987654321,
+			KeepCacheRAM: 123456789,
+		},
+		SchedulingParameters: arvados.SchedulingParameters{
+			Preemptable: true,
+		},
+	})
+	c.Check(err, check.IsNil)
+	c.Check(best.Name, check.Equals, "best")
+	c.Check(best.RAM >= 1234567890, check.Equals, true)
+	c.Check(best.VCPUs >= 2, check.Equals, true)
+	c.Check(best.Scratch >= 2*GiB, check.Equals, true)
+	c.Check(best.Preemptable, check.Equals, true)
+}
RAM int64
Scratch int64
Price float64
+ Preemptable bool
}
// GetThisSystemNode returns a SystemNode for the node we're running
// SchedulingParameters specify a container's scheduling parameters
// such as Partitions
type SchedulingParameters struct {
-	Partitions []string `json:"partitions"`
+	// Partitions the container is allowed to run on, if any.
+	Partitions []string `json:"partitions"`
+	// Preemptable, if true, allows running the container on a
+	// preemptable cloud instance (e.g. AWS Spot Instance).
+	Preemptable bool `json:"preemptable"`
}
// ContainerList is an arvados#containerList resource.
before_validation :fill_field_defaults, :if => :new_record?
before_validation :validate_runtime_constraints
- before_validation :validate_scheduling_parameters
before_validation :set_container
+ before_validation :set_default_preemptable_scheduling_parameter
validates :command, :container_image, :output_path, :cwd, :presence => true
validates :output_ttl, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
validates :priority, numericality: { only_integer: true, greater_than_or_equal_to: 0, less_than_or_equal_to: 1000 }
+ validate :validate_scheduling_parameters
validate :validate_state_change
validate :check_update_whitelist
validate :secret_mounts_key_conflict
end
end
+  # before_validation callback: when a child container request (one made by
+  # a running container) is being committed and the API server has
+  # preemptable_instances enabled, default scheduling_parameters['preemptable']
+  # to true unless the caller already set it explicitly.
+  def set_default_preemptable_scheduling_parameter
+    if self.state == Committed
+      # If preemptable instances (e.g. AWS Spot Instances) are allowed,
+      # ask them on child containers by default.
+      if Rails.configuration.preemptable_instances and
+        !self.requesting_container_uuid.nil? and
+        self.scheduling_parameters['preemptable'].nil?
+        self.scheduling_parameters['preemptable'] = true
+      end
+    end
+  end
+
def validate_runtime_constraints
case self.state
when Committed
scheduling_parameters['partitions'].size)
errors.add :scheduling_parameters, "partitions must be an array of strings"
end
+ if !Rails.configuration.preemptable_instances and scheduling_parameters['preemptable']
+ errors.add :scheduling_parameters, "preemptable instances are not allowed"
+ end
end
end
### Crunch, DNS & compute node management
###
+ # Preemptable instance support (e.g. AWS Spot Instances)
+ # When true, child containers will get created with the preemptable
+  # scheduling parameter set.
+ preemptable_instances: false
+
# Docker image to be used when none found in runtime_constraints of a job
default_docker_image_for_jobs: false
assert_equal ContainerRequest::Final, cr3.state
end
+ [
+ [false, ActiveRecord::RecordInvalid],
+ [true, nil],
+ ].each do |preemptable_conf, expected|
+ test "having Rails.configuration.preemptable_instances=#{preemptable_conf}, create preemptable container request and verify #{expected}" do
+ sp = {"preemptable" => true}
+ common_attrs = {cwd: "test",
+ priority: 1,
+ command: ["echo", "hello"],
+ output_path: "test",
+ scheduling_parameters: sp,
+ mounts: {"test" => {"kind" => "json"}}}
+ Rails.configuration.preemptable_instances = preemptable_conf
+ set_user_from_auth :active
+
+ cr = create_minimal_req!(common_attrs)
+ cr.state = ContainerRequest::Committed
+
+ if !expected.nil?
+ assert_raises(expected) do
+ cr.save!
+ end
+ else
+ cr.save!
+ assert_equal sp, cr.scheduling_parameters
+ end
+ end
+ end
+
+ [
+ 'zzzzz-dz642-runningcontainr',
+ nil,
+ ].each do |requesting_c|
+ test "having preemptable instances active on the API server, a committed #{requesting_c.nil? ? 'non-':''}child CR should not ask for preemptable instance if parameter already set to false" do
+ common_attrs = {cwd: "test",
+ priority: 1,
+ command: ["echo", "hello"],
+ output_path: "test",
+ scheduling_parameters: {"preemptable" => false},
+ mounts: {"test" => {"kind" => "json"}}}
+
+ Rails.configuration.preemptable_instances = true
+ set_user_from_auth :active
+
+ if requesting_c
+ cr = with_container_auth(Container.find_by_uuid requesting_c) do
+ create_minimal_req!(common_attrs)
+ end
+ assert_not_nil cr.requesting_container_uuid
+ else
+ cr = create_minimal_req!(common_attrs)
+ end
+
+ cr.state = ContainerRequest::Committed
+ cr.save!
+
+ assert_equal false, cr.scheduling_parameters['preemptable']
+ end
+ end
+
+ [
+ [true, 'zzzzz-dz642-runningcontainr', true],
+ [true, nil, nil],
+ [false, 'zzzzz-dz642-runningcontainr', nil],
+ [false, nil, nil],
+ ].each do |preemptable_conf, requesting_c, schedule_preemptable|
+ test "having Rails.configuration.preemptable_instances=#{preemptable_conf}, #{requesting_c.nil? ? 'non-':''}child CR should #{schedule_preemptable ? '':'not'} ask for preemptable instance by default" do
+ common_attrs = {cwd: "test",
+ priority: 1,
+ command: ["echo", "hello"],
+ output_path: "test",
+ mounts: {"test" => {"kind" => "json"}}}
+
+ Rails.configuration.preemptable_instances = preemptable_conf
+ set_user_from_auth :active
+
+ if requesting_c
+ cr = with_container_auth(Container.find_by_uuid requesting_c) do
+ create_minimal_req!(common_attrs)
+ end
+ assert_not_nil cr.requesting_container_uuid
+ else
+ cr = create_minimal_req!(common_attrs)
+ end
+
+ cr.state = ContainerRequest::Committed
+ cr.save!
+
+ assert_equal schedule_preemptable, cr.scheduling_parameters['preemptable']
+ end
+ end
+
[
[{"partitions" => ["fastcpu","vfastcpu", 100]}, ContainerRequest::Committed, ActiveRecord::RecordInvalid],
[{"partitions" => ["fastcpu","vfastcpu", 100]}, ContainerRequest::Uncommitted],
def get_state(self):
"""Get node state, one of ['unpaired', 'busy', 'idle', 'down']."""
+ # If this node's size is invalid (because it has a stale arvados_node_size
+ # tag), return 'down' so that it's properly shut down.
+ if self.cloud_node.size.id == 'invalid':
+ return 'down'
+
# If this node is not associated with an Arvados node, return
# 'unpaired' if we're in the boot grace period, and 'down' if not,
# so it isn't counted towards usable nodes.
try:
kwargs = self.create_kwargs.copy()
kwargs.update(self.arvados_create_kwargs(size, arvados_node))
- kwargs['size'] = size
+ kwargs['size'] = size.real
return self.real.create_node(**kwargs)
except CLOUD_ERRORS as create_error:
# Workaround for bug #6702: sometimes the create node request
def _init_image(self, urn):
return "image", self.get_image(urn)
+    def create_node(self, size, arvados_node):
+        # Set up tag indicating the Arvados assigned Cloud Size id.
+        # setdefault guards against a configuration with no ex_tags at all;
+        # list_nodes() later reads this tag back to recover the size.
+        self.create_kwargs.setdefault('ex_tags', {})
+        self.create_kwargs['ex_tags'].update({'arvados_node_size': size.id})
+        return super(ComputeNodeDriver, self).create_node(size, arvados_node)
+
def list_nodes(self):
# Azure only supports filtering node lists by resource group.
# Do our own filtering based on tag.
# Need to populate Node.size
if not n.size:
n.size = self.sizes[n.extra["properties"]["hardwareProfile"]["vmSize"]]
+ n.extra['arvados_node_size'] = n.extra.get('tags', {}).get('arvados_node_size')
return nodes
def broken(self, cloud_node):
"VolumeSize": volsize,
"VolumeType": "gp2"
}}]
+ if size.preemptable:
+ # Request a Spot instance for this node
+ kw['ex_spot_market'] = True
return kw
def sync_node(self, cloud_node, arvados_node):
self.real.ex_create_tags(cloud_node,
{'Name': arvados_node_fqdn(arvados_node)})
+ def create_node(self, size, arvados_node):
+ # Set up tag indicating the Arvados assigned Cloud Size id.
+ self.create_kwargs['ex_metadata'].update({'arvados_node_size': size.id})
+ return super(ComputeNodeDriver, self).create_node(size, arvados_node)
+
    def list_nodes(self):
        # Need to populate Node.size
        nodes = super(ComputeNodeDriver, self).list_nodes()
        for n in nodes:
            if not n.size:
                n.size = self.sizes[n.extra["instance_type"]]
            # Surface the arvados_node_size tag for the size lookup done by
            # the node monitor; None when the node was created without one.
            n.extra['arvados_node_size'] = n.extra.get('metadata', {}).get('arvados_node_size')
        return nodes
@classmethod
return result
+ def create_node(self, size, arvados_node):
+ # Set up tag indicating the Arvados assigned Cloud Size id.
+ self.create_kwargs['ex_metadata'].update({'arvados_node_size': size.id})
+ return super(ComputeNodeDriver, self).create_node(size, arvados_node)
+
def list_nodes(self):
# The GCE libcloud driver only supports filtering node lists by zone.
# Do our own filtering based on tag list.
if nodelist and not hasattr(nodelist[0].size, 'id'):
for node in nodelist:
node.size = self._sizes_by_id[node.size]
+ node.extra['arvados_node_size'] = node.extra.get('metadata', {}).get('arvados_node_size')
return nodelist
@classmethod
from .baseactor import BaseNodeManagerActor
+from functools import partial
from libcloud.common.types import LibcloudError
from libcloud.common.exceptions import BaseHTTPError
if not self.has_option(sec_name, opt_name):
self.set(sec_name, opt_name, value)
- def get_section(self, section, transformer=None):
+ def get_section(self, section, transformers={}, default_transformer=None):
+ transformer_map = {
+ int: self.getint,
+ bool: self.getboolean,
+ float: self.getfloat,
+ }
result = self._dict()
for key, value in self.items(section):
+ transformer = None
+ if transformers.get(key) in transformer_map:
+ transformer = partial(transformer_map[transformers[key]], section)
+ elif default_transformer in transformer_map:
+ transformer = partial(transformer_map[default_transformer], section)
if transformer is not None:
try:
- value = transformer(value)
+ value = transformer(key)
except (TypeError, ValueError):
pass
result[key] = value
self.get_section('Cloud Create'),
driver_class=driver_class)
-    def node_sizes(self, all_sizes):
+    def node_sizes(self):
        """Finds all acceptable NodeSizes for our installation.
        Returns a list of (NodeSize, kwargs) pairs for each NodeSize object
        returned by libcloud that matches a size listed in our config file.
        """
-
+        # Query the cloud for the available sizes instead of requiring
+        # callers to pass them in.
+        all_sizes = self.new_cloud_client().list_sizes()
        size_kwargs = {}
+        # Per-key type coercions for "Size ..." config sections; other keys
+        # fall back to the int default transformer below.
+        section_types = {
+            'price': float,
+            'preemptable': bool,
+        }
        for sec_name in self.sections():
            sec_words = sec_name.split(None, 2)
            if sec_words[0] != 'Size':
                continue
-            size_spec = self.get_section(sec_name, int)
-            if 'price' in size_spec:
-                size_spec['price'] = float(size_spec['price'])
+            size_spec = self.get_section(sec_name, section_types, int)
+            if 'preemptable' not in size_spec:
+                size_spec['preemptable'] = False
+            if 'instance_type' not in size_spec:
+                # Assume instance type is Size name if missing
+                size_spec['instance_type'] = sec_words[1]
+            # The section name doubles as the Arvados-assigned size id.
+            size_spec['id'] = sec_words[1]
            size_kwargs[sec_words[1]] = size_spec
        # EC2 node sizes are identified by id. GCE sizes are identified by name.
        matching_sizes = []
        for size in all_sizes:
-            if size.id in size_kwargs:
-                matching_sizes.append((size, size_kwargs[size.id]))
-            elif size.name in size_kwargs:
-                matching_sizes.append((size, size_kwargs[size.name]))
+            # NOTE(review): several config sections may share one
+            # instance_type, yielding multiple entries per cloud size
+            # (e.g. a preemptable and a non-preemptable variant).
+            matching_sizes += [
+                (size, size_kwargs[s]) for s in size_kwargs
+                if size_kwargs[s]['instance_type'] == size.id
+                or size_kwargs[s]['instance_type'] == size.name
+            ]
        return matching_sizes
def shutdown_windows(self):
that would best satisfy the jobs, choosing the cheapest size that
satisfies each job, and ignoring jobs that can't be satisfied.
"""
+class InvalidCloudSize(object):
+    """
+    Dummy CloudSizeWrapper-like class, to be used when a cloud node doesn't
+    have a recognizable arvados_node_size tag.
+    """
+    def __init__(self):
+        # 'invalid' is a sentinel id; node state logic treats nodes with
+        # this size as 'down' so they get shut down.
+        self.id = 'invalid'
+        self.name = 'invalid'
+        self.ram = 0
+        self.disk = 0
+        self.scratch = 0
+        self.cores = 0
+        self.bandwidth = 0
+        # Absurdly high price so this size is never selected as cheapest.
+        self.price = 9999999
+        self.preemptable = False
+        self.extra = {}
+
+    def meets_constraints(self, **kwargs):
+        # Never satisfies any job, so such nodes don't count as capacity.
+        return False
+
class CloudSizeWrapper(object):
def __init__(self, real_size, node_mem_scaling, **kwargs):
self.disk = 0
self.scratch = self.disk * 1000
self.ram = int(self.ram * node_mem_scaling)
+ self.preemptable = False
for name, override in kwargs.iteritems():
+ if name == 'instance_type': continue
if not hasattr(self, name):
raise ValueError("unrecognized size field '%s'" % (name,))
setattr(self, name, override)
wants = {'cores': want_value('min_cores_per_node'),
'ram': want_value('min_ram_mb_per_node'),
'scratch': want_value('min_scratch_mb_per_node')}
+ # EC2 node sizes are identified by id. GCE sizes are identified by name.
for size in self.cloud_sizes:
if (size.meets_constraints(**wants) and
- (specified_size is None or size.id == specified_size)):
- return size
+ (specified_size is None or
+ size.id == specified_size or size.name == specified_size)):
+ return size
return None
def servers_for_queue(self, queue):
"Job's min_nodes constraint is greater than the configured "
"max_nodes (%d)" % self.max_nodes)
elif (want_count*cloud_size.price <= self.max_price):
- servers.extend([cloud_size.real] * want_count)
+ servers.extend([cloud_size] * want_count)
else:
unsatisfiable_jobs[job['uuid']] = (
"Job's price (%d) is above system's max_price "
for s in self.cloud_sizes:
if s.id == sizeid:
return s
- return None
+ return InvalidCloudSize()
class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
return root_logger
def build_server_calculator(config):
- cloud_size_list = config.node_sizes(config.new_cloud_client().list_sizes())
+ cloud_size_list = config.node_sizes()
if not cloud_size_list:
abort("No valid node sizes configured")
return ServerCalculator(cloud_size_list,
    def _send_request(self):
        nodes = self._client.list_nodes()
        for n in nodes:
-            # Replace with libcloud NodeSize object with compatible
+            # Replace the libcloud NodeSize object with compatible
            # CloudSizeWrapper object which merges the size info reported from
            # the cloud with size information from the configuration file.
+            # When the arvados_node_size tag is missing or stale, find_size
+            # returns an InvalidCloudSize rather than None.
-            n.size = self._calculator.find_size(n.size.id)
+            n.size = self._calculator.find_size(n.extra['arvados_node_size'])
        return nodes
global all_nodes, create_calls
create_calls += 1
nodeid = "node%i" % create_calls
+ if ex_tags is None:
+ ex_tags = {}
+ ex_tags.update({'arvados_node_size': size.id})
n = Node(nodeid, nodeid, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags})
all_nodes.append(n)
if ex_customdata:
ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0]
if ex_userdata:
ping_url = ex_userdata
- if ex_metadata:
+ elif ex_metadata:
ping_url = ex_metadata["arv-ping-url"]
ping_url += "&instance_id=" + nodeid
ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
auth=auth,
ex_metadata=ex_metadata,
ex_userdata=ex_userdata)
- n.extra = {"launch_time": time.strftime(ARVADOS_TIMEFMT, time.gmtime())[:-1]}
+ n.extra = {
+ "launch_time": time.strftime(ARVADOS_TIMEFMT, time.gmtime())[:-1],
+ "metadata" : {
+ "arvados_node_size": size.id
+ }
+ }
return n
def list_sizes(self, **kwargs):
ex_metadata=ex_metadata)
n.extra = {
"metadata": {
- "items": [{"key": k, "value": v} for k,v in ex_metadata.iteritems()]
+ "items": [{"key": k, "value": v} for k,v in ex_metadata.iteritems()],
+ "arvados_node_size": size.id
},
"zone": "fake"
}
'python-daemon',
'setuptools'
],
+ dependency_links=[
+ "https://github.com/curoverse/libcloud/archive/apache-libcloud-2.3.1.dev1.zip"
+ ],
test_suite='tests',
tests_require=[
'requests',
'pbr<1.7.0',
'mock>=1.0',
- 'apache-libcloud>=2.3',
+ 'apache-libcloud>=2.3.1.dev1',
],
zip_safe=False
)
for an_error, is_cloud_error in errors:
self.driver_mock().create_node.side_effect = an_error
with self.assertRaises(an_error):
- driver.create_node('1', 'id_1')
+ driver.create_node(testutil.MockSize(1), 'id_1')
if is_cloud_error:
error_count += 1
self.assertEqual(error_count, status.tracker.get('create_node_errors'))
driver.create_node(testutil.MockSize(1), arv_node)
create_method = self.driver_mock().create_node
self.assertTrue(create_method.called)
- self.assertEqual(
- {'test':'testvalue'},
- create_method.call_args[1].get('ex_metadata', {'arg': 'missing'})
+ self.assertIn(
+ ('test', 'testvalue'),
+ create_method.call_args[1].get(
+ 'ex_metadata',
+ {'arg': 'missing'}).items()
)
def test_hostname_from_arvados_node(self):
def test_list_sizes(self):
config = self.load_config()
- client = config.new_cloud_client()
- sizes = config.node_sizes(client.list_sizes())
+ sizes = config.node_sizes()
self.assertEqual(1, len(sizes))
size, kwargs = sizes[0]
self.assertEqual('Small', size.name)
self.public_ips = []
self.size = testutil.MockSize(1)
self.state = 0
+ self.extra = {'arvados_node_size': self.size.id}
def build_monitor(self, side_effect, *args, **kwargs):
self.bandwidth = 16 * factor
self.price = float(factor)
self.extra = {}
+ self.real = self
+ self.preemptable = False
def __eq__(self, other):
return self.id == other.id