Merge branch 'master' into 7478-anm-spot-instances
authorLucas Di Pentima <ldipentima@veritasgenetics.com>
Fri, 25 May 2018 15:42:25 +0000 (12:42 -0300)
committerLucas Di Pentima <ldipentima@veritasgenetics.com>
Fri, 25 May 2018 15:42:25 +0000 (12:42 -0300)
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima@veritasgenetics.com>

15 files changed:
build/build.list
build/libcloud-pin.sh
build/run-build-packages.sh
lib/dispatchcloud/node_size.go
lib/dispatchcloud/node_size_test.go
sdk/go/arvados/config.go
sdk/go/arvados/container.go
services/api/app/models/container_request.rb
services/api/config/application.default.yml
services/api/test/unit/container_request_test.rb
services/nodemanager/arvnodeman/computenode/driver/__init__.py
services/nodemanager/arvnodeman/computenode/driver/ec2.py
services/nodemanager/arvnodeman/config.py
services/nodemanager/arvnodeman/jobqueue.py
services/nodemanager/setup.py

index 3d98fafb449b77f19bf074a7c8fa84ba88535d4f..fa1a260c3a225fcbf29bfccb17e364b1f7277000 100644 (file)
@@ -5,7 +5,6 @@
 #distribution(s)|name|version|iteration|type|architecture|extra fpm arguments
 debian8,debian9,centos7|python-gflags|2.0|2|python|all
 debian8,debian9,ubuntu1404,ubuntu1604,centos7|google-api-python-client|1.6.2|2|python|all
-debian8,debian9,ubuntu1404,ubuntu1604,centos7|apache-libcloud|2.3.0|3|python|all|--depends 'python-requests >= 2.4.3'
 debian8,debian9,ubuntu1404,centos7|oauth2client|1.5.2|2|python|all
 debian8,debian9,ubuntu1404,centos7|pyasn1|0.1.7|2|python|all
 debian8,debian9,ubuntu1404,centos7|pyasn1-modules|0.0.5|2|python|all
index cfbba404504e3b7c60d553040fb64c97e3698f77..bb66c6b218c020c5d038c1e5e7b51f8681043db9 100644 (file)
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-LIBCLOUD_PIN=2.3.0
+LIBCLOUD_PIN=2.3.1.dev1
 
-using_fork=false
+using_fork=true
 if [[ $using_fork = true ]]; then
     LIBCLOUD_PIN_SRC="https://github.com/curoverse/libcloud/archive/apache-libcloud-$LIBCLOUD_PIN.zip"
 else
index 63f81832f0abecf4688cef2a65fa16bda31d691e..351d1b2a1f3df666c7fc90df6f456c10522c2dcf 100755 (executable)
@@ -434,8 +434,30 @@ if [[ "$?" == "0" ]]; then
   fpm_build $WORKSPACE/tools/crunchstat-summary ${PYTHON2_PKG_PREFIX}-crunchstat-summary 'Curoverse, Inc.' 'python' "$crunchstat_summary_version" "--url=https://arvados.org" "--description=Crunchstat-summary reads Arvados Crunch log files and summarize resource usage" --iteration "$iteration"
 fi
 
-## if libcloud becomes our own fork see
-## https://dev.arvados.org/issues/12268#note-27
+# Forked libcloud
+if test_package_presence "$PYTHON2_PKG_PREFIX"-apache-libcloud "$LIBCLOUD_PIN" python 2
+then
+  LIBCLOUD_DIR=$(mktemp -d)
+  (
+      cd $LIBCLOUD_DIR
+      git clone $DASHQ_UNLESS_DEBUG https://github.com/curoverse/libcloud.git .
+      git checkout $DASHQ_UNLESS_DEBUG apache-libcloud-$LIBCLOUD_PIN
+      # libcloud is absurdly noisy without -q, so force -q here
+      OLD_DASHQ_UNLESS_DEBUG=$DASHQ_UNLESS_DEBUG
+      DASHQ_UNLESS_DEBUG=-q
+      handle_python_package
+      DASHQ_UNLESS_DEBUG=$OLD_DASHQ_UNLESS_DEBUG
+  )
+
+  # libcloud >= 2.3.0 now requires python-requests 2.4.3 or higher, otherwise
+  # it throws
+  #   ImportError: No module named packages.urllib3.poolmanager
+  # when loaded. We only see this problem on ubuntu1404, because that is our
+  # only supported distribution that ships with a python-requests older than
+  # 2.4.3.
+  fpm_build $LIBCLOUD_DIR "$PYTHON2_PKG_PREFIX"-apache-libcloud "" python "" --iteration 2 --depends 'python-requests >= 2.4.3'
+  rm -rf $LIBCLOUD_DIR
+fi
 
 # Python 2 dependencies
 declare -a PIP_DOWNLOAD_SWITCHES=(--no-deps)
index 2ca405060390c65df2f961f7c7a83e5a278d0687..f145901c32fd5a51e50b55df7b91268de6969c93 100644 (file)
@@ -47,6 +47,7 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
                case it.Scratch < needScratch:
                case it.RAM < needRAM:
                case it.VCPUs < needVCPUs:
+               case it.Preemptable != ctr.SchedulingParameters.Preemptable:
                case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
                        // Equal price, but worse specs
                default:
index 0c02a0e3e1be45bfeb6b2371287a4ce664de1d98..9a5e704695138cb54adb6791dcf3f61f8eb5e3ba 100644 (file)
@@ -91,3 +91,31 @@ func (*NodeSizeSuite) TestChoose(c *check.C) {
                c.Check(best.Scratch >= 2*GiB, check.Equals, true)
        }
 }
+
+func (*NodeSizeSuite) TestChoosePreemptable(c *check.C) {
+       menu := []arvados.InstanceType{
+               {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Preemptable: true, Name: "costly"},
+               {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "almost best"},
+               {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Preemptable: true, Name: "best"},
+               {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Preemptable: true, Name: "small"},
+       }
+       best, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
+               Mounts: map[string]arvados.Mount{
+                       "/tmp": {Kind: "tmp", Capacity: 2 * GiB},
+               },
+               RuntimeConstraints: arvados.RuntimeConstraints{
+                       VCPUs:        2,
+                       RAM:          987654321,
+                       KeepCacheRAM: 123456789,
+               },
+               SchedulingParameters: arvados.SchedulingParameters{
+                       Preemptable: true,
+               },
+       })
+       c.Check(err, check.IsNil)
+       c.Check(best.Name, check.Equals, "best")
+       c.Check(best.RAM >= 1234567890, check.Equals, true)
+       c.Check(best.VCPUs >= 2, check.Equals, true)
+       c.Check(best.Scratch >= 2*GiB, check.Equals, true)
+       c.Check(best.Preemptable, check.Equals, true)
+}
index 9ed0eacf23e6d753c1b6c2a0f781282c96dde8cc..b0c7069cd98007ec0f24ad45cb809c137784eaae 100644 (file)
@@ -62,6 +62,7 @@ type InstanceType struct {
        RAM          int64
        Scratch      int64
        Price        float64
+       Preemptable  bool
 }
 
 // GetThisSystemNode returns a SystemNode for the node we're running
index daafc4995448524f7fe3794b9facd13e01480823..e71bcd5d0da5cdeaebdc6dfd3be05cd81d681d58 100644 (file)
@@ -52,7 +52,8 @@ type RuntimeConstraints struct {
 // SchedulingParameters specify a container's scheduling parameters
 // such as Partitions
 type SchedulingParameters struct {
-       Partitions []string `json:"partitions"`
+       Partitions  []string `json:"partitions"`
+       Preemptable bool     `json:"preemptable"`
 }
 
 // ContainerList is an arvados#containerList resource.
index ac4415bf2e618c0e8ef4c6fc6a06d9191cb91db4..5c5a043a0563a6c3013c1697b02afcbfc6ee02f1 100644 (file)
@@ -223,6 +223,13 @@ class ContainerRequest < ArvadosModel
             scheduling_parameters['partitions'].size)
             errors.add :scheduling_parameters, "partitions must be an array of strings"
       end
+      if !self.scheduling_parameters.include?('preemptable')
+        if !self.requesting_container_uuid.nil? and Rails.configuration.preemptable_instances
+          self.scheduling_parameters['preemptable'] = true
+        end
+      elsif scheduling_parameters['preemptable'] and self.requesting_container_uuid.nil?
+        errors.add :scheduling_parameters, "only child containers can be preemptable"
+      end
     end
   end
 
index a1c35f10fcf1f9e1aae9ead9bf1cda00b5f2535a..19b6f9b25058b3d418e37126220287d2511aabda 100644 (file)
@@ -289,6 +289,11 @@ common:
   ### Crunch, DNS & compute node management
   ###
 
+  # Preemptable instance support (e.g. AWS Spot Instances)
+  # When true, child containers will get created with the preemptable
+  # scheduling parameter parameter set.
+  preemptable_instances: false
+
   # Docker image to be used when none found in runtime_constraints of a job
   default_docker_image_for_jobs: false
 
index 3483b874c6c71cd4db6185df6e600eca1c4169f0..5bca02a967691ad328a7dee9d999badc3a8cc6aa 100644 (file)
@@ -757,6 +757,75 @@ class ContainerRequestTest < ActiveSupport::TestCase
     assert_equal ContainerRequest::Final, cr3.state
   end
 
+  [
+    [{"preemptable" => true}, nil, ActiveRecord::RecordInvalid],
+    [{"preemptable" => false}, nil, nil],
+    [{"preemptable" => true}, 'zzzzz-dz642-runningcontainr', nil],
+    [{"preemptable" => false}, 'zzzzz-dz642-runningcontainr', nil],
+  ].each do |sp, requesting_c, expected|
+    test "create container request with scheduling_parameters #{sp} with requesting_container_uuid=#{requesting_c} and verify #{expected}" do
+      common_attrs = {cwd: "test",
+                      priority: 1,
+                      command: ["echo", "hello"],
+                      output_path: "test",
+                      scheduling_parameters: sp,
+                      mounts: {"test" => {"kind" => "json"}}}
+
+      set_user_from_auth :active
+
+      if requesting_c
+        cr = with_container_auth(Container.find_by_uuid requesting_c) do
+          create_minimal_req!(common_attrs)
+        end
+        assert_not_nil cr.requesting_container_uuid
+      else
+        cr = create_minimal_req!(common_attrs)
+      end
+
+      cr.state = ContainerRequest::Committed
+      if !expected.nil?
+        assert_raises(expected) do
+          cr.save!
+        end
+      else
+        cr.save!
+        assert_equal sp, cr.scheduling_parameters
+      end
+    end
+  end
+
+  [
+    [true, 'zzzzz-dz642-runningcontainr', true],
+    [true, nil, nil],
+    [false, 'zzzzz-dz642-runningcontainr', nil],
+    [false, nil, nil],
+  ].each do |preemptable_conf, requesting_c, schedule_preemptable|
+    test "having Rails.configuration.preemptable_instances=#{preemptable_conf}, #{requesting_c.nil? ? 'non-':''}child CR should #{schedule_preemptable ? '':'not'} ask for preemptable instance by default" do
+      common_attrs = {cwd: "test",
+                      priority: 1,
+                      command: ["echo", "hello"],
+                      output_path: "test",
+                      mounts: {"test" => {"kind" => "json"}}}
+
+      Rails.configuration.preemptable_instances = preemptable_conf
+      set_user_from_auth :active
+
+      if requesting_c
+        cr = with_container_auth(Container.find_by_uuid requesting_c) do
+          create_minimal_req!(common_attrs)
+        end
+        assert_not_nil cr.requesting_container_uuid
+      else
+        cr = create_minimal_req!(common_attrs)
+      end
+
+      cr.state = ContainerRequest::Committed
+      cr.save!
+
+      assert_equal schedule_preemptable, cr.scheduling_parameters['preemptable']
+    end
+  end
+
   [
     [{"partitions" => ["fastcpu","vfastcpu", 100]}, ContainerRequest::Committed, ActiveRecord::RecordInvalid],
     [{"partitions" => ["fastcpu","vfastcpu", 100]}, ContainerRequest::Uncommitted],
index 7ed7435553647fdc55958337e6d2461345c4098d..9e38d13eb7f4788d8af485a7e5b4b6589c9f324c 100644 (file)
@@ -174,7 +174,7 @@ class BaseComputeNodeDriver(RetryMixin):
         try:
             kwargs = self.create_kwargs.copy()
             kwargs.update(self.arvados_create_kwargs(size, arvados_node))
-            kwargs['size'] = size
+            kwargs['size'] = size.real
             return self.real.create_node(**kwargs)
         except CLOUD_ERRORS as create_error:
             # Workaround for bug #6702: sometimes the create node request
index 9300645c38f47b74d780e605d32e37134df0c15a..1442a1245b2fa795fec159985fc3332c70116196 100644 (file)
@@ -91,6 +91,9 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
                     "VolumeSize": volsize,
                     "VolumeType": "gp2"
                 }}]
+        if size.preemptable:
+            # Request a Spot instance for this node
+            kw['ex_spot_market'] = True
         return kw
 
     def sync_node(self, cloud_node, arvados_node):
index e47f9fcb1d036b78f94af0af25e8c37dc17b5ad0..86550468425f44fb07d120e5cea7d1c572807e35 100644 (file)
@@ -17,6 +17,7 @@ from apiclient import errors as apierror
 
 from .baseactor import BaseNodeManagerActor
 
+from functools import partial
 from libcloud.common.types import LibcloudError
 from libcloud.common.exceptions import BaseHTTPError
 
@@ -69,12 +70,22 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
                 if not self.has_option(sec_name, opt_name):
                     self.set(sec_name, opt_name, value)
 
-    def get_section(self, section, transformer=None):
+    def get_section(self, section, transformers={}, default_transformer=None):
+        transformer_map = {
+            int: self.getint,
+            bool: self.getboolean,
+            float: self.getfloat,
+        }
         result = self._dict()
         for key, value in self.items(section):
+            transformer = None
+            if transformers.get(key) in transformer_map:
+                transformer = partial(transformer_map[transformers[key]], section)
+            elif default_transformer in transformer_map:
+                transformer = partial(transformer_map[default_transformer], section)
             if transformer is not None:
                 try:
-                    value = transformer(value)
+                    value = transformer(key)
                 except (TypeError, ValueError):
                     pass
             result[key] = value
@@ -136,21 +147,29 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
         """
 
         size_kwargs = {}
+        section_types = {
+            'price': float,
+            'preemptable': bool,
+        }
         for sec_name in self.sections():
             sec_words = sec_name.split(None, 2)
             if sec_words[0] != 'Size':
                 continue
-            size_spec = self.get_section(sec_name, int)
-            if 'price' in size_spec:
-                size_spec['price'] = float(size_spec['price'])
+            size_spec = self.get_section(sec_name, section_types, int)
+            if 'preemptable' not in size_spec:
+                size_spec['preemptable'] = False
+            if 'instance_type' not in size_spec:
+                # Assume instance type is Size name is missing
+                size_spec['instance_type'] = sec_words[1]
             size_kwargs[sec_words[1]] = size_spec
         # EC2 node sizes are identified by id. GCE sizes are identified by name.
         matching_sizes = []
         for size in all_sizes:
-            if size.id in size_kwargs:
-                matching_sizes.append((size, size_kwargs[size.id]))
-            elif size.name in size_kwargs:
-                matching_sizes.append((size, size_kwargs[size.name]))
+            matching_sizes += [
+                (size, size_kwargs[s]) for s in size_kwargs
+                if size_kwargs[s]['instance_type'] == size.id
+                or size_kwargs[s]['instance_type'] == size.name
+            ]
         return matching_sizes
 
     def shutdown_windows(self):
index 90b32290b76932fa93dbb1ff0854aeb2219eaf4c..6ca28e62397384bd853583ecab1f25de52dbbc2d 100644 (file)
@@ -38,6 +38,8 @@ class ServerCalculator(object):
                 self.disk = 0
             self.scratch = self.disk * 1000
             self.ram = int(self.ram * node_mem_scaling)
+            self.preemptable = False
+            self.instance_type = None
             for name, override in kwargs.iteritems():
                 if not hasattr(self, name):
                     raise ValueError("unrecognized size field '%s'" % (name,))
@@ -80,10 +82,12 @@ class ServerCalculator(object):
         wants = {'cores': want_value('min_cores_per_node'),
                  'ram': want_value('min_ram_mb_per_node'),
                  'scratch': want_value('min_scratch_mb_per_node')}
+        # EC2 node sizes are identified by id. GCE sizes are identified by name.
         for size in self.cloud_sizes:
             if (size.meets_constraints(**wants) and
-                (specified_size is None or size.id == specified_size)):
-                    return size
+                (specified_size is None or
+                    size.id == specified_size or size.name == specified_size)):
+                        return size
         return None
 
     def servers_for_queue(self, queue):
@@ -101,7 +105,7 @@ class ServerCalculator(object):
                     "Job's min_nodes constraint is greater than the configured "
                     "max_nodes (%d)" % self.max_nodes)
             elif (want_count*cloud_size.price <= self.max_price):
-                servers.extend([cloud_size.real] * want_count)
+                servers.extend([cloud_size] * want_count)
             else:
                 unsatisfiable_jobs[job['uuid']] = (
                     "Job's price (%d) is above system's max_price "
index 3b8502c0535ef14777af2e211162d7774714c27d..3d6b15c421d3028fc02b0bbf1ad51f36b66d96da 100644 (file)
@@ -42,12 +42,15 @@ setup(name='arvados-node-manager',
           'python-daemon',
           'setuptools'
       ],
+      dependency_links=[
+          "https://github.com/curoverse/libcloud/archive/apache-libcloud-2.3.1.dev1.zip"
+      ],
       test_suite='tests',
       tests_require=[
           'requests',
           'pbr<1.7.0',
           'mock>=1.0',
-          'apache-libcloud>=2.3',
+          'apache-libcloud>=2.3.1.dev1',
       ],
       zip_safe=False
       )