7475: Merge branch 'master' into 7475-nodemgr-unsatisfiable-job-comms
Author:     Lucas Di Pentima <lucas@curoverse.com>
AuthorDate: Sat, 29 Jul 2017 14:11:43 +0000 (11:11 -0300)
Commit:     Lucas Di Pentima <lucas@curoverse.com>
CommitDate: Sat, 29 Jul 2017 14:11:43 +0000 (11:11 -0300)
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <lucas@curoverse.com>

sdk/python/arvados/util.py
services/nodemanager/arvnodeman/jobqueue.py
services/nodemanager/tests/integration_test.py
services/nodemanager/tests/test_jobqueue.py

diff --git a/sdk/python/arvados/util.py b/sdk/python/arvados/util.py
index 97e1d26d2ba2e1cf7ad503ab80b2974676c87cf8..1a973586051769e816103553e22326839a0c3670 100644
@@ -24,6 +24,8 @@ collection_uuid_pattern = re.compile(r'[a-z0-9]{5}-4zz18-[a-z0-9]{15}')
 group_uuid_pattern = re.compile(r'[a-z0-9]{5}-j7d0g-[a-z0-9]{15}')
 user_uuid_pattern = re.compile(r'[a-z0-9]{5}-tpzed-[a-z0-9]{15}')
 link_uuid_pattern = re.compile(r'[a-z0-9]{5}-o0j2j-[a-z0-9]{15}')
+job_uuid_pattern = re.compile(r'[a-z0-9]{5}-8i9sb-[a-z0-9]{15}')
+container_uuid_pattern = re.compile(r'[a-z0-9]{5}-dz642-[a-z0-9]{15}')
 manifest_pattern = re.compile(r'((\S+)( +[a-f0-9]{32}(\+\d+)(\+\S+)*)+( +\d+:\d+:\S+)+$)+', flags=re.MULTILINE)
 
 def clear_tmpdir(path=None):
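
The two added patterns make it possible to tell crunch jobs (type code
8i9sb) apart from containers (type code dz642) by UUID alone, which the
node manager change below relies on to pick a cancellation method. A
minimal sketch of dispatching on them (classify() is illustrative, not
part of the SDK):

    import re

    job_uuid_pattern = re.compile(r'[a-z0-9]{5}-8i9sb-[a-z0-9]{15}')
    container_uuid_pattern = re.compile(r'[a-z0-9]{5}-dz642-[a-z0-9]{15}')

    def classify(uuid):
        # Report which kind of work unit a UUID names, or None.
        if container_uuid_pattern.match(uuid):
            return 'container'
        elif job_uuid_pattern.match(uuid):
            return 'job'
        return None

    assert classify('zzzzz-8i9sb-zzzzzzzzzzzzzzz') == 'job'
    assert classify('yyyyy-dz642-yyyyyyyyyyyyyyy') == 'container'
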
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index ca914e1096def7d28a9be41e90ffcbac2d01d203..fa1ea983015f0e041ea258bba085eabda1924d77 100644
@@ -8,9 +8,12 @@ from __future__ import absolute_import, print_function
 import logging
 import subprocess
 
+import arvados.util
+
 from . import clientactor
 from .config import ARVADOS_ERRORS
 
+
 class ServerCalculator(object):
     """Generate cloud server wishlists from an Arvados job queue.
 
@@ -58,7 +61,6 @@ class ServerCalculator(object):
         self.max_nodes = max_nodes or float('inf')
         self.max_price = max_price or float('inf')
         self.logger = logging.getLogger('arvnodeman.jobqueue')
-        self.logged_jobs = set()
 
         self.logger.info("Using cloud node sizes:")
         for s in self.cloud_sizes:
@@ -83,20 +85,26 @@ class ServerCalculator(object):
 
     def servers_for_queue(self, queue):
         servers = []
-        seen_jobs = set()
+        unsatisfiable_jobs = {}
         for job in queue:
-            seen_jobs.add(job['uuid'])
             constraints = job['runtime_constraints']
             want_count = max(1, self.coerce_int(constraints.get('min_nodes'), 1))
             cloud_size = self.cloud_size_for_constraints(constraints)
             if cloud_size is None:
-                if job['uuid'] not in self.logged_jobs:
-                    self.logged_jobs.add(job['uuid'])
-                    self.logger.debug("job %s not satisfiable", job['uuid'])
-            elif (want_count <= self.max_nodes) and (want_count*cloud_size.price <= self.max_price):
+                unsatisfiable_jobs[job['uuid']] = (
+                    'Requirements for a single node exceed the available '
+                    'cloud node size')
+            elif (want_count > self.max_nodes):
+                unsatisfiable_jobs[job['uuid']] = (
+                    "Job's min_nodes constraint is greater than the configured "
+                    "max_nodes (%d)" % self.max_nodes)
+            elif (want_count*cloud_size.price <= self.max_price):
                 servers.extend([cloud_size.real] * want_count)
-        self.logged_jobs.intersection_update(seen_jobs)
-        return servers
+            else:
+                unsatisfiable_jobs[job['uuid']] = (
+                    "Job's price (%s) is above system's max_price "
+                    "limit (%s)" % (want_count*cloud_size.price, self.max_price))
+        return (servers, unsatisfiable_jobs)
 
     def cheapest_size(self):
         return self.cloud_sizes[0]
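
servers_for_queue() now returns a (wishlist, unsatisfiable_jobs) pair
instead of a bare list, so every caller must unpack both values. A
self-contained sketch of the new contract using a stand-in calculator
(FakeCalculator and its hardcoded limit are illustrative only):

    class FakeCalculator(object):
        # Stand-in mimicking the new servers_for_queue() contract.
        MAX_NODES = 10

        def servers_for_queue(self, queue):
            servers, unsatisfiable_jobs = [], {}
            for job in queue:
                want = job['runtime_constraints'].get('min_nodes', 1)
                if want > self.MAX_NODES:
                    unsatisfiable_jobs[job['uuid']] = (
                        "Job's min_nodes constraint is greater than the "
                        "configured max_nodes (%d)" % self.MAX_NODES)
                else:
                    servers.extend(['small'] * want)
            return (servers, unsatisfiable_jobs)

    queue = [
        {'uuid': 'zzzzz-8i9sb-000000000000000',
         'runtime_constraints': {'min_nodes': 2}},
        {'uuid': 'zzzzz-8i9sb-000000000000001',
         'runtime_constraints': {'min_nodes': 500}},
    ]
    servers, unsatisfiable = FakeCalculator().servers_for_queue(queue)
    assert servers == ['small', 'small']
    assert list(unsatisfiable) == ['zzzzz-8i9sb-000000000000001']
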
@@ -107,6 +115,7 @@ class ServerCalculator(object):
                 return s
         return None
 
+
 class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
     """Actor to generate server wishlists from the job queue.
 
@@ -165,7 +174,28 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
         return queuelist
 
     def _got_response(self, queue):
-        server_list = self._calculator.servers_for_queue(queue)
+        server_list, unsatisfiable_jobs = self._calculator.servers_for_queue(queue)
+        # Cancel any job/container with unsatisfiable requirements, emitting
+        # a log explaining why.
+        for job_uuid, reason in unsatisfiable_jobs.iteritems():
+            try:
+                self._client.logs().create(body={
+                    'object_uuid': job_uuid,
+                    'event_type': 'stderr',
+                    'properties': {'text': reason},
+                }).execute()
+                # Cancel the job depending on its type
+                if arvados.util.container_uuid_pattern.match(job_uuid):
+                    subprocess.check_call(['scancel', '--name='+job_uuid])
+                elif arvados.util.job_uuid_pattern.match(job_uuid):
+                    self._client.jobs().cancel(uuid=job_uuid).execute()
+                else:
+                    raise Exception('Unknown job type')
+                self._logger.debug("Cancelled unsatisfiable job '%s'", job_uuid)
+            except Exception as error:
+                self._logger.error("Error trying to cancel job '%s': %s",
+                                   job_uuid,
+                                   error)
         self._logger.debug("Calculated wishlist: %s",
                            ', '.join(s.name for s in server_list) or "(empty)")
         return super(JobQueueMonitorActor, self)._got_response(server_list)
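
Because each cancellation is preceded by a 'stderr' log record carrying
the reason, users can find out why their job disappeared from the queue.
A sketch of reading that record back through the Python SDK, assuming a
configured Arvados client (the UUID is a placeholder):

    import arvados

    api = arvados.api('v1')
    job_uuid = 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'  # placeholder
    log_entry = api.logs().list(filters=[
        ['object_uuid', '=', job_uuid],
        ['event_type', '=', 'stderr'],
    ]).execute()['items'][0]
    # e.g. "Job's min_nodes constraint is greater than the configured ..."
    print(log_entry['properties']['text'])
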
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index feba3ce185caaf46517adc988fd63fafbc4985b1..c4565fdbaf7291ac2f46f48a2c4e629b9e63989a 100755
@@ -40,6 +40,7 @@ detail.addHandler(logging.StreamHandler(detail_content))
 fake_slurm = None
 compute_nodes = None
 all_jobs = None
+unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(), "scancel_called")
 
 def update_script(path, val):
     with open(path+"_", "w") as f:
@@ -54,6 +55,33 @@ def set_squeue(g):
                   "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
     return 0
 
+def set_queue_unsatisfiable(g):
+    global all_jobs, unsatisfiable_job_scancelled
+    # Simulate a job requesting a 99-core node.
+    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
+                  "\n".join("echo '99|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
+    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
+                  "\ntouch %s" % unsatisfiable_job_scancelled)
+    return 0
+
+def job_cancelled(g):
+    global unsatisfiable_job_scancelled
+    cancelled_job = g.group(1)
+    api = arvados.api('v1')
+    # Check that 'scancel' was called
+    if not os.path.isfile(unsatisfiable_job_scancelled):
+        return 1
+    # Check for the log entry
+    log_entry = api.logs().list(
+        filters=[
+            ['object_uuid', '=', cancelled_job],
+            ['event_type', '=', 'stderr'],
+        ]).execute()['items'][0]
+    if not re.match(
+            r"Requirements for a single node exceed the available cloud node size",
+            log_entry['properties']['text']):
+        return 1
+    return 0
 
 def node_paired(g):
     global compute_nodes
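
Judging from set_squeue() and the unit test further down, each line the
fake squeue prints is pipe-delimited as cpus|ram|disk|reason|job_name,
so the leading 99 above is what makes the simulated job unsatisfiable.
A small parsing sketch (parse_squeue_line() is illustrative only, and
the field names are an assumption based on those call sites):

    def parse_squeue_line(line):
        # Split one fake-squeue record into its assumed fields.
        cpus, ram, disk, reason, name = line.strip().split('|')
        return {'cpus': int(cpus), 'ram': int(ram), 'disk': int(disk),
                'reason': reason, 'name': name}

    rec = parse_squeue_line("99|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9")
    assert rec['cpus'] == 99
    assert rec['name'] == '34t0i-dz642-h42bg3hq4bdfpf9'
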
@@ -159,7 +187,7 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
 
     # Test main loop:
     # - Read line
-    # - Apply negative checks (thinks that are not supposed to happen)
+    # - Apply negative checks (things that are not supposed to happen)
     # - Check timeout
     # - Check if the next action should trigger
     # - If all actions are exhausted, terminate with test success
@@ -213,6 +241,7 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
         code = 1
 
     shutil.rmtree(fake_slurm)
+    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
 
     if code == 0:
         logger.info("%s passed", name)
@@ -228,6 +257,23 @@ def main():
     # Test lifecycle.
 
     tests = {
+        "test_unsatisfiable_jobs" : (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_queue_unsatisfiable),
+                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
+            ],
+            # Checks (things that shouldn't happen)
+            {
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
+                r".*Error trying to cancel job '(\S+)'": fail,
+            },
+            # Driver class
+            "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_single_node_azure": (
             [
                 (r".*Daemon started", set_squeue),
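
Each test entry pairs log-line regexes with callbacks: when nodemanager
output matches the next action pattern, run_test() invokes the callback
with the match object, which is how job_cancelled() receives the job
UUID via g.group(1). A simplified sketch of that dispatch step (the real
loop also handles timeouts and ends the test once all actions fire):

    import re

    def dispatch(line, actions, checks):
        # Negative checks: a match means something forbidden happened.
        for pattern, on_match in checks.items():
            g = re.match(pattern, line)
            if g is not None:
                return on_match(g)      # e.g. fail(g) marks the test failed
        # Actions fire in order; trigger the next one when it matches.
        if actions:
            pattern, callback = actions[0]
            g = re.match(pattern, line)
            if g is not None:
                actions.pop(0)
                return callback(g)      # e.g. job_cancelled(g); 0 == success
        return None
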
diff --git a/services/nodemanager/tests/test_jobqueue.py b/services/nodemanager/tests/test_jobqueue.py
index 8aa0835aca395af7594659e6c836e6245d189494..669b6247114c0f4843f5c2dd51eb9f9d4c00d4a9 100644
@@ -24,63 +24,69 @@ class ServerCalculatorTestCase(unittest.TestCase):
 
     def test_empty_queue_needs_no_servers(self):
         servcalc = self.make_calculator([1])
-        self.assertEqual([], servcalc.servers_for_queue([]))
+        self.assertEqual(([], {}), servcalc.servers_for_queue([]))
 
     def test_easy_server_count(self):
         servcalc = self.make_calculator([1])
-        servlist = self.calculate(servcalc, {'min_nodes': 3})
+        servlist, _ = self.calculate(servcalc, {'min_nodes': 3})
         self.assertEqual(3, len(servlist))
 
     def test_default_5pct_ram_value_decrease(self):
         servcalc = self.make_calculator([1])
-        servlist = self.calculate(servcalc, {'min_ram_mb_per_node': 128})
+        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 128})
         self.assertEqual(0, len(servlist))
-        servlist = self.calculate(servcalc, {'min_ram_mb_per_node': 121})
+        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 121})
         self.assertEqual(1, len(servlist))
 
     def test_custom_node_mem_scaling_factor(self):
         # Simulate a custom 'node_mem_scaling' config parameter by passing
         # the value to ServerCalculator
         servcalc = self.make_calculator([1], node_mem_scaling=0.5)
-        servlist = self.calculate(servcalc, {'min_ram_mb_per_node': 128})
+        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 128})
         self.assertEqual(0, len(servlist))
-        servlist = self.calculate(servcalc, {'min_ram_mb_per_node': 64})
+        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 64})
         self.assertEqual(1, len(servlist))
 
     def test_implicit_server_count(self):
         servcalc = self.make_calculator([1])
-        servlist = self.calculate(servcalc, {}, {'min_nodes': 3})
+        servlist, _ = self.calculate(servcalc, {}, {'min_nodes': 3})
         self.assertEqual(4, len(servlist))
 
     def test_bad_min_nodes_override(self):
         servcalc = self.make_calculator([1])
-        servlist = self.calculate(servcalc,
-                                  {'min_nodes': -2}, {'min_nodes': 'foo'})
+        servlist, _ = self.calculate(servcalc,
+                                     {'min_nodes': -2}, {'min_nodes': 'foo'})
         self.assertEqual(2, len(servlist))
 
-    def test_ignore_unsatisfiable_jobs(self):
+    def test_ignore_and_return_unsatisfiable_jobs(self):
         servcalc = self.make_calculator([1], max_nodes=9)
-        servlist = self.calculate(servcalc,
-                                  {'min_cores_per_node': 2},
-                                  {'min_ram_mb_per_node': 256},
-                                  {'min_nodes': 6},
-                                  {'min_nodes': 12},
-                                  {'min_scratch_mb_per_node': 300000})
+        servlist, u_jobs = self.calculate(servcalc,
+                                          {'min_cores_per_node': 2},
+                                          {'min_ram_mb_per_node': 256},
+                                          {'min_nodes': 6},
+                                          {'min_nodes': 12},
+                                          {'min_scratch_mb_per_node': 300000})
         self.assertEqual(6, len(servlist))
+        # Only unsatisfiable jobs are returned in u_jobs
+        self.assertIn('zzzzz-jjjjj-000000000000000', u_jobs.keys())
+        self.assertIn('zzzzz-jjjjj-000000000000001', u_jobs.keys())
+        self.assertNotIn('zzzzz-jjjjj-000000000000002', u_jobs.keys())
+        self.assertIn('zzzzz-jjjjj-000000000000003', u_jobs.keys())
+        self.assertIn('zzzzz-jjjjj-000000000000004', u_jobs.keys())
 
     def test_ignore_too_expensive_jobs(self):
         servcalc = self.make_calculator([1, 2], max_nodes=12, max_price=6)
-        servlist = self.calculate(servcalc,
-                                  {'min_cores_per_node': 1, 'min_nodes': 6})
+        servlist, _ = self.calculate(servcalc,
+                                     {'min_cores_per_node': 1, 'min_nodes': 6})
         self.assertEqual(6, len(servlist))
 
-        servlist = self.calculate(servcalc,
-                                  {'min_cores_per_node': 2, 'min_nodes': 6})
+        servlist, _ = self.calculate(servcalc,
+                                     {'min_cores_per_node': 2, 'min_nodes': 6})
         self.assertEqual(0, len(servlist))
 
     def test_job_requesting_max_nodes_accepted(self):
         servcalc = self.make_calculator([1], max_nodes=4)
-        servlist = self.calculate(servcalc, {'min_nodes': 4})
+        servlist, _ = self.calculate(servcalc, {'min_nodes': 4})
         self.assertEqual(4, len(servlist))
 
     def test_cheapest_size(self):
@@ -89,37 +95,37 @@ class ServerCalculatorTestCase(unittest.TestCase):
 
     def test_next_biggest(self):
         servcalc = self.make_calculator([1, 2, 4, 8])
-        servlist = self.calculate(servcalc,
-                                  {'min_cores_per_node': 3},
-                                  {'min_cores_per_node': 6})
+        servlist, _ = self.calculate(servcalc,
+                                     {'min_cores_per_node': 3},
+                                     {'min_cores_per_node': 6})
         self.assertEqual([servcalc.cloud_sizes[2].id,
                           servcalc.cloud_sizes[3].id],
                          [s.id for s in servlist])
 
     def test_multiple_sizes(self):
         servcalc = self.make_calculator([1, 2])
-        servlist = self.calculate(servcalc,
-                                  {'min_cores_per_node': 2},
-                                  {'min_cores_per_node': 1},
-                                  {'min_cores_per_node': 1})
+        servlist, _ = self.calculate(servcalc,
+                                     {'min_cores_per_node': 2},
+                                     {'min_cores_per_node': 1},
+                                     {'min_cores_per_node': 1})
         self.assertEqual([servcalc.cloud_sizes[1].id,
                           servcalc.cloud_sizes[0].id,
                           servcalc.cloud_sizes[0].id],
                          [s.id for s in servlist])
 
-        servlist = self.calculate(servcalc,
-                                  {'min_cores_per_node': 1},
-                                  {'min_cores_per_node': 2},
-                                  {'min_cores_per_node': 1})
+        servlist, _ = self.calculate(servcalc,
+                                     {'min_cores_per_node': 1},
+                                     {'min_cores_per_node': 2},
+                                     {'min_cores_per_node': 1})
         self.assertEqual([servcalc.cloud_sizes[0].id,
                           servcalc.cloud_sizes[1].id,
                           servcalc.cloud_sizes[0].id],
                          [s.id for s in servlist])
 
-        servlist = self.calculate(servcalc,
-                                  {'min_cores_per_node': 1},
-                                  {'min_cores_per_node': 1},
-                                  {'min_cores_per_node': 2})
+        servlist, _ = self.calculate(servcalc,
+                                     {'min_cores_per_node': 1},
+                                     {'min_cores_per_node': 1},
+                                     {'min_cores_per_node': 2})
         self.assertEqual([servcalc.cloud_sizes[0].id,
                           servcalc.cloud_sizes[0].id,
                           servcalc.cloud_sizes[1].id],
@@ -131,16 +137,37 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
                                    unittest.TestCase):
     TEST_CLASS = jobqueue.JobQueueMonitorActor
 
+
     class MockCalculator(object):
         @staticmethod
         def servers_for_queue(queue):
-            return [testutil.MockSize(n) for n in queue]
+            return ([testutil.MockSize(n) for n in queue], {})
+
+
+    class MockCalculatorUnsatisfiableJobs(object):
+        @staticmethod
+        def servers_for_queue(queue):
+            return ([], {k["uuid"]: "Unsatisfiable job mock" for k in queue})
 
 
     def build_monitor(self, side_effect, *args, **kwargs):
         super(JobQueueMonitorActorTestCase, self).build_monitor(*args, **kwargs)
         self.client.jobs().queue().execute.side_effect = side_effect
 
+    @mock.patch("subprocess.check_call")
+    @mock.patch("subprocess.check_output")
+    def test_unsatisfiable_jobs(self, mock_squeue, mock_scancel):
+        job_uuid = 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'
+        container_uuid = 'yyyyy-dz642-yyyyyyyyyyyyyyy'
+        mock_squeue.return_value = "1|1024|0|Resources|" + container_uuid + "\n"
+
+        self.build_monitor([{'items': [{'uuid': job_uuid}]}],
+                           self.MockCalculatorUnsatisfiableJobs(), True, True)
+        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
+        self.stop_proxy(self.monitor)
+        self.client.jobs().cancel.assert_called_with(uuid=job_uuid)
+        mock_scancel.assert_called_with(['scancel', '--name='+container_uuid])
+
     @mock.patch("subprocess.check_output")
     def test_subscribers_get_server_lists(self, mock_squeue):
         mock_squeue.return_value = ""