From 2dd214985e9727bf0b5fbf11b0e39c3e7d3cc5c4 Mon Sep 17 00:00:00 2001 From: Lucas Di Pentima Date: Fri, 16 Mar 2018 17:32:18 -0300 Subject: [PATCH] 13166: Order slurm queue by priority Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima --- services/nodemanager/arvnodeman/jobqueue.py | 11 +++++++---- services/nodemanager/tests/integration_test.py | 4 ++-- services/nodemanager/tests/test_jobqueue.py | 12 ++++++------ 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py index 20849c917a..8f3d7b97a8 100644 --- a/services/nodemanager/arvnodeman/jobqueue.py +++ b/services/nodemanager/arvnodeman/jobqueue.py @@ -154,10 +154,10 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor): queuelist = [] if self.slurm_queue: # cpus, memory, tempory disk space, reason, job name, feature constraints - squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f"]) + squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f|%Q"]) for out in squeue_out.splitlines(): try: - cpu, ram, disk, reason, jobname, features = out.split("|", 5) + cpu, ram, disk, reason, jobname, features, priority = out.split("|", 6) except ValueError: self._logger.warning("ignored malformed line in squeue output: %r", out) continue @@ -177,7 +177,8 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor): "uuid": jobname, "runtime_constraints": { "instance_type": instance_type, - } + }, + "priority": int(priority) }) break else: @@ -189,8 +190,10 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor): "min_cores_per_node": cpu, "min_ram_mb_per_node": self.coerce_to_mb(ram), "min_scratch_mb_per_node": self.coerce_to_mb(disk) - } + }, + "priority": int(priority) }) + queuelist = sorted(queuelist, key=lambda x: x.get('priority', 1), reverse=True) if self.jobs_queue: queuelist.extend(self._client.jobs().queue().execute()['items']) diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py index 7b8ba391c9..f188f03140 100755 --- a/services/nodemanager/tests/integration_test.py +++ b/services/nodemanager/tests/integration_test.py @@ -58,14 +58,14 @@ def update_script(path, val): def set_squeue(g): global all_jobs update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" + - "\n".join("echo '1|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items())) + "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items())) return 0 def set_queue_unsatisfiable(g): global all_jobs, unsatisfiable_job_scancelled # Simulate a job requesting a 99 core node. update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" + - "\n".join("echo '99|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items())) + "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items())) update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" + "\ntouch %s" % unsatisfiable_job_scancelled) return 0 diff --git a/services/nodemanager/tests/test_jobqueue.py b/services/nodemanager/tests/test_jobqueue.py index 52232453bd..8c10f1b426 100644 --- a/services/nodemanager/tests/test_jobqueue.py +++ b/services/nodemanager/tests/test_jobqueue.py @@ -159,7 +159,7 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin, def test_unsatisfiable_jobs(self, mock_squeue, mock_scancel): job_uuid = 'zzzzz-8i9sb-zzzzzzzzzzzzzzz' container_uuid = 'yyyyy-dz642-yyyyyyyyyyyyyyy' - mock_squeue.return_value = "1|1024|0|(Resources)|" + container_uuid + "|\n" + mock_squeue.return_value = "1|1024|0|(Resources)|" + container_uuid + "||1234567890\n" self.build_monitor([{'items': [{'uuid': job_uuid}]}], self.MockCalculatorUnsatisfiableJobs(), True, True) @@ -181,8 +181,8 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin, @mock.patch("subprocess.check_output") def test_squeue_server_list(self, mock_squeue): - mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null) -2|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null) + mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890 +2|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)|1234567890 """ super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator( @@ -195,8 +195,8 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin, @mock.patch("subprocess.check_output") def test_squeue_server_list_suffix(self, mock_squeue): - mock_squeue.return_value = """1|1024M|0|(ReqNodeNotAvail, UnavailableNodes:compute123)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null) -1|2G|0|(ReqNodeNotAvail)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null) + mock_squeue.return_value = """1|1024M|0|(ReqNodeNotAvail, UnavailableNodes:compute123)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890 +1|2G|0|(ReqNodeNotAvail)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)|1234567890 """ super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator( @@ -209,7 +209,7 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin, @mock.patch("subprocess.check_output") def test_squeue_server_list_instancetype_constraint(self, mock_squeue): - mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|instancetype=z2.test\n""" + mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|instancetype=z2.test|1234567890\n""" super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator( [(testutil.MockSize(n), {'cores': n, 'ram': n*1024, 'scratch': n}) for n in range(1, 3)]), True, True) -- 2.30.2