Merge branch '7478-invalid-size-not-defined'
[arvados.git] / services / nodemanager / arvnodeman / jobqueue.py
index 20849c917a92422b86d214b57e8115fdba4a7529..6c7a93466cae95e248be74e4322fcf8e72f47f00 100644 (file)
@@ -24,6 +24,26 @@ class ServerCalculator(object):
     that would best satisfy the jobs, choosing the cheapest size that
     satisfies each job, and ignoring jobs that can't be satisfied.
     """
+    class InvalidCloudSize(object):
+        """
+        Dummy CloudSizeWrapper-like class, to be used when a cloud node doesn't
+        have a recognizable arvados_node_size tag.
+        """
+        def __init__(self):
+            self.id = 'invalid'
+            self.name = 'invalid'
+            self.ram = 0
+            self.disk = 0
+            self.scratch = 0
+            self.cores = 0
+            self.bandwidth = 0
+            self.price = 9999999
+            self.preemptable = False
+            self.extra = {}
+
+        def meets_constraints(self, **kwargs):
+            return False
+
 
     class CloudSizeWrapper(object):
         def __init__(self, real_size, node_mem_scaling, **kwargs):
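The new InvalidCloudSize acts as a null-object stand-in for nodes whose arvados_node_size tag doesn't match any configured size. A minimal standalone sketch (not the arvnodeman code itself) of why that is convenient:

    # meets_constraints() always fails and the price is prohibitively high,
    # so an unrecognized size is never chosen when matching jobs to nodes.
    class InvalidSize(object):
        def __init__(self):
            self.name = 'invalid'
            self.cores = 0
            self.ram = 0
            self.price = 9999999

        def meets_constraints(self, **kwargs):
            return False

    wants = {'cores': 1, 'ram': 1024}
    candidates = [InvalidSize()]
    print([s.name for s in candidates if s.meets_constraints(**wants)])  # []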
@@ -38,7 +58,9 @@ class ServerCalculator(object):
                 self.disk = 0
             self.scratch = self.disk * 1000
             self.ram = int(self.ram * node_mem_scaling)
+            self.preemptable = False
             for name, override in kwargs.iteritems():
+                if name == 'instance_type': continue
                 if not hasattr(self, name):
                     raise ValueError("unrecognized size field '%s'" % (name,))
                 setattr(self, name, override)
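A small sketch of the override loop after this hunk, using a simplified stand-in class (SizeStub is hypothetical): preemptable now has a default, 'instance_type' is skipped instead of being rejected, and any other unknown field still raises ValueError.

    class SizeStub(object):
        def __init__(self, **kwargs):
            self.cores = 1
            self.ram = 1024
            self.preemptable = False             # default added by this commit
            for name, override in kwargs.items():
                if name == 'instance_type':      # config key, not a size field
                    continue
                if not hasattr(self, name):
                    raise ValueError("unrecognized size field '%s'" % (name,))
                setattr(self, name, override)

    s = SizeStub(preemptable=True, instance_type='m4.large')
    print(s.preemptable)   # True; instance_type is ignored rather than fatal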
@@ -80,10 +102,12 @@ class ServerCalculator(object):
         wants = {'cores': want_value('min_cores_per_node'),
                  'ram': want_value('min_ram_mb_per_node'),
                  'scratch': want_value('min_scratch_mb_per_node')}
+        # EC2 node sizes are identified by id. GCE sizes are identified by name.
         for size in self.cloud_sizes:
             if (size.meets_constraints(**wants) and
-                (specified_size is None or size.id == specified_size)):
-                    return size
+                (specified_size is None or
+                    size.id == specified_size or size.name == specified_size)):
+                        return size
         return None
 
     def servers_for_queue(self, queue):
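The comment and the extra size.name comparison let a configured size be spelled either as an EC2-style id or a GCE-style name. An illustrative, self-contained sketch (the size data below is invented):

    class FakeSize(object):
        def __init__(self, id, name):
            self.id, self.name = id, name
        def meets_constraints(self, **wants):
            return True

    sizes = [FakeSize('m4.large', 'm4.large'),     # EC2: id == name
             FakeSize('3001', 'n1-standard-1')]    # GCE: numeric id, readable name

    def pick(specified_size):
        for size in sizes:
            if (size.meets_constraints() and
                (specified_size is None or
                 size.id == specified_size or size.name == specified_size)):
                return size
        return None

    print(pick('n1-standard-1').id)   # '3001', matched via name
    print(pick('m4.large').name)      # 'm4.large', matched via id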
@@ -101,7 +125,7 @@ class ServerCalculator(object):
                     "Job's min_nodes constraint is greater than the configured "
                     "max_nodes (%d)" % self.max_nodes)
             elif (want_count*cloud_size.price <= self.max_price):
                     "Job's min_nodes constraint is greater than the configured "
                     "max_nodes (%d)" % self.max_nodes)
             elif (want_count*cloud_size.price <= self.max_price):
-                servers.extend([cloud_size.real] * want_count)
+                servers.extend([cloud_size] * want_count)
             else:
                 unsatisfiable_jobs[job['uuid']] = (
                     "Job's price (%d) is above system's max_price "
@@ -115,7 +139,7 @@ class ServerCalculator(object):
         for s in self.cloud_sizes:
             if s.id == sizeid:
                 return s
-        return None
+        return self.InvalidCloudSize()
 
 
 class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
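With find_size() returning InvalidCloudSize() instead of None, callers always get an object with the full size interface and no longer need None guards before reading attributes. A hedged sketch of the difference (describe() is a hypothetical caller):

    class Invalid(object):
        name = 'invalid'
        price = 9999999

    class Real(object):
        def __init__(self, id, name, price):
            self.id, self.name, self.price = id, name, price

    def find_size(sizes, sizeid):
        for s in sizes:
            if s.id == sizeid:
                return s
        return Invalid()

    def describe(size):
        # previously this would have needed an "if size is None" check first
        return "%s at price %s" % (size.name, size.price)

    print(describe(find_size([], 'no-such-size')))   # "invalid at price 9999999"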
@@ -153,11 +177,11 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
     def _send_request(self):
         queuelist = []
         if self.slurm_queue:
-            # cpus, memory, tempory disk space, reason, job name, feature constraints
-            squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f"])
+            # cpus, memory, temporary disk space, reason, job name, feature constraints, priority
+            squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f|%Q"])
             for out in squeue_out.splitlines():
                 try:
-                    cpu, ram, disk, reason, jobname, features = out.split("|", 5)
+                    cpu, ram, disk, reason, jobname, features, priority = out.split("|", 6)
                 except ValueError:
                     self._logger.warning("ignored malformed line in squeue output: %r", out)
                     continue
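The squeue format string gains a trailing %Q (priority) column, and the split gains one more field. A parsing sketch with an invented sample line (the feature-string format shown is only an assumption):

    sample = "2|1024M|0|Resources|zzzzz-8i9sb-xxxxxxxxxxxxxxx|instancetype=m4.large|1000"
    try:
        cpu, ram, disk, reason, jobname, features, priority = sample.split("|", 6)
    except ValueError:
        raise SystemExit("malformed line")
    print("%s priority=%d" % (jobname, int(priority)))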
@@ -177,7 +201,8 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
                         "uuid": jobname,
                         "runtime_constraints": {
                             "instance_type": instance_type,
                         "uuid": jobname,
                         "runtime_constraints": {
                             "instance_type": instance_type,
-                        }
+                        },
+                        "priority": int(priority)
                     })
                     break
                 else:
@@ -189,8 +214,10 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
                             "min_cores_per_node": cpu,
                             "min_ram_mb_per_node": self.coerce_to_mb(ram),
                             "min_scratch_mb_per_node": self.coerce_to_mb(disk)
                             "min_cores_per_node": cpu,
                             "min_ram_mb_per_node": self.coerce_to_mb(ram),
                             "min_scratch_mb_per_node": self.coerce_to_mb(disk)
-                        }
+                        },
+                        "priority": int(priority)
                     })
+            queuelist.sort(key=lambda x: x.get('priority', 1), reverse=True)
 
         if self.jobs_queue:
             queuelist.extend(self._client.jobs().queue().execute()['items'])
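Each slurm job dict now carries its numeric priority, and the queue is sorted so higher-priority jobs are considered first; entries without a priority fall back to 1. A minimal sketch of that sort:

    queuelist = [
        {'uuid': 'job-a', 'priority': 500},
        {'uuid': 'job-b'},                     # no priority -> treated as 1
        {'uuid': 'job-c', 'priority': 1000},
    ]
    queuelist.sort(key=lambda x: x.get('priority', 1), reverse=True)
    print([j['uuid'] for j in queuelist])      # ['job-c', 'job-a', 'job-b']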
@@ -221,5 +248,5 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
                                    job_uuid,
                                    error)
         self._logger.debug("Calculated wishlist: %s",
-                           ', '.join(s.name for s in server_list) or "(empty)")
+                           ', '.join("%s (preemptable: %s)" % (s.name, s.preemptable) for s in server_list) or "(empty)")
         return super(JobQueueMonitorActor, self)._got_response(server_list)