Merge branch '11097-reuse-impure'
[arvados.git] / services / nodemanager / arvnodeman / jobqueue.py
1 #!/usr/bin/env python
2
3 from __future__ import absolute_import, print_function
4
5 import logging
6 import subprocess
7
8 from . import clientactor
9 from .config import ARVADOS_ERRORS
10
class ServerCalculator(object):
    """Generate cloud server wishlists from an Arvados job queue.

    Instantiate this class with a list of cloud node sizes you're willing to
    use, plus keyword overrides from the configuration.  Then you can pass
    job queues to servers_for_queue.  It will return a list of node sizes
    that would best satisfy the jobs, choosing the cheapest size that
    satisfies each job, and ignoring jobs that can't be satisfied.
    """

    class CloudSizeWrapper(object):
        """Cloud size object plus a core count and configuration overrides.

        Copies 'id', 'name', 'ram', 'disk', 'bandwidth', 'price' and 'extra'
        from the wrapped size, takes `cores` from the keyword arguments, and
        exposes `scratch` as an alias for the wrapped size's disk value.
        The unmodified size object stays available as `real`.
        """

        def __init__(self, real_size, **kwargs):
            """Wrap real_size, applying keyword overrides.

            Arguments:
            real_size -- the cloud size object to wrap.
            kwargs -- must contain 'cores'; every other key must name an
              attribute already present on the wrapper, and replaces it.

            Raises KeyError when 'cores' is missing, and ValueError for an
            unrecognized override name or when the final price is None.
            """
            self.real = real_size
            for name in ['id', 'name', 'ram', 'disk', 'bandwidth', 'price',
                         'extra']:
                setattr(self, name, getattr(self.real, name))
            self.cores = kwargs.pop('cores')
            # scratch is fixed from the wrapped size's disk before overrides
            # are applied; an explicit 'scratch' override still replaces it.
            self.scratch = self.disk
            # items() (not the Python 2-only iteritems()) keeps this working
            # on both Python 2 and Python 3.
            for name, override in kwargs.items():
                if not hasattr(self, name):
                    raise ValueError("unrecognized size field '%s'" % (name,))
                setattr(self, name, override)

            # Sizes are sorted by price later, so a price is mandatory.
            if self.price is None:
                raise ValueError("Required field 'price' is None")

        def meets_constraints(self, **kwargs):
            """Return True if this size satisfies every named minimum.

            Each keyword names an attribute of this wrapper and gives the
            minimum value wanted.  An attribute equal to 0 is accepted for
            any request (presumably 0 means the cloud did not report a
            value -- confirm against the cloud driver).
            """
            for name, want_value in kwargs.items():
                have_value = getattr(self, name)
                if (have_value != 0) and (have_value < want_value):
                    return False
            return True

    def __init__(self, server_list, max_nodes=None, max_price=None):
        """Build the calculator.

        Arguments:
        server_list -- iterable of (size, overrides) pairs; each pair is
          turned into a CloudSizeWrapper(size, **overrides).
        max_nodes -- largest node count accepted for one job; a false value
          (None, 0) means no limit.
        max_price -- largest total price accepted for one job; a false
          value (None, 0) means no limit.
        """
        self.cloud_sizes = [self.CloudSizeWrapper(s, **kws)
                            for s, kws in server_list]
        # Cheapest first, so the first matching size is the cheapest match.
        self.cloud_sizes.sort(key=lambda s: s.price)
        self.max_nodes = max_nodes or float('inf')
        self.max_price = max_price or float('inf')
        self.logger = logging.getLogger('arvnodeman.jobqueue')
        # UUIDs of unsatisfiable jobs already reported, to avoid repeating
        # the same log message on every poll.
        self.logged_jobs = set()

    @staticmethod
    def coerce_int(x, fallback):
        """Return int(x), or fallback if x cannot be converted."""
        try:
            return int(x)
        except (TypeError, ValueError):
            return fallback

    def cloud_size_for_constraints(self, constraints):
        """Return the cheapest size meeting the constraints, or None.

        Reads 'min_cores_per_node', 'min_ram_mb_per_node' and
        'min_scratch_mb_per_node' from the constraints mapping; missing or
        non-integer values count as 0.
        """
        want_value = lambda key: self.coerce_int(constraints.get(key), 0)
        wants = {'cores': want_value('min_cores_per_node'),
                 'ram': want_value('min_ram_mb_per_node'),
                 'scratch': want_value('min_scratch_mb_per_node')}
        for size in self.cloud_sizes:
            if size.meets_constraints(**wants):
                return size
        return None

    def servers_for_queue(self, queue):
        """Return the list of real cloud sizes wanted for a job queue.

        Each job is a mapping with 'uuid' and 'runtime_constraints'.  A job
        contributes max(1, min_nodes) copies of the cheapest matching size,
        provided that count does not exceed max_nodes and the count times
        the size's price does not exceed max_price.  Jobs no size can
        satisfy are logged once at debug level and contribute nothing.
        """
        servers = []
        seen_jobs = set()
        for job in queue:
            seen_jobs.add(job['uuid'])
            constraints = job['runtime_constraints']
            want_count = max(1, self.coerce_int(constraints.get('min_nodes'), 1))
            cloud_size = self.cloud_size_for_constraints(constraints)
            if cloud_size is None:
                if job['uuid'] not in self.logged_jobs:
                    self.logged_jobs.add(job['uuid'])
                    self.logger.debug("job %s not satisfiable", job['uuid'])
            elif ((want_count <= self.max_nodes) and
                  (want_count * cloud_size.price <= self.max_price)):
                servers.extend([cloud_size.real] * want_count)
        # Forget jobs that have left the queue so the set does not grow
        # without bound.
        self.logged_jobs.intersection_update(seen_jobs)
        return servers

    def cheapest_size(self):
        """Return the lowest-priced wrapped size.

        Raises IndexError if the calculator was built with no sizes.
        """
        return self.cloud_sizes[0]

    def find_size(self, sizeid):
        """Return the wrapped size whose id equals sizeid, or None."""
        for s in self.cloud_sizes:
            if s.id == sizeid:
                return s
        return None
96
class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
    """Actor to generate server wishlists from the job queue.

    This actor regularly polls Arvados' job queue, and uses the provided
    ServerCalculator to turn that into a list of requested node sizes.  That
    list is sent to subscribers on every poll.
    """

    CLIENT_ERRORS = ARVADOS_ERRORS

    def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
        """Set up the poll loop and remember the calculator.

        Arguments:
        client -- API client, passed on to the base class.
        timer_actor -- timer actor, passed on to the base class.
        server_calc -- object whose servers_for_queue(queue) turns a job
          queue into a list of wanted sizes.
        Remaining arguments are forwarded to the base class unchanged.
        """
        super(JobQueueMonitorActor, self).__init__(
            client, timer_actor, *args, **kwargs)
        self._calculator = server_calc

    def _send_request(self):
        """Return the combined list of pending jobs.

        Pending jobs reported by `squeue` whose reason is "Resources" or
        "ReqNodeNotAvail" come first, converted to job-like dicts with
        'uuid' and 'runtime_constraints'; the items of the Arvados job
        queue follow.  Raises whatever subprocess.check_output raises if
        squeue cannot be run or exits non-zero.
        """
        # cpus, memory, temporary disk space, reason, job name
        squeue_out = subprocess.check_output(
            ["squeue", "--state=PENDING", "--noheader",
             "--format=%c %m %d %r %j"])
        queuelist = []
        for out in squeue_out.splitlines():
            try:
                # Job name is last because it is the one field that may
                # itself contain spaces; maxsplit keeps it intact.
                cpu, ram, disk, reason, jobname = out.split(" ", 4)
            except ValueError:
                # A blank or truncated line must not abort the whole poll
                # (and with it the Arvados queue below); skip just that line.
                self._logger.warning(
                    "ignored malformed line in squeue output: %r", out)
                continue
            if reason in ("Resources", "ReqNodeNotAvail"):
                queuelist.append({
                    "uuid": jobname,
                    "runtime_constraints": {
                        "min_cores_per_node": cpu,
                        "min_ram_mb_per_node": ram,
                        "min_scratch_mb_per_node": disk
                    }
                })

        queuelist.extend(self._client.jobs().queue().execute()['items'])

        return queuelist

    def _got_response(self, queue):
        """Convert the queue to a wishlist, log it, and pass it on.

        The list of sizes computed by the calculator (not the raw queue)
        is what gets handed to the base class's _got_response.
        """
        server_list = self._calculator.servers_for_queue(queue)
        self._logger.debug("Calculated wishlist: %s",
                           ', '.join(s.name for s in server_list) or "(empty)")
        return super(JobQueueMonitorActor, self)._got_response(server_list)