From ea47e67af0e7528f0bcb23f3b34019b308eaa68a Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Thu, 1 Jun 2017 17:37:09 -0400
Subject: [PATCH] 10312: Add the ability to substitute a fake libcloud driver
 while still running the full node manager, for integration testing.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
 .../computenode/dispatch/__init__.py        |   1 +
 services/nodemanager/arvnodeman/config.py   |   9 +-
 .../nodemanager/arvnodeman/fake_driver.py   |  47 +++++
 services/nodemanager/arvnodeman/jobqueue.py |  23 ++-
 services/nodemanager/arvnodeman/nodelist.py |  23 ++-
 services/nodemanager/fake_slurm/sinfo       |   2 +
 services/nodemanager/fake_slurm/squeue      |   2 +
 services/nodemanager/tests/fake.azure.cfg   | 187 ++++++++++++++++++
 8 files changed, 273 insertions(+), 21 deletions(-)
 create mode 100644 services/nodemanager/arvnodeman/fake_driver.py
 create mode 100755 services/nodemanager/fake_slurm/sinfo
 create mode 100755 services/nodemanager/fake_slurm/squeue
 create mode 100644 services/nodemanager/tests/fake.azure.cfg

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index ec8b1d18de..63dac3f0ed 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -5,6 +5,7 @@ from __future__ import absolute_import, print_function
 import functools
 import logging
 import time
+import re
 
 import libcloud.common.types as cloud_types
 import pykka
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index 30e8995baa..ac0d0bd1e4 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -103,12 +103,19 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
     def new_cloud_client(self):
         module = importlib.import_module('arvnodeman.computenode.driver.' +
                                          self.get('Cloud', 'provider'))
+        driver_class = module.ComputeNodeDriver.DEFAULT_DRIVER
+        if self.has_option('Cloud', 'driver_class'):
+            d = self.get('Cloud', 'driver_class').split('.')
+            mod = '.'.join(d[:-1])
+            cls = d[-1]
+            driver_class = importlib.import_module(mod).__dict__[cls]
         auth_kwargs = self.get_section('Cloud Credentials')
         if 'timeout' in auth_kwargs:
             auth_kwargs['timeout'] = int(auth_kwargs['timeout'])
         return module.ComputeNodeDriver(auth_kwargs,
                                         self.get_section('Cloud List'),
-                                        self.get_section('Cloud Create'))
+                                        self.get_section('Cloud Create'),
+                                        driver_class=driver_class)
 
     def node_sizes(self, all_sizes):
         """Finds all acceptable NodeSizes for our installation.
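A note on the driver_class mechanism introduced in config.py above: the config
value is a dotted Python path whose last component names the class and whose
prefix names the module. A minimal standalone sketch of that resolution step
(the function name and the commented-out call are illustrative only, not part
of the patch):

    import importlib

    def resolve_class(dotted_path):
        # "arvnodeman.fake_driver.FakeDriver" -> module "arvnodeman.fake_driver"
        # plus class name "FakeDriver".
        mod_name, _, cls_name = dotted_path.rpartition('.')
        return getattr(importlib.import_module(mod_name), cls_name)

    # e.g. driver_class = resolve_class(config.get('Cloud', 'driver_class'))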
diff --git a/services/nodemanager/arvnodeman/fake_driver.py b/services/nodemanager/arvnodeman/fake_driver.py
new file mode 100644
index 0000000000..89a3dbb6b7
--- /dev/null
+++ b/services/nodemanager/arvnodeman/fake_driver.py
@@ -0,0 +1,47 @@
+import re
+import urllib
+import ssl
+
+from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState
+
+all_nodes = []
+
+class FakeDriver(NodeDriver):
+    def __init__(self, *args, **kwargs):
+        self.name = "FakeDriver"
+
+    def list_sizes(self, **kwargs):
+        return [NodeSize("Standard_D3", "Standard_D3", 3500, 200, 0, 0, self),
+                NodeSize("Standard_D4", "Standard_D4", 7000, 400, 0, 0, self)]
+
+    def list_nodes(self, **kwargs):
+        return all_nodes
+
+    def create_node(self, name=None,
+                    size=None,
+                    image=None,
+                    auth=None,
+                    ex_storage_account=None,
+                    ex_customdata=None,
+                    ex_resource_group=None,
+                    ex_user_name=None,
+                    ex_tags=None,
+                    ex_network=None):
+        all_nodes.append(Node(name, name, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags}))
+        ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0] + "&instance_id=" + name
+        print(ping_url)
+        ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+        ctx.verify_mode = ssl.CERT_NONE
+        f = urllib.urlopen(ping_url, "", context=ctx)
+        print(f.read())
+        f.close()
+        return all_nodes[-1]
+
+    def destroy_node(self, cloud_node):
+        return None
+
+    def get_image(self, img):
+        pass
+
+    def ex_create_tags(self, cloud_node, tags):
+        pass
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 0340918e73..c0de691b1c 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -136,16 +136,19 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
         squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j"])
         queuelist = []
         for out in squeue_out.splitlines():
-            cpu, ram, disk, reason, jobname = out.split("|", 4)
-            if ("ReqNodeNotAvail" in reason) or ("Resources" in reason):
-                queuelist.append({
-                    "uuid": jobname,
-                    "runtime_constraints": {
-                        "min_cores_per_node": cpu,
-                        "min_ram_mb_per_node": self.coerce_to_mb(ram),
-                        "min_scratch_mb_per_node": self.coerce_to_mb(disk)
-                    }
-                })
+            try:
+                cpu, ram, disk, reason, jobname = out.split("|", 4)
+                if ("ReqNodeNotAvail" in reason) or ("Resources" in reason):
+                    queuelist.append({
+                        "uuid": jobname,
+                        "runtime_constraints": {
+                            "min_cores_per_node": cpu,
+                            "min_ram_mb_per_node": self.coerce_to_mb(ram),
+                            "min_scratch_mb_per_node": self.coerce_to_mb(disk)
+                        }
+                    })
+            except ValueError:
+                pass
 
         queuelist.extend(self._client.jobs().queue().execute()['items'])
 
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index 6bf1a8b4de..7bc3a5ebd2 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -29,16 +29,19 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
         sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n %t"])
         nodestates = {}
         for out in sinfo_out.splitlines():
-            nodename, state = out.split(" ", 2)
-            if state in ('alloc', 'alloc*',
-                         'comp', 'comp*',
-                         'mix', 'mix*',
-                         'drng', 'drng*'):
-                nodestates[nodename] = 'busy'
-            elif state == 'idle':
-                nodestates[nodename] = 'idle'
-            else:
-                nodestates[nodename] = 'down'
+            try:
+                nodename, state = out.split(" ", 2)
+                if state in ('alloc', 'alloc*',
+                             'comp', 'comp*',
+                             'mix', 'mix*',
+                             'drng', 'drng*'):
+                    nodestates[nodename] = 'busy'
+                elif state == 'idle':
+                    nodestates[nodename] = 'idle'
+                else:
+                    nodestates[nodename] = 'down'
+            except ValueError:
+                pass
 
         for n in nodelist:
             if n["slot_number"] and n["hostname"] and n["hostname"] in nodestates:
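Both the jobqueue.py and nodelist.py hunks above wrap the per-line parsing in
try/except ValueError so that a single malformed line of squeue or sinfo
output no longer aborts the whole poll. A hedged sketch of the squeue case in
isolation (parse_squeue_line is an illustrative name; the sample line matches
the fake_slurm/squeue stub added below):

    def parse_squeue_line(line):
        # squeue --format=%c|%m|%d|%r|%j yields cpu|ram|disk|reason|jobname;
        # a short or malformed line raises ValueError, which the caller skips.
        # (The real code also converts ram/disk through coerce_to_mb.)
        cpu, ram, disk, reason, jobname = line.split("|", 4)
        if ("ReqNodeNotAvail" in reason) or ("Resources" in reason):
            return {"uuid": jobname,
                    "runtime_constraints": {"min_cores_per_node": cpu,
                                            "min_ram_mb_per_node": ram,
                                            "min_scratch_mb_per_node": disk}}
        return None

    print(parse_squeue_line("1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9"))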
diff --git a/services/nodemanager/fake_slurm/sinfo b/services/nodemanager/fake_slurm/sinfo
new file mode 100755
index 0000000000..e57d0d3a70
--- /dev/null
+++ b/services/nodemanager/fake_slurm/sinfo
@@ -0,0 +1,2 @@
+#!/bin/sh
+echo
\ No newline at end of file
diff --git a/services/nodemanager/fake_slurm/squeue b/services/nodemanager/fake_slurm/squeue
new file mode 100755
index 0000000000..dd114a00b9
--- /dev/null
+++ b/services/nodemanager/fake_slurm/squeue
@@ -0,0 +1,2 @@
+#!/bin/sh
+echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'
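The sinfo and squeue stubs above are plain shell scripts; the idea is that an
integration test prepends fake_slurm/ to $PATH so they shadow the real SLURM
binaries when node manager shells out. A sketch of that mechanism (the paths
and the direct subprocess call are assumptions for illustration; this patch
does not include the test harness itself):

    import os
    import subprocess

    env = dict(os.environ)
    # Prepend the stub directory so the fake squeue/sinfo win the PATH lookup.
    env["PATH"] = "services/nodemanager/fake_slurm" + os.pathsep + env["PATH"]
    print(subprocess.check_output(["squeue", "--noheader"], env=env))
    # -> 1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9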
diff --git a/services/nodemanager/tests/fake.azure.cfg b/services/nodemanager/tests/fake.azure.cfg
new file mode 100644
index 0000000000..7f7629fe4d
--- /dev/null
+++ b/services/nodemanager/tests/fake.azure.cfg
@@ -0,0 +1,187 @@
+# Azure configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Manage]
+# The management server responds to http://addr:port/status.json with
+# a snapshot of internal state.
+
+# Management server listening address (default 127.0.0.1)
+#address = 0.0.0.0
+
+# Management server port number (default -1, server is disabled)
+#port = 8989
+
+[Daemon]
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes. For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+#dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times. If node manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type. However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on the rate of spending (in $/hr); Node Manager will not boot
+# additional nodes if the total price of the nodes already running meets or
+# exceeds this threshold. The default of 0 means no limit.
+max_total_price = 0
+
+# Poll Azure nodes and Arvados for new information every N seconds.
+poll_time = 15
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down. Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 1800
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's
+# a variable discrepancy between the advertised RAM value on cloud nodes and
+# the actual amount available.
+# If not set, this value defaults to 0.95.
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+#file = node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = DEBUG
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = 192.168.5.2:8000
+token = 2tnmn9ou33o3vk3bynzyzrc7aedhijo7ufa11j9kyv7509cygx
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = yes
+
+[Cloud]
+provider = azure
+driver_class = arvnodeman.fake_driver.FakeDriver
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down. These are windows in full minutes, separated by commas. Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on. For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# Azure bills by the minute, so it makes sense to aggressively shut down idle
+# nodes. Specify at least two windows. You can add as many as you need beyond
+# that.
+shutdown_windows = 5, 999999

+[Cloud Credentials]
+# Use "azure account list" with the azure CLI to get these values.
+tenant_id = 00000000-0000-0000-0000-000000000000
+subscription_id = 00000000-0000-0000-0000-000000000000
+
+# The following directions are based on
+# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
+#
+# azure config mode arm
+# azure ad app create --name "<app name>" --home-page "<app home page>" --identifier-uris "<app uri>" --password <password>
+# azure ad sp create "<application id>"
+# azure role assignment create --objectId "<object id>" -o Owner -c /subscriptions/{subscriptionId}/
+#
+# Use the <application id> for "key" and the <password> for "secret".
+#
+key = 00000000-0000-0000-0000-000000000000
+secret = PASSWORD
+timeout = 60
+region = East US
+
+[Cloud List]
+# The resource group in which the compute node virtual machines will be created
+# and listed.
+ex_resource_group = ArvadosResourceGroup
+
+[Cloud Create]
+# The image id, in the form "Publisher:Offer:SKU:Version"
+image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
+
+# Path to a local ssh key file that will be used to provision new nodes.
+ssh_key = /home/peter/.ssh/id_rsa.pub
+
+# The account name for the admin user that will be provisioned on new nodes.
+ex_user_name = arvadosuser
+
+# The Azure storage account that will be used to store the node OS disk images.
+ex_storage_account = arvadosstorage
+
+# The virtual network the VMs will be associated with.
+ex_network = ArvadosNetwork
+
+# Optional subnet of the virtual network.
+#ex_subnet = default
+
+# Node tags
+tag_arvados-class = dynamic-compute
+tag_cluster = zyxwv
+
+# The API server to ping
+ping_host = 192.168.5.2:8000
+
+# You can define any number of Size sections to list Azure sizes you're willing
+# to use. The Node Manager should boot the cheapest size(s) that can run jobs
+# in the queue. You must also provide the price per hour, as the Azure compute
+# driver currently does not report prices.
+#
+# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
+# for a list of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override Microsoft's provided
+# data fields by setting them here.
+
+[Size Standard_D3]
+cores = 4
+price = 0.56
+
+[Size Standard_D4]
+cores = 8
+price = 1.12
-- 
2.30.2
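For the shutdown_windows setting in fake.azure.cfg above: the comma-separated
minute counts alternate between "WILL NOT shut down" and "MAY shut down"
phases, counted from boot. A sketch of that bookkeeping (this assumes the
window series repeats once exhausted, per the comment's "and so on"; the
function name is illustrative, not node manager's actual API):

    import itertools

    def may_shut_down(uptime_minutes, windows=(5, 999999)):
        # Phases alternate: closed for windows[0] minutes, open for
        # windows[1], closed for windows[2], and so on.
        open_now = False
        boundary = 0
        for length in itertools.cycle(windows):
            boundary += length
            if uptime_minutes < boundary:
                return open_now
            open_now = not open_now

    print(may_shut_down(3))   # False: inside the first 5-minute closed window
    print(may_shut_down(42))  # True: inside the long "may shut down" window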