# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0

# Azure configuration for Arvados Node Manager.
# All times are in seconds unless specified otherwise.

[Manage]
# The management server responds to http://addr:port/status.json with
# a snapshot of internal state.

# Management server listening address (default 127.0.0.1)
#address = 0.0.0.0

# Management server port number (default -1, server is disabled)
#port = 8989

[Daemon]
# The dispatcher can customize the start and stop procedure for
# cloud nodes.  For example, the SLURM dispatcher drains nodes
# through SLURM before shutting them down.
#dispatcher = slurm

# Node Manager will ensure that there are at least this many nodes running at
# all times.  If node manager needs to start new idle nodes for the purpose of
# satisfying min_nodes, it will use the cheapest node type.  However, depending
# on usage patterns, it may also satisfy min_nodes by keeping alive some
# more-expensive nodes
min_nodes = 0

# Node Manager will not start any compute nodes when at least this
# many are running.
max_nodes = 8

# Upper limit on rate of spending (in $/hr), will not boot additional nodes
# if total price of already running nodes meets or exceeds this threshold.
# default 0 means no limit.
max_total_price = 0

# Poll Azure nodes and Arvados for new information every N seconds.
poll_time = 0.5

# Polls have exponential backoff when services fail to respond.
# This is the longest time to wait between polls.
max_poll_time = 1

# If Node Manager can't succesfully poll a service for this long,
# it will never start or stop compute nodes, on the assumption that its
# information is too outdated.
poll_stale_after = 1

# If Node Manager boots a cloud node, and it does not pair with an Arvados
# node before this long, assume that there was a cloud bootstrap failure and
# shut it down.  Note that normal shutdown windows apply (see the Cloud
# section), so this should be shorter than the first shutdown window value.
boot_fail_after = 45

# "Node stale time" affects two related behaviors.
# 1. If a compute node has been running for at least this long, but it
# isn't paired with an Arvados node, do not shut it down, but leave it alone.
# This prevents the node manager from shutting down a node that might
# actually be doing work, but is having temporary trouble contacting the
# API server.
# 2. When the Node Manager starts a new compute node, it will try to reuse
# an Arvados node that hasn't been updated for this long.
node_stale_after = 14400

# Scaling factor to be applied to nodes' available RAM size. Usually there's a
# variable discrepancy between the advertised RAM value on cloud nodes and the
# actual amount available.
# If not set, this value will be set to 0.95
node_mem_scaling = 0.95

# File path for Certificate Authorities
certs_file = /etc/ssl/certs/ca-certificates.crt

[Logging]
# Log file path
#file = node-manager.log

# Log level for most Node Manager messages.
# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
# WARNING lets you know when polling a service fails.
# INFO additionally lets you know when a compute node is started or stopped.
level = DEBUG

# You can also set different log levels for specific libraries.
# Pykka is the Node Manager's actor library.
# Setting this to DEBUG will display tracebacks for uncaught
# exceptions in the actors, but it's also very chatty.
pykka = WARNING

# Setting apiclient to INFO will log the URL of every Arvados API request.
apiclient = WARNING

[Arvados]
host = {host}
token = {token}
timeout = 15
jobs_queue = no

# Accept an untrusted SSL certificate from the API server?
insecure = yes

[Cloud]
provider = ec2
driver_class = {driver_class}

# Shutdown windows define periods of time when a node may and may not be shut
# down.  These are windows in full minutes, separated by commas.  Counting from
# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
# and so on.  For example, "20, 999999" means the node may shut down between
# the 20th and 999999th minutes of uptime.
# Azure bills by the minute, so it makes sense to agressively shut down idle
# nodes.  Specify at least two windows.  You can add as many as you need beyond
# that.
shutdown_windows = 0.05, 999999

[Cloud Credentials]

key = 00000000-0000-0000-0000-000000000000
secret = PASSWORD
timeout = 60
region = East US

[Cloud List]

[Cloud Create]
# The image id
image = fake_image_id

# Path to a local ssh key file that will be used to provision new nodes.
ssh_key = {ssh_key}

# the API server to ping
ping_host = {host}

# You can define any number of Size sections to list Azure sizes you're willing
# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
# in the queue.  You must also provide price per hour as the Azure driver
# compute currently does not report prices.
#
# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
# for a list of known machine types that may be used as a Size parameter.
#
# Each size section MUST define the number of cores are available in this
# size class (since libcloud does not provide any consistent API for exposing
# this setting).
# You may also want to define the amount of scratch space (expressed
# in GB) for Crunch jobs.  You can also override Microsoft's provided
# data fields by setting them here.

[Size m4.xlarge]
cores = 4
price = 0.56
scratch = 250

[Size m4.2xlarge]
cores = 8
price = 1.12
scratch = 500