# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0

# EC2 configuration for Arvados Node Manager.
# All times are in seconds unless specified otherwise.

[Manage]
# The management server responds to http://addr:port/status.json with
# a snapshot of internal state.

# Management server listening address (default 127.0.0.1)
#address = 0.0.0.0

# Management server port number (default -1, which disables the server)
#port = 8989
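#
# For example, if the server were enabled with port = 8989 as above, the
# status snapshot could be fetched with something like:
#   curl http://127.0.0.1:8989/status.json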

[Daemon]
# The dispatcher can customize the start and stop procedure for
# cloud nodes.  For example, the SLURM dispatcher drains nodes
# through SLURM before shutting them down.
#dispatcher = slurm

# Node Manager will ensure that there are at least this many nodes running at
# all times.  If Node Manager needs to start new idle nodes for the purpose of
# satisfying min_nodes, it will use the cheapest node type.  However, depending
# on usage patterns, it may also satisfy min_nodes by keeping alive some
# more-expensive nodes.
min_nodes = 0

# Node Manager will not start any compute nodes when at least this
# many are running.
max_nodes = 8

# Upper limit on the rate of spending (in $/hr).  Node Manager will not boot
# additional nodes if the total price of the nodes already running meets or
# exceeds this threshold.  The default of 0 means no limit.
max_total_price = 0
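
# As an illustration of the rule above (using a hypothetical limit of 1.00 and
# the m4.large price listed at the bottom of this file): with seven m4.large
# nodes running ($0.882/hr, under the limit) another node may still be booted,
# but once eight are running ($1.008/hr, at or over the limit) no additional
# nodes will be started.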

# Poll EC2 nodes and Arvados for new information every N seconds.
poll_time = 60

# Polls have exponential backoff when services fail to respond.
# This is the longest time to wait between polls.
max_poll_time = 300

# If Node Manager can't successfully poll a service for this long,
# it will never start or stop compute nodes, on the assumption that its
# information is too outdated.
poll_stale_after = 600

# If Node Manager boots a cloud node, and it does not pair with an Arvados
# node before this long, assume that there was a cloud bootstrap failure and
# shut it down.  Note that normal shutdown windows apply (see the Cloud
# section), so this should be shorter than the first shutdown window value.
boot_fail_after = 1800

# "Node stale time" affects two related behaviors.
# 1. If a compute node has been running for at least this long, but it
# isn't paired with an Arvados node, do not shut it down, but leave it alone.
# This prevents the node manager from shutting down a node that might
# actually be doing work, but is having temporary trouble contacting the
# API server.
# 2. When the Node Manager starts a new compute node, it will try to reuse
# an Arvados node that hasn't been updated for this long.
node_stale_after = 14400
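
# (For reference, 14400 seconds is 4 hours.)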

# Scaling factor to be applied to nodes' available RAM size. Usually there's a
# variable discrepancy between the advertised RAM value on cloud nodes and the
# actual amount available.
# If not set, this value defaults to 0.95.
node_mem_scaling = 0.95

# File path for Certificate Authorities
certs_file = /etc/ssl/certs/ca-certificates.crt

[Logging]
# Log file path
file = /var/log/arvados/node-manager.log

# Log level for most Node Manager messages.
# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
# WARNING lets you know when polling a service fails.
# INFO additionally lets you know when a compute node is started or stopped.
level = INFO

# You can also set different log levels for specific libraries.
# Pykka is the Node Manager's actor library.
# Setting this to DEBUG will display tracebacks for uncaught
# exceptions in the actors, but it's also very chatty.
pykka = WARNING

# Setting apiclient to INFO will log the URL of every Arvados API request.
apiclient = WARNING

[Arvados]
host = zyxwv.arvadosapi.com
token = ARVADOS_TOKEN
timeout = 15
jobs_queue = yes   # Get work requests from the Arvados jobs queue (jobs API)
slurm_queue = yes  # Get work requests from squeue (containers API)

# Accept an untrusted SSL certificate from the API server?
insecure = no

[Cloud]
provider = ec2

# It's usually most cost-effective to shut down compute nodes during narrow
# windows of time.  For example, EC2 bills each node by the hour, so the best
# time to shut down a node is right before a new hour of uptime starts.
# Shutdown windows define these periods of time.  These are windows in
# full minutes, separated by commas.  Counting from the time the node is
# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
# For example, "54, 5, 1" means the node may shut down from the 54th to the
# 59th minute of each hour of uptime.
# Specify at least two windows.  You can add as many as you need beyond that.
shutdown_windows = 54, 5, 1
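
# As an illustration of the example above: counting from boot, the node will
# not shut down during minutes 0-54, may shut down during minutes 54-59, and
# will not shut down during minute 59-60; since the windows total 60 minutes,
# the same pattern applies to each subsequent hour of uptime.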

[Cloud Credentials]
key = KEY
secret = SECRET_KEY
region = us-east-1
timeout = 60

[Cloud List]
# This section defines filters that find compute nodes.
# Tags that you specify here will automatically be added to nodes you create.
# Replace colons in Amazon filters with underscores
# (e.g., write "tag:mytag" as "tag_mytag").
instance-state-name = running
tag_arvados-class = dynamic-compute
tag_cluster = zyxwv

[Cloud Create]
# New compute nodes will send pings to Arvados at this host.
# You may specify a port, and use brackets to disambiguate IPv6 addresses.
ping_host = hostname:port
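
# For example, an IPv6 address with a port would be written with brackets,
# e.g. ping_host = [2001:db8::10]:12321 (the address and port here are just
# placeholders).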

# Give the name of an SSH key on AWS...
ex_keyname = string

# ... or a file path for an SSH key that can log in to the compute node.
# (One or the other, not both.)
# ssh_key = path

# The EC2 IDs of the image and subnet that compute nodes should use.
image_id = idstring
subnet_id = idstring

# Comma-separated EC2 IDs for the security group(s) assigned to each
# compute node.
security_groups = idstring1, idstring2


# You can define any number of Size sections to list EC2 sizes you're
# willing to use.  The Node Manager should boot the cheapest size(s) that
# can run jobs in the queue.
#
# Each size section MUST define the number of cores available in this size
# class (since libcloud does not provide any consistent API for exposing this
# setting).
# You may also want to define the amount of scratch space (expressed
# in GB) for Crunch jobs.  You can also override Amazon's provided
# data fields (such as price per hour) by setting them here.

[Size m4.large]
cores = 2
price = 0.126
scratch = 100

[Size m4.xlarge]
cores = 4
price = 0.252
scratch = 100