The arvados-docker-cleaner program removes least recently used docker images as needed to keep disk usage below a configured limit.
+{% include 'notebox_begin' %}
+This also removes all containers as soon as they exit, as if they were run with `docker run --rm`. If you need to debug or inspect containers after they stop, temporarily stop arvados-docker-cleaner or run it with `--remove-stopped-containers never`.
+{% include 'notebox_end' %}
+
On Debian-based systems, install runit:
<notextile>
export CRUNCH_JOB_DOCKER_BIN=<span class="userinput">docker.io</span>
fuser -TERM -k $CRUNCH_DISPATCH_LOCKFILE || true
-cd /var/www/arvados-api/services/api
+cd /var/www/arvados-api/current
exec $rvmexec bundle exec ./script/crunch-dispatch.rb 2>&1
</code></pre>
</notextile>
<notextile>
<pre><code>~$ <span class="userinput">keep-web -h</span>
Usage of keep-web:
- -address string
- Address to listen on: "host:port", or ":port" to listen on all interfaces. (default ":80")
- -anonymous-token value
- API token to try when none of the tokens provided in an HTTP request succeed in reading the desired collection. If this flag is used more than once, each token will be attempted in turn until one works. (default [])
+ -allow-anonymous
+ Serve public data to anonymous clients. Try the token supplied in the ARVADOS_API_TOKEN environment variable when none of the tokens provided in an HTTP request succeed in reading the desired collection. (default false)
-attachment-only-host string
Accept credentials, and add "Content-Disposition: attachment" response headers, for requests at this hostname:port. Prohibiting inline display makes it possible to serve untrusted and non-public content from a single origin, i.e., without wildcard DNS or SSL.
+ -listen string
+ Address to listen on: "host:port", or ":port" to listen on all interfaces. (default ":80")
-trust-all-content
Serve non-public content from a single origin. Dangerous: read docs before using!
</code></pre>
<notextile>
<pre><code>export ARVADOS_API_HOST=<span class="userinput">uuid_prefix</span>.your.domain
-exec sudo -u nobody keep-web -address=<span class="userinput">:9002</span> -anonymous-token=<span class="userinput">hoShoomoo2bai3Ju1xahg6aeng1siquuaZ1yae2gi2Uhaeng2r</span> 2>&1
+export ARVADOS_API_TOKEN="<span class="userinput">hoShoomoo2bai3Ju1xahg6aeng1siquuaZ1yae2gi2Uhaeng2r</span>"
+exec sudo -u nobody keep-web -listen=<span class="userinput">:9002</span> -allow-anonymous 2>&1
</code></pre>
</notextile>
-Omit the @-anonymous-token@ arguments if you do not want to serve public data.
+Omit the @-allow-anonymous@ argument if you do not want to serve public data.
Set @ARVADOS_API_HOST_INSECURE=1@ if your API server's SSL certificate is not signed by a recognized CA.
On the <strong>API server</strong>, use the following command to create the token:
<notextile>
-<pre><code>~/arvados/services/api/script$ <span class="userinput">RAILS_ENV=production bundle exec ./get_anonymous_user_token.rb</span>
+<pre><code>/var/www/arvados-api/current/script$ <span class="userinput">RAILS_ENV=production bundle exec ./get_anonymous_user_token.rb</span>
hoShoomoo2bai3Ju1xahg6aeng1siquuaZ1yae2gi2Uhaeng2r
</code></pre></notextile>
}
srun(["srun", "--nodelist=" . $node[0]],
["/bin/sh", "-ec",
- "a=`$docker_bin run --rm $try_user_arg $docker_hash id --user` && " .
+ "a=`$docker_bin run $try_user_arg $docker_hash id --user` && " .
" test \$a -ne 0"],
{fork => 1});
if ($? == 0) {
$command .= "&& exec arv-mount --by-id --allow-other $ENV{TASK_KEEPMOUNT} --exec ";
if ($docker_hash)
{
- my $cidfile = "$ENV{CRUNCH_TMP}/$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}.cid";
+ my $containername = "$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}";
+ my $cidfile = "$ENV{CRUNCH_TMP}/$containername.cid";
$command .= "crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
- $command .= "$docker_bin run --rm=true --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy ";
+ $command .= "$docker_bin run --name=$containername --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy ";
# We only set memory limits if Docker lets us limit both memory and swap.
# Memory limits alone have been supported longer, but subprocesses tend
# to get SIGKILL if they exceed that without any swap limit set.
"os"
"regexp"
"strings"
+ "time"
)
type StringMatcher func(string) bool
var MissingArvadosApiToken = errors.New("Missing required environment variable ARVADOS_API_TOKEN")
var ErrInvalidArgument = errors.New("Invalid argument")
+// A common failure mode is to reuse a keepalive connection that has been
+// terminated (in a way that we can't detect) for being idle too long.
+// POST and DELETE are not safe to retry automatically, so we minimize
+// such failures by always using a new or recently active socket.
+var MaxIdleConnectionDuration = 30 * time.Second
+
// Indicates an error that was returned by the API server.
type APIServerError struct {
// Address of server returning error, of the form "host:port".
// Discovery document
DiscoveryDoc Dict
+
+ lastClosedIdlesAt time.Time
}
// Create a new ArvadosClient, initialized with standard Arvados environment
return ac, MissingArvadosApiToken
}
+ ac.lastClosedIdlesAt = time.Now()
+
return ac, err
}
req.Header.Add("X-External-Client", "1")
}
+ // POST and DELETE are not safe to retry automatically, so we minimize
+ // such failures by always using a new or recently active socket.
+ if method == "POST" || method == "DELETE" {
+ if time.Since(c.lastClosedIdlesAt) > MaxIdleConnectionDuration {
+ c.lastClosedIdlesAt = time.Now()
+ c.Client.Transport.(*http.Transport).CloseIdleConnections()
+ }
+ }
+
// Make the request
var resp *http.Response
if resp, err = c.Client.Do(req); err != nil {
"net/http"
"os"
"testing"
+ "time"
)
// Gocheck boilerplate
func (s *ServerRequiredSuite) TestCreatePipelineTemplate(c *C) {
arv, err := MakeArvadosClient()
- getback := make(Dict)
- err = arv.Create("pipeline_templates",
- Dict{"pipeline_template": Dict{
- "name": "tmp",
- "components": Dict{
- "c1": map[string]string{"script": "script1"},
- "c2": map[string]string{"script": "script2"}}}},
- &getback)
- c.Assert(err, Equals, nil)
- c.Assert(getback["name"], Equals, "tmp")
- c.Assert(getback["components"].(map[string]interface{})["c2"].(map[string]interface{})["script"], Equals, "script2")
-
- uuid := getback["uuid"].(string)
-
- getback = make(Dict)
- err = arv.Get("pipeline_templates", uuid, nil, &getback)
- c.Assert(err, Equals, nil)
- c.Assert(getback["name"], Equals, "tmp")
- c.Assert(getback["components"].(map[string]interface{})["c1"].(map[string]interface{})["script"], Equals, "script1")
-
- getback = make(Dict)
- err = arv.Update("pipeline_templates", uuid,
- Dict{
- "pipeline_template": Dict{"name": "tmp2"}},
- &getback)
- c.Assert(err, Equals, nil)
- c.Assert(getback["name"], Equals, "tmp2")
-
- c.Assert(getback["uuid"].(string), Equals, uuid)
- getback = make(Dict)
- err = arv.Delete("pipeline_templates", uuid, nil, &getback)
- c.Assert(err, Equals, nil)
- c.Assert(getback["name"], Equals, "tmp2")
+ for _, idleConnections := range []bool{
+ false,
+ true,
+ } {
+ if idleConnections {
+ arv.lastClosedIdlesAt = time.Now().Add(-time.Minute)
+ } else {
+ arv.lastClosedIdlesAt = time.Now()
+ }
+
+ getback := make(Dict)
+ err = arv.Create("pipeline_templates",
+ Dict{"pipeline_template": Dict{
+ "name": "tmp",
+ "components": Dict{
+ "c1": map[string]string{"script": "script1"},
+ "c2": map[string]string{"script": "script2"}}}},
+ &getback)
+ c.Assert(err, Equals, nil)
+ c.Assert(getback["name"], Equals, "tmp")
+ c.Assert(getback["components"].(map[string]interface{})["c2"].(map[string]interface{})["script"], Equals, "script2")
+
+ uuid := getback["uuid"].(string)
+
+ getback = make(Dict)
+ err = arv.Get("pipeline_templates", uuid, nil, &getback)
+ c.Assert(err, Equals, nil)
+ c.Assert(getback["name"], Equals, "tmp")
+ c.Assert(getback["components"].(map[string]interface{})["c1"].(map[string]interface{})["script"], Equals, "script1")
+
+ getback = make(Dict)
+ err = arv.Update("pipeline_templates", uuid,
+ Dict{
+ "pipeline_template": Dict{"name": "tmp2"}},
+ &getback)
+ c.Assert(err, Equals, nil)
+ c.Assert(getback["name"], Equals, "tmp2")
+
+ c.Assert(getback["uuid"].(string), Equals, uuid)
+ getback = make(Dict)
+ err = arv.Delete("pipeline_templates", uuid, nil, &getback)
+ c.Assert(err, Equals, nil)
+ c.Assert(getback["name"], Equals, "tmp2")
+ }
}
func (s *ServerRequiredSuite) TestErrorResponse(c *C) {
const (
SpectatorToken = "zw2f4gwx8hw8cjre7yp6v1zylhrhn3m5gvjq73rtpwhmknrybu"
ActiveToken = "3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi"
+ AdminToken = "4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h"
AnonymousToken = "4kg6k6lzmp9kj4cpkcoxie964cmvjahbt4fod9zru44k4jqdmi"
+ DataManagerToken = "320mkve8qkswstz7ff61glpk3mhgghmg67wmic7elw4z41pke1"
FooCollection = "zzzzz-4zz18-fy296fx3hot09f7"
NonexistentCollection = "zzzzz-4zz18-totallynotexist"
HelloWorldCollection = "zzzzz-4zz18-4en62shvi99lxd4"
keep_args['-enforce-permissions'] = 'true'
with open(os.path.join(TEST_TMPDIR, "keep.data-manager-token-file"), "w") as f:
keep_args['-data-manager-token-file'] = f.name
- f.write(os.environ['ARVADOS_API_TOKEN'])
+ f.write(auth_token('data_manager'))
keep_args['-never-delete'] = 'false'
api = arvados.api(
api_token: 1a9ffdcga2o7cw8q12dndskomgs1ygli3ns9k2o9hgzgmktc78
expires_at: 2038-01-01 00:00:00
+data_manager:
+ api_client: untrusted
+ user: system_user
+ api_token: 320mkve8qkswstz7ff61glpk3mhgghmg67wmic7elw4z41pke1
+ expires_at: 2038-01-01 00:00:00
+ scopes:
+ - GET /arvados/v1/collections
+ - GET /arvados/v1/keep_services
+ - GET /arvados/v1/keep_services/accessible
+ - GET /arvados/v1/users/current
+ - POST /arvados/v1/logs
+
miniadmin:
api_client: untrusted
user: miniadmin
"time"
)
-const (
- ActiveUserToken = "3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi"
- AdminToken = "4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h"
-)
-
var arv arvadosclient.ArvadosClient
var keepClient *keepclient.KeepClient
var keepServers []string
arvadostest.StartKeep(2, false)
arv = makeArvadosClient()
+ arv.ApiToken = arvadostest.DataManagerToken
// keep client
keepClient = &keepclient.KeepClient{
return match[1] + "+" + match[2]
}
+func switchToken(t string) func() {
+ orig := arv.ApiToken
+ restore := func() {
+ arv.ApiToken = orig
+ }
+ arv.ApiToken = t
+ return restore
+}
+
func getCollection(t *testing.T, uuid string) Dict {
+ defer switchToken(arvadostest.AdminToken)()
+
getback := make(Dict)
err := arv.Get("collections", uuid, nil, &getback)
if err != nil {
}
func updateCollection(t *testing.T, uuid string, paramName string, paramValue string) {
+ defer switchToken(arvadostest.AdminToken)()
+
err := arv.Update("collections", uuid, arvadosclient.Dict{
"collection": arvadosclient.Dict{
paramName: paramValue,
type Dict map[string]interface{}
func deleteCollection(t *testing.T, uuid string) {
+ defer switchToken(arvadostest.AdminToken)()
+
getback := make(Dict)
err := arv.Delete("collections", uuid, nil, &getback)
if err != nil {
path := keepServers[i] + "/index"
client := http.Client{}
req, err := http.NewRequest("GET", path, nil)
- req.Header.Add("Authorization", "OAuth2 "+AdminToken)
+ req.Header.Add("Authorization", "OAuth2 "+arvadostest.DataManagerToken)
req.Header.Add("Content-Type", "application/octet-stream")
resp, err := client.Do(req)
defer resp.Body.Close()
func getStatus(t *testing.T, path string) interface{} {
client := http.Client{}
req, err := http.NewRequest("GET", path, nil)
- req.Header.Add("Authorization", "OAuth2 "+AdminToken)
+ req.Header.Add("Authorization", "OAuth2 "+arvadostest.DataManagerToken)
req.Header.Add("Content-Type", "application/octet-stream")
resp, err := client.Do(req)
if err != nil {
defer TearDownDataManagerTest(t)
SetupDataManagerTest(t)
- arv.ApiToken = ActiveUserToken
+ arv.ApiToken = arvadostest.ActiveToken
err := singlerun(arv)
if err == nil {
class DockerImageCleaner(DockerImageUseRecorder):
event_handlers = DockerImageUseRecorder.event_handlers.copy()
- def __init__(self, images, docker_client, events):
+ def __init__(self, images, docker_client, events, remove_containers_onexit=False):
super().__init__(images, docker_client, events)
self.logged_unknown = set()
+ self.remove_containers_onexit = remove_containers_onexit
def new_container(self, event, container_hash):
container_image_id = container_hash['Image']
self.images.add_image(image_hash)
return super().new_container(event, container_hash)
+ def _remove_container(self, cid):
+ try:
+ self.docker_client.remove_container(cid)
+ except docker.errors.APIError as error:
+ logger.warning("Failed to remove container %s: %s", cid, error)
+ else:
+ logger.info("Removed container %s", cid)
+
+ @event_handlers.on('die')
+ def clean_container(self, event=None):
+ if self.remove_containers_onexit:
+ self._remove_container(event['id'])
+
+ def check_stopped_containers(self, remove=False):
+ logger.info("Checking for stopped containers")
+ for c in self.docker_client.containers(filters={'status': 'exited'}):
+ logger.info("Container %s %s", c['Id'], c['Status'])
+ if c['Status'][:6] != 'Exited':
+ logger.error("Unexpected status %s for container %s",
+ c['Status'], c['Id'])
+ elif remove:
+ self._remove_container(c['Id'])
+
@event_handlers.on('destroy')
def clean_images(self, event=None):
for image_id in self.images.should_delete():
parser.add_argument(
'--quota', action='store', type=human_size, required=True,
help="space allowance for Docker images, suffixed with K/M/G/T")
+ parser.add_argument(
+ '--remove-stopped-containers', type=str, default='always',
+ choices=['never', 'onexit', 'always'],
+ help="""when to remove stopped containers (default: always, i.e., remove
+ stopped containers found at startup, and remove containers as
+ soon as they exit)""")
parser.add_argument(
'--verbose', '-v', action='count', default=0,
help="log more information")
images, docker_client, docker_client.events(since=1, until=start_time))
use_recorder.run()
cleaner = DockerImageCleaner(
- images, docker_client, docker_client.events(since=start_time))
- logger.info("Starting cleanup loop")
+ images, docker_client, docker_client.events(since=start_time),
+ remove_containers_onexit=args.remove_stopped_containers != 'never')
+ cleaner.check_stopped_containers(
+ remove=args.remove_stopped_containers == 'always')
+ logger.info("Checking image quota at startup")
cleaner.clean_images()
+ logger.info("Listening for docker events")
cleaner.run()
def main(arguments):
class DockerImageUseRecorderTestCase(unittest.TestCase):
TEST_CLASS = cleaner.DockerImageUseRecorder
+ TEST_CLASS_INIT_KWARGS = {}
def setUp(self):
self.images = mock.MagicMock(name='images')
self.docker_client = mock.MagicMock(name='docker_client')
self.events = []
self.recorder = self.TEST_CLASS(self.images, self.docker_client,
- self.encoded_events)
+ self.encoded_events, **self.TEST_CLASS_INIT_KWARGS)
@property
def encoded_events(self):
self.assertFalse(self.images.del_image.called)
+class DockerContainerCleanerTestCase(DockerImageUseRecorderTestCase):
+ TEST_CLASS = cleaner.DockerImageCleaner
+ TEST_CLASS_INIT_KWARGS = {'remove_containers_onexit': True}
+
+ @mock.patch('arvados_docker.cleaner.logger')
+ def test_failed_container_deletion_handling(self, mockLogger):
+ cid = MockDockerId()
+ self.docker_client.remove_container.side_effect = MockException(500)
+ self.events.append(MockEvent('die', docker_id=cid))
+ self.recorder.run()
+ self.docker_client.remove_container.assert_called_with(cid)
+ self.assertEqual("Failed to remove container %s: %s",
+ mockLogger.warning.call_args[0][0])
+ self.assertEqual(cid,
+ mockLogger.warning.call_args[0][1])
+
+
class HumanSizeTestCase(unittest.TestCase):
def check(self, human_str, count, exp):
self.assertEqual(count * (1024 ** exp),
self.assertLessEqual(test_start_time, event_kwargs[0]['until'])
self.assertIn('since', event_kwargs[1])
self.assertEqual(event_kwargs[0]['until'], event_kwargs[1]['since'])
+
+
+class ContainerRemovalTestCase(unittest.TestCase):
+ LIFECYCLE = ['create', 'attach', 'start', 'resize', 'die', 'destroy']
+
+ def setUp(self):
+ self.args = mock.MagicMock(name='args')
+ self.docker_client = mock.MagicMock(name='docker_client')
+ self.existingCID = MockDockerId()
+ self.docker_client.containers.return_value = [{
+ 'Id': self.existingCID,
+ 'Status': 'Exited (0) 6 weeks ago',
+ }, {
+ # If docker_client.containers() returns non-exited
+ # containers for some reason, do not remove them.
+ 'Id': MockDockerId(),
+ 'Status': 'Running',
+ }]
+ self.newCID = MockDockerId()
+ self.docker_client.events.return_value = [
+ MockEvent(e, docker_id=self.newCID).encoded()
+ for e in self.LIFECYCLE]
+
+ def test_remove_onexit(self):
+ self.args.remove_stopped_containers = 'onexit'
+ cleaner.run(self.args, self.docker_client)
+ self.docker_client.remove_container.assert_called_once_with(self.newCID)
+
+ def test_remove_always(self):
+ self.args.remove_stopped_containers = 'always'
+ cleaner.run(self.args, self.docker_client)
+ self.docker_client.remove_container.assert_any_call(self.existingCID)
+ self.docker_client.remove_container.assert_any_call(self.newCID)
+ self.assertEqual(2, self.docker_client.remove_container.call_count)
+
+ def test_remove_never(self):
+ self.args.remove_stopped_containers = 'never'
+ cleaner.run(self.args, self.docker_client)
+ self.assertEqual(0, self.docker_client.remove_container.call_count)
+
+ def test_container_exited_between_subscribe_events_and_check_existing(self):
+ self.args.remove_stopped_containers = 'always'
+ self.docker_client.events.return_value = [
+ MockEvent(e, docker_id=self.existingCID).encoded()
+ for e in ['die', 'destroy']]
+ cleaner.run(self.args, self.docker_client)
+ # Subscribed to events before getting the list of existing
+ # exited containers?
+ self.docker_client.assert_has_calls([
+ mock.call.events(since=mock.ANY),
+ mock.call.containers(filters={'status':'exited'})])
+ # Asked to delete the container twice?
+ self.docker_client.remove_container.assert_has_calls([mock.call(self.existingCID)] * 2)
+ self.assertEqual(2, self.docker_client.remove_container.call_count)
import (
"flag"
"fmt"
+ "os"
+ "strconv"
)
var anonymousTokens tokenSet
type tokenSet []string
-func (ts *tokenSet) Set(t string) error {
- *ts = append(*ts, t)
- return nil
+func (ts *tokenSet) Set(s string) error {
+ v, err := strconv.ParseBool(s)
+ if v && len(*ts) == 0 {
+ *ts = append(*ts, os.Getenv("ARVADOS_API_TOKEN"))
+ } else if !v {
+ *ts = (*ts)[:0]
+ }
+ return err
}
func (ts *tokenSet) String() string {
- return fmt.Sprintf("%+v", (*ts)[:])
+ return fmt.Sprintf("%v", len(*ts) > 0)
+}
+
+func (ts *tokenSet) IsBoolFlag() bool {
+ return true
}
func init() {
- flag.Var(&anonymousTokens, "anonymous-token",
- "API token to try when none of the tokens provided in an HTTP request succeed in reading the desired collection. Multiple anonymous tokens can be provided by using this flag more than once; each token will be attempted in turn until one works.")
+ flag.Var(&anonymousTokens, "allow-anonymous",
+ "Serve public data to anonymous clients. Try the token supplied in the ARVADOS_API_TOKEN environment variable when none of the tokens provided in an HTTP request succeed in reading the desired collection.")
}
//
// Serve HTTP requests at port 1234 on all interfaces:
//
-// keep-web -address=:1234
+// keep-web -listen=:1234
//
// Serve HTTP requests at port 1234 on the interface with IP address 1.2.3.4:
//
-// keep-web -address=1.2.3.4:1234
+// keep-web -listen=1.2.3.4:1234
//
// Proxy configuration
//
//
// Anonymous downloads
//
-// Use the -anonymous-token option to specify a token to use when clients
-// try to retrieve files without providing their own Arvados API token.
+// Use the -allow-anonymous flag with an ARVADOS_API_TOKEN environment
+// variable to specify a token to use when clients try to retrieve
+// files without providing their own Arvados API token.
//
-// keep-web [...] -anonymous-token=zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+// export ARVADOS_API_TOKEN=zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+// keep-web [...] -allow-anonymous
//
// See http://doc.arvados.org/install/install-keep-web.html for examples.
//
// only when the designated origin matches exactly the Host header
// provided by the client or downstream proxy.
//
-// keep-web -address :9999 -attachment-only-host domain.example:9999
+// keep-web -listen :9999 -attachment-only-host domain.example:9999
//
// Trust All Content mode
//
//
// In such cases you can enable trust-all-content mode.
//
-// keep-web -address :9999 -trust-all-content
+// keep-web -listen :9999 -trust-all-content
//
// When using trust-all-content mode, the only effect of the
// -attachment-only-host option is to add a "Content-Disposition:
// attachment" header.
//
-// keep-web -address :9999 -attachment-only-host domain.example:9999 -trust-all-content
+// keep-web -listen :9999 -attachment-only-host domain.example:9999 -trust-all-content
//
package main
// different token before doing anything with the client). We
// set this dummy value during init so it doesn't clobber the
// one used by "run test servers".
- os.Setenv("ARVADOS_API_TOKEN", "xxx")
+ if os.Getenv("ARVADOS_API_TOKEN") == "" {
+ os.Setenv("ARVADOS_API_TOKEN", "xxx")
+ }
}
func main() {
var address string
func init() {
- flag.StringVar(&address, "address", ":80",
+ flag.StringVar(&address, "listen", ":80",
"Address to listen on: \"host:port\", or \":port\" to listen on all interfaces.")
}
_, rep, err = kc.PutB([]byte("some-more-index-data"))
c.Check(err, Equals, nil)
+ kc.Arvados.ApiToken = arvadostest.DataManagerToken
+
// Invoke GetIndex
for _, spec := range []struct {
prefix string
// in order to permit writes.
const MinFreeKilobytes = BlockSize / 1024
-// Until #6221 is resolved, never_delete must be true.
-// However, allow it to be false in testing with TestDataManagerToken
-const TestDataManagerToken = "4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h"
-
// ProcMounts /proc/mounts
var ProcMounts = "/proc/mounts"
&neverDelete,
"never-delete",
true,
- "If set, nothing will be deleted. HTTP 405 will be returned "+
- "for valid DELETE requests.")
+ "If true, nothing will be deleted. "+
+ "Warning: the relevant features in keepstore and data manager have not been extensively tested. "+
+ "You should leave this option alone unless you can afford to lose data.")
flag.StringVar(
&blobSigningKeyFile,
"permission-key-file",
}
}
- if neverDelete != true && dataManagerToken != TestDataManagerToken {
- log.Fatal("never_delete must be true, see #6221")
+ if neverDelete != true {
+ log.Print("never-delete is not set. Warning: the relevant features in keepstore and data manager have not " +
+ "been extensively tested. You should leave this option alone unless you can afford to lose data.")
}
if blobSigningKeyFile != "" {
This actor simply destroys a cloud node, retrying as needed.
"""
+ # Reasons for a shutdown to be cancelled.
+ WINDOW_CLOSED = "shutdown window closed"
+ NODE_BROKEN = "cloud failed to shut down broken node"
+
def __init__(self, timer_actor, cloud_client, arvados_client, node_monitor,
cancellable=True, retry_wait=1, max_retry_wait=180):
# If a ShutdownActor is cancellable, it will ask the
self._monitor = node_monitor.proxy()
self.cloud_node = self._monitor.cloud_node.get()
self.cancellable = cancellable
+ self.cancel_reason = None
self.success = None
def on_start(self):
self.success = success_flag
return super(ComputeNodeShutdownActor, self)._finished()
- def cancel_shutdown(self):
+ def cancel_shutdown(self, reason):
+ self.cancel_reason = reason
+ self._logger.info("Cloud node %s shutdown cancelled: %s.",
+ self.cloud_node.id, reason)
self._finished(success_flag=False)
def _stop_if_window_closed(orig_func):
def stop_wrapper(self, *args, **kwargs):
if (self.cancellable and
(not self._monitor.shutdown_eligible().get())):
- self._logger.info(
- "Cloud node %s shutdown cancelled - no longer eligible.",
- self.cloud_node.id)
- self._later.cancel_shutdown()
+ self._later.cancel_shutdown(self.WINDOW_CLOSED)
return None
else:
return orig_func(self, *args, **kwargs)
@ComputeNodeStateChangeBase._retry()
def shutdown_node(self):
if not self._cloud.destroy_node(self.cloud_node):
- # Force a retry.
- raise cloud_types.LibcloudError("destroy_node failed")
+ if self._cloud.broken(self.cloud_node):
+ self._later.cancel_shutdown(self.NODE_BROKEN)
+ else:
+ # Force a retry.
+ raise cloud_types.LibcloudError("destroy_node failed")
self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
arv_node = self._arvados_node()
if arv_node is None:
# error are still being investigated.
@ShutdownActorBase._retry((subprocess.CalledProcessError, OSError))
- def cancel_shutdown(self):
+ def cancel_shutdown(self, reason):
if self._nodename:
if self._get_slurm_state() in self.SLURM_DRAIN_STATES:
# Resume from "drng" or "drain"
# Node is in a state such as 'idle' or 'alloc' so don't
# try to resume it because that will just raise an error.
pass
- return super(ComputeNodeShutdownActor, self).cancel_shutdown()
+ return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
@ShutdownActorBase._retry((subprocess.CalledProcessError, OSError))
@ShutdownActorBase._stop_if_window_closed
def __init__(self):
self.nodes = {}
self.orphans = {}
+ self._blacklist = set()
# Proxy the methods listed below to self.nodes.
def _proxy_method(name):
def add(self, record):
self.nodes[self.record_key(record)] = record
+ def blacklist(self, key):
+ self._blacklist.add(key)
+
def update_record(self, key, item):
setattr(self.nodes[key], self.RECORD_ATTR, item)
unseen = set(self.nodes.iterkeys())
for item in response:
key = self.item_key(item)
- if key in unseen:
+ if key in self._blacklist:
+ continue
+ elif key in unseen:
unseen.remove(key)
self.update_record(key, item)
else:
self._begin_node_shutdown(record.actor, cancellable=False)
def node_finished_shutdown(self, shutdown_actor):
- success, cloud_node = self._get_actor_attrs(shutdown_actor, 'success',
- 'cloud_node')
+ cloud_node, success, cancel_reason = self._get_actor_attrs(
+ shutdown_actor, 'cloud_node', 'success', 'cancel_reason')
shutdown_actor.stop()
cloud_node_id = cloud_node.id
if not success:
+ if cancel_reason == self._node_shutdown.NODE_BROKEN:
+ self.cloud_nodes.blacklist(cloud_node_id)
del self.shutdowns[cloud_node_id]
elif cloud_node_id in self.booted:
self.booted.pop(cloud_node_id).actor.stop()
class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
def make_mocks(self, cloud_node=None, arvados_node=None,
- shutdown_open=True):
+ shutdown_open=True, node_broken=False):
self.timer = testutil.MockTimer()
self.shutdowns = testutil.MockShutdownTimer()
self.shutdowns._set_state(shutdown_open, 300)
self.cloud_client = mock.MagicMock(name='cloud_client')
+ self.cloud_client.broken.return_value = node_broken
self.arvados_client = mock.MagicMock(name='arvados_client')
self.updates = mock.MagicMock(name='update_mock')
if cloud_node is None:
self.make_actor()
self.check_success_flag(False, 2)
self.assertFalse(self.cloud_client.destroy_node.called)
+ self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED,
+ self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
def test_shutdown_retries_when_cloud_fails(self):
self.make_mocks()
self.cloud_client.destroy_node.return_value = True
self.check_success_flag(True)
+ def test_shutdown_cancelled_when_cloud_fails_on_broken_node(self):
+ self.make_mocks(node_broken=True)
+ self.cloud_client.destroy_node.return_value = False
+ self.make_actor(start_time=0)
+ self.check_success_flag(False, 2)
+ self.assertEqual(1, self.cloud_client.destroy_node.call_count)
+ self.assertEqual(self.ACTOR_CLASS.NODE_BROKEN,
+ self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
+
def test_late_subscribe(self):
self.make_actor()
subscriber = mock.Mock(name='subscriber_mock')
self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
self.assertTrue(shutdown_proxy.called)
+ def test_broken_node_blackholed_after_cancelled_shutdown(self):
+ cloud_node = testutil.cloud_node_mock(8)
+ wishlist = [testutil.MockSize(8)]
+ self.make_daemon([cloud_node], [testutil.arvados_node_mock(8)],
+ wishlist)
+ self.assertEqual(1, self.alive_monitor_count())
+ self.assertFalse(self.node_setup.start.called)
+ monitor = self.monitor_list()[0].proxy()
+ shutdown_proxy = self.node_shutdown.start().proxy
+ shutdown_proxy().cloud_node.get.return_value = cloud_node
+ shutdown_proxy().success.get.return_value = False
+ shutdown_proxy().cancel_reason.get.return_value = self.node_shutdown.NODE_BROKEN
+ self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
+ self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
+ self.daemon.node_finished_shutdown(shutdown_proxy()).get(self.TIMEOUT)
+ self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
+ self.daemon.update_server_wishlist(wishlist).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertEqual(1, self.node_setup.start.call_count)
+
def test_nodes_shutting_down_replaced_below_max_nodes(self):
cloud_node = testutil.cloud_node_mock(6)
self.make_daemon([cloud_node], [testutil.arvados_node_mock(6)])
// srcConfig
var srcConfig apiConfig
srcConfig.APIHost = os.Getenv("ARVADOS_API_HOST")
- srcConfig.APIToken = os.Getenv("ARVADOS_API_TOKEN")
+ srcConfig.APIToken = arvadostest.DataManagerToken
srcConfig.APIHostInsecure = matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE"))
// dstConfig
var dstConfig apiConfig
dstConfig.APIHost = os.Getenv("ARVADOS_API_HOST")
- dstConfig.APIToken = os.Getenv("ARVADOS_API_TOKEN")
+ dstConfig.APIToken = arvadostest.DataManagerToken
dstConfig.APIHostInsecure = matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE"))
if enforcePermissions {
c.Check(err, IsNil)
c.Assert(srcConfig.APIHost, Equals, os.Getenv("ARVADOS_API_HOST"))
- c.Assert(srcConfig.APIToken, Equals, os.Getenv("ARVADOS_API_TOKEN"))
+ c.Assert(srcConfig.APIToken, Equals, arvadostest.DataManagerToken)
c.Assert(srcConfig.APIHostInsecure, Equals, matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE")))
c.Assert(srcConfig.ExternalClient, Equals, false)
c.Check(err, IsNil)
c.Assert(dstConfig.APIHost, Equals, os.Getenv("ARVADOS_API_HOST"))
- c.Assert(dstConfig.APIToken, Equals, os.Getenv("ARVADOS_API_TOKEN"))
+ c.Assert(dstConfig.APIToken, Equals, arvadostest.DataManagerToken)
c.Assert(dstConfig.APIHostInsecure, Equals, matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE")))
c.Assert(dstConfig.ExternalClient, Equals, false)
c.Check(err, IsNil)
fileContent := "ARVADOS_API_HOST=" + os.Getenv("ARVADOS_API_HOST") + "\n"
- fileContent += "ARVADOS_API_TOKEN=" + os.Getenv("ARVADOS_API_TOKEN") + "\n"
+ fileContent += "ARVADOS_API_TOKEN=" + arvadostest.DataManagerToken + "\n"
fileContent += "ARVADOS_API_HOST_INSECURE=" + os.Getenv("ARVADOS_API_HOST_INSECURE") + "\n"
fileContent += "ARVADOS_EXTERNAL_CLIENT=false\n"
fileContent += "ARVADOS_BLOB_SIGNING_KEY=abcdefg"