From: Peter Amstutz Date: Mon, 4 Dec 2017 18:47:46 +0000 (-0500) Subject: Merge branch '12614-broken-docker' refs #12614 X-Git-Tag: 1.1.2~35 X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/82b6440aaa9f265509770150d80e44319dc66fc7?hp=83bd8999a9cc528ff0169d19bd9d974760a59a45 Merge branch '12614-broken-docker' refs #12614 Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go index fc0dda718c..d455dd2c9f 100644 --- a/services/crunch-run/crunchrun.go +++ b/services/crunch-run/crunchrun.go @@ -19,6 +19,7 @@ import ( "os/signal" "path" "path/filepath" + "regexp" "runtime" "runtime/pprof" "sort" @@ -228,12 +229,15 @@ func (runner *ContainerRunner) stopSignals() { } } -var errorBlacklist = []string{"Cannot connect to the Docker daemon"} +var errorBlacklist = []string{ + "(?ms).*[Cc]annot connect to the Docker daemon.*", + "(?ms).*oci runtime error.*starting container process.*container init.*mounting.*to rootfs.*no such file or directory.*", +} var brokenNodeHook *string = flag.String("broken-node-hook", "", "Script to run if node is detected to be broken (for example, Docker daemon is not running)") func (runner *ContainerRunner) checkBrokenNode(goterr error) bool { for _, d := range errorBlacklist { - if strings.Index(goterr.Error(), d) != -1 { + if m, e := regexp.MatchString(d, goterr.Error()); m && e == nil { runner.CrunchLog.Printf("Error suggests node is unable to run containers: %v", goterr) if *brokenNodeHook == "" { runner.CrunchLog.Printf("No broken node hook provided, cannot mark node as broken.") @@ -915,7 +919,7 @@ func (runner *ContainerRunner) StartContainer() error { dockertypes.ContainerStartOptions{}) if err != nil { var advice string - if strings.Contains(err.Error(), "no such file or directory") { + if m, e := regexp.MatchString("(?ms).*(exec|System error).*(no such file or directory|file not found).*", err.Error()); m && e == nil { advice = fmt.Sprintf("\nPossible causes: command %q is missing, the interpreter given in #! is missing, or script has Windows line endings.", runner.Container.Command[0]) } return fmt.Errorf("could not start container: %v%s", err, advice) diff --git a/services/crunch-run/crunchrun_test.go b/services/crunch-run/crunchrun_test.go index 97faa89fb1..e1d9fed730 100644 --- a/services/crunch-run/crunchrun_test.go +++ b/services/crunch-run/crunchrun_test.go @@ -130,6 +130,19 @@ func (t *TestDockerClient) ContainerCreate(ctx context.Context, config *dockerco } func (t *TestDockerClient) ContainerStart(ctx context.Context, container string, options dockertypes.ContainerStartOptions) error { + if t.finish == 3 { + return errors.New(`Error response from daemon: oci runtime error: container_linux.go:247: starting container process caused "process_linux.go:359: container init caused \"rootfs_linux.go:54: mounting \\\"/tmp/keep453790790/by_id/99999999999999999999999999999999+99999/myGenome\\\" to rootfs \\\"/tmp/docker/overlay2/9999999999999999999999999999999999999999999999999999999999999999/merged\\\" at \\\"/tmp/docker/overlay2/9999999999999999999999999999999999999999999999999999999999999999/merged/keep/99999999999999999999999999999999+99999/myGenome\\\" caused \\\"no such file or directory\\\"\""`) + } + if t.finish == 4 { + return errors.New(`panic: standard_init_linux.go:175: exec user process caused "no such file or directory"`) + } + if t.finish == 5 { + return errors.New(`Error response from daemon: Cannot start container 41f26cbc43bcc1280f4323efb1830a394ba8660c9d1c2b564ba42bf7f7694845: [8] System error: no such file or directory`) + } + if t.finish == 6 { + return errors.New(`Error response from daemon: Cannot start container 58099cd76c834f3dc2a4fb76c8028f049ae6d4fdf0ec373e1f2cfea030670c2d: [8] System error: exec: "foobar": executable file not found in $PATH`) + } + if container == "abcde" { // t.fn gets executed in ContainerWait return nil @@ -1835,3 +1848,91 @@ func (s *TestSuite) TestFullBrokenDocker2(c *C) { c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*unable to run containers.*") c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*No broken node hook.*") } + +func (s *TestSuite) TestFullBrokenDocker3(c *C) { + ech := "" + brokenNodeHook = &ech + + api, _, _ := FullRunHelper(c, `{ + "command": ["echo", "hello world"], + "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122", + "cwd": ".", + "environment": {}, + "mounts": {"/tmp": {"kind": "tmp"} }, + "output_path": "/tmp", + "priority": 1, + "runtime_constraints": {} +}`, nil, 3, func(t *TestDockerClient) { + t.logWriter.Write(dockerLog(1, "hello world\n")) + t.logWriter.Close() + }) + + c.Check(api.CalledWith("container.state", "Cancelled"), NotNil) + c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*unable to run containers.*") +} + +func (s *TestSuite) TestBadCommand1(c *C) { + ech := "" + brokenNodeHook = &ech + + api, _, _ := FullRunHelper(c, `{ + "command": ["echo", "hello world"], + "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122", + "cwd": ".", + "environment": {}, + "mounts": {"/tmp": {"kind": "tmp"} }, + "output_path": "/tmp", + "priority": 1, + "runtime_constraints": {} +}`, nil, 4, func(t *TestDockerClient) { + t.logWriter.Write(dockerLog(1, "hello world\n")) + t.logWriter.Close() + }) + + c.Check(api.CalledWith("container.state", "Cancelled"), NotNil) + c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*Possible causes:.*is missing.*") +} + +func (s *TestSuite) TestBadCommand2(c *C) { + ech := "" + brokenNodeHook = &ech + + api, _, _ := FullRunHelper(c, `{ + "command": ["echo", "hello world"], + "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122", + "cwd": ".", + "environment": {}, + "mounts": {"/tmp": {"kind": "tmp"} }, + "output_path": "/tmp", + "priority": 1, + "runtime_constraints": {} +}`, nil, 5, func(t *TestDockerClient) { + t.logWriter.Write(dockerLog(1, "hello world\n")) + t.logWriter.Close() + }) + + c.Check(api.CalledWith("container.state", "Cancelled"), NotNil) + c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*Possible causes:.*is missing.*") +} + +func (s *TestSuite) TestBadCommand3(c *C) { + ech := "" + brokenNodeHook = &ech + + api, _, _ := FullRunHelper(c, `{ + "command": ["echo", "hello world"], + "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122", + "cwd": ".", + "environment": {}, + "mounts": {"/tmp": {"kind": "tmp"} }, + "output_path": "/tmp", + "priority": 1, + "runtime_constraints": {} +}`, nil, 6, func(t *TestDockerClient) { + t.logWriter.Write(dockerLog(1, "hello world\n")) + t.logWriter.Close() + }) + + c.Check(api.CalledWith("container.state", "Cancelled"), NotNil) + c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*Possible causes:.*is missing.*") +}