X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/0393a615c6c795e66e021188a1a3c00c04ca68e8..de9a5e2703cca69b0ba6e8e3e6102ee267b7447e:/services/crunch-run/background.go diff --git a/services/crunch-run/background.go b/services/crunch-run/background.go index ee054f056f..0d4612908c 100644 --- a/services/crunch-run/background.go +++ b/services/crunch-run/background.go @@ -32,22 +32,38 @@ type procinfo struct { } // Detach acquires a lock for the given uuid, and starts the current -// program as a child process (with -detached prepended to the given +// program as a child process (with -no-detach prepended to the given // arguments so the child knows not to detach again). The lock is // passed along to the child process. func Detach(uuid string, args []string, stdout, stderr io.Writer) int { return exitcode(stderr, detach(uuid, args, stdout, stderr)) } func detach(uuid string, args []string, stdout, stderr io.Writer) error { - lockfile, err := os.OpenFile(filepath.Join(lockdir, lockprefix+uuid+locksuffix), os.O_CREATE|os.O_RDWR, 0700) + lockfile, err := func() (*os.File, error) { + // We must hold the dir-level lock between + // opening/creating the lockfile and acquiring LOCK_EX + // on it, to avoid racing with the ListProcess's + // alive-checking and garbage collection. + dirlock, err := lockall() + if err != nil { + return nil, err + } + defer dirlock.Close() + lockfile, err := os.OpenFile(filepath.Join(lockdir, lockprefix+uuid+locksuffix), os.O_CREATE|os.O_RDWR, 0700) + if err != nil { + return nil, err + } + err = syscall.Flock(int(lockfile.Fd()), syscall.LOCK_EX|syscall.LOCK_NB) + if err != nil { + lockfile.Close() + return nil, err + } + return lockfile, nil + }() if err != nil { return err } defer lockfile.Close() - err = syscall.Flock(int(lockfile.Fd()), syscall.LOCK_EX|syscall.LOCK_NB) - if err != nil { - return err - } lockfile.Truncate(0) outfile, err := ioutil.TempFile("", "crunch-run-"+uuid+"-stdout-") @@ -62,7 +78,7 @@ func detach(uuid string, args []string, stdout, stderr io.Writer) error { } defer errfile.Close() - cmd := exec.Command(args[0], append([]string{"-detached"}, args[1:]...)...) + cmd := exec.Command(args[0], append([]string{"-no-detach"}, args[1:]...)...) cmd.Stdout = outfile cmd.Stderr = errfile // Child inherits lockfile. @@ -80,6 +96,7 @@ func detach(uuid string, args []string, stdout, stderr io.Writer) error { w := io.MultiWriter(stdout, lockfile) err = json.NewEncoder(w).Encode(procinfo{ + UUID: uuid, PID: cmd.Process.Pid, Stdout: outfile.Name(), Stderr: errfile.Name(), @@ -138,8 +155,11 @@ func kill(uuid string, signal syscall.Signal, stdout, stderr io.Writer) error { // List UUIDs of active crunch-run processes. func ListProcesses(stdout, stderr io.Writer) int { - return exitcode(stderr, filepath.Walk(lockdir, func(path string, info os.FileInfo, err error) error { - if info.IsDir() { + // filepath.Walk does not follow symlinks, so we must walk + // lockdir+"/." in case lockdir itself is a symlink. + walkdir := lockdir + "/." + return exitcode(stderr, filepath.Walk(walkdir, func(path string, info os.FileInfo, err error) error { + if info.IsDir() && path != walkdir { return filepath.SkipDir } if name := info.Name(); !strings.HasPrefix(name, lockprefix) || !strings.HasSuffix(name, locksuffix) { @@ -156,19 +176,24 @@ func ListProcesses(stdout, stderr io.Writer) int { } defer f.Close() - // TODO: Do this check without risk of disrupting lock - // acquisition during races, e.g., by connecting to a - // unix socket or checking /proc/$pid/fd/$n -> - // lockfile. - err = syscall.Flock(int(f.Fd()), syscall.LOCK_SH) + // Ensure other processes don't acquire this lockfile + // after we have decided it is abandoned but before we + // have deleted it. + dirlock, err := lockall() + if err != nil { + return err + } + err = syscall.Flock(int(f.Fd()), syscall.LOCK_SH|syscall.LOCK_NB) if err == nil { // lockfile is stale err := os.Remove(path) + dirlock.Close() if err != nil { fmt.Fprintln(stderr, err) } return nil } + dirlock.Close() var pi procinfo err = json.NewDecoder(f).Decode(&pi) @@ -195,3 +220,21 @@ func exitcode(stderr io.Writer, err error) int { } return 0 } + +// Acquire a dir-level lock. Must be held while creating or deleting +// container-specific lockfiles, to avoid races during the intervals +// when those container-specific lockfiles are open but not locked. +// +// Caller releases the lock by closing the returned file. +func lockall() (*os.File, error) { + f, err := os.OpenFile(filepath.Join(lockdir, lockprefix+"all"+locksuffix), os.O_CREATE|os.O_RDWR, 0700) + if err != nil { + return nil, err + } + err = syscall.Flock(int(f.Fd()), syscall.LOCK_EX) + if err != nil { + f.Close() + return nil, err + } + return f, nil +}