1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
// lockprefix begins the name of every lockfile kept in lockdir: the
// per-container lockfiles are named lockprefix+uuid+locksuffix, and the
// dir-level lock is lockprefix+"all"+locksuffix.
22 lockprefix = "crunch-run-"
26 // procinfo is saved in each process's lockfile: detach JSON-encodes
// one into the lockfile (and onto its stdout), recording the uuid's
// child PID and the names of the stdout/stderr temp files it created.
27 type procinfo struct {
34 // Detach acquires a lock for the given uuid, and starts the current
35 // program as a child process (with -no-detach prepended to the given
36 // arguments so the child knows not to detach again). The lock is
37 // passed along to the child process.
//
// The work is done by detach; its error (if any) is handed to exitcode
// together with stderr, and exitcode's result is returned as the exit
// code.
38 func Detach(uuid string, args []string, stdout, stderr io.Writer) int {
39 return exitcode(stderr, detach(uuid, args, stdout, stderr))
// detach does the work of Detach, reporting failure as an error rather
// than an exit code. It locks the lockfile for uuid, creates two temp
// files whose names are recorded as procinfo.Stdout and procinfo.Stderr,
// runs args[0] with "-no-detach" inserted ahead of args[1:], and writes
// the resulting procinfo as JSON to both stdout and the lockfile.
//
// NOTE(review): args[0] is indexed without a visible length check, so
// args is assumed to be non-empty — confirm against callers.
41 func detach(uuid string, args []string, stdout, stderr io.Writer) error {
42 lockfile, err := func() (*os.File, error) {
43 // We must hold the dir-level lock between
44 // opening/creating the lockfile and acquiring LOCK_EX
45 // on it, to avoid racing with ListProcesses's
46 // alive-checking and garbage collection.
47 dirlock, err := lockall()
52 lockfilename := filepath.Join(lockdir, lockprefix+uuid+locksuffix)
53 lockfile, err := os.OpenFile(lockfilename, os.O_CREATE|os.O_RDWR, 0700)
55 return nil, fmt.Errorf("open %s: %s", lockfilename, err)
// LOCK_NB: fail right away instead of waiting if the lockfile is
// already locked.
57 err = syscall.Flock(int(lockfile.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
60 return nil, fmt.Errorf("lock %s: %s", lockfilename, err)
67 defer lockfile.Close()
70 outfile, err := ioutil.TempFile("", "crunch-run-"+uuid+"-stdout-")
75 errfile, err := ioutil.TempFile("", "crunch-run-"+uuid+"-stderr-")
// Don't leave the stdout temp file behind.
// NOTE(review): presumably this runs only when creating the stderr
// temp file failed; the guarding condition is not visible here.
77 os.Remove(outfile.Name())
82 cmd := exec.Command(args[0], append([]string{"-no-detach"}, args[1:]...)...)
85 // Child inherits lockfile.
86 cmd.ExtraFiles = []*os.File{lockfile}
87 // Ensure child isn't interrupted even if we receive signals
88 // from parent (sshd) while sending lockfile content to
90 cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
// Remove both temp files before reporting the exec failure.
93 os.Remove(outfile.Name())
94 os.Remove(errfile.Name())
95 return fmt.Errorf("exec %s: %s", cmd.Path, err)
// A single Encode call sends the same JSON record to the caller's
// stdout and into the lockfile.
98 w := io.MultiWriter(stdout, lockfile)
99 err = json.NewEncoder(w).Encode(procinfo{
101 PID: cmd.Process.Pid,
102 Stdout: outfile.Name(),
103 Stderr: errfile.Name(),
106 os.Remove(outfile.Name())
107 os.Remove(errfile.Name())
113 // KillProcess finds the crunch-run process corresponding to the given
114 // uuid, and sends the given signal to it. It then waits up to 1
115 // second for the process to die. It returns 0 if the process is
116 // successfully killed or didn't exist in the first place.
//
// The work is done by kill; its error (if any) is handed to exitcode
// together with stderr, and exitcode's result is returned as the exit
// code.
117 func KillProcess(uuid string, signal syscall.Signal, stdout, stderr io.Writer) int {
118 return exitcode(stderr, kill(uuid, signal, stdout, stderr))
// kill does the work of KillProcess, reporting failure as an error
// rather than an exit code. It opens the lockfile for uuid, decodes the
// procinfo stored there, sends signal to the recorded PID, and then
// probes that PID with signal 0 every 10ms until the probe fails or one
// second has passed.
121 func kill(uuid string, signal syscall.Signal, stdout, stderr io.Writer) error {
122 path := filepath.Join(lockdir, lockprefix+uuid+locksuffix)
123 f, err := os.Open(path)
124 if os.IsNotExist(err) {
126 } else if err != nil {
127 return fmt.Errorf("open %s: %s", path, err)
132 err = json.NewDecoder(f).Decode(&pi)
// No trailing newline in the message: KillProcess hands this error to
// exitcode, which prints it with Fprintln and so adds the newline
// itself.
134 return fmt.Errorf("decode %s: %s", path, err)
// Only trust a lockfile that names this uuid and a non-zero PID.
137 if pi.UUID != uuid || pi.PID == 0 {
138 return fmt.Errorf("%s: bogus procinfo: %+v", path, pi)
141 proc, err := os.FindProcess(pi.PID)
143 return fmt.Errorf("%s: find process %d: %s", uuid, pi.PID, err)
146 err = proc.Signal(signal)
// Signal 0 delivers nothing; it only reports whether the process can
// still be signalled. Keep probing every 10ms until a Signal call
// fails or the one-second deadline passes.
147 for deadline := time.Now().Add(time.Second); err == nil && time.Now().Before(deadline); time.Sleep(time.Second / 100) {
148 err = proc.Signal(syscall.Signal(0))
151 return fmt.Errorf("%s: pid %d: sent signal %d (%s) but process is still alive", uuid, pi.PID, signal, signal)
// Report the error from the last Signal call on stderr.
153 fmt.Fprintf(stderr, "%s: pid %d: %s\n", uuid, pi.PID, err)
157 // ListProcesses lists UUIDs of active crunch-run processes, writing
// one UUID per line to stdout. It walks the lockfiles in lockdir,
// removes any lockfile on which it can take a shared flock, and prints
// the UUID recorded in each remaining one. Per-file problems are
// reported on stderr; the error returned by the walk is passed to
// exitcode to produce the return value.
158 func ListProcesses(stdout, stderr io.Writer) int {
159 // filepath.Walk does not follow symlinks, so we must walk
160 // lockdir+"/." in case lockdir itself is a symlink.
161 walkdir := lockdir + "/."
162 return exitcode(stderr, filepath.Walk(walkdir, func(path string, info os.FileInfo, err error) error {
// Walk reports a failed lstat by calling us with a non-nil err and a
// nil info, so err must be checked before info is used.
if err != nil {
if path != walkdir && os.IsNotExist(err) {
// The entry vanished after the directory was read (lockfiles
// are unlinked below when found stale); nothing to list.
return nil
}
return err
}
163 if info.IsDir() && path != walkdir {
164 return filepath.SkipDir
166 if name := info.Name(); !strings.HasPrefix(name, lockprefix) || !strings.HasSuffix(name, locksuffix) {
169 if info.Size() == 0 {
170 // race: process has opened/locked but hasn't yet written pid/uuid
174 f, err := os.Open(path)
180 // Ensure other processes don't acquire this lockfile
181 // after we have decided it is abandoned but before we
183 dirlock, err := lockall()
// Non-blocking shared lock attempt on the lockfile; detach holds
// LOCK_EX on it for a process it has started.
187 err = syscall.Flock(int(f.Fd()), syscall.LOCK_SH|syscall.LOCK_NB)
190 err := os.Remove(path)
193 fmt.Fprintf(stderr, "unlink %s: %s\n", f.Name(), err)
200 err = json.NewDecoder(f).Decode(&pi)
202 fmt.Fprintf(stderr, "%s: %s\n", path, err)
205 if pi.UUID == "" || pi.PID == 0 {
// Terminate the message with a newline like the other stderr
// reports in this function, so successive messages don't run
// together.
206 fmt.Fprintf(stderr, "%s: bogus procinfo: %+v\n", path, pi)
210 fmt.Fprintln(stdout, pi.UUID)
215 // exitcode turns err into a process exit code. If err is nil, return
// 0 ("success"); otherwise, print err to stderr, followed by a newline.
// NOTE(review): the error path presumably returns a non-zero code; the
// return statements are not visible here — confirm.
217 func exitcode(stderr io.Writer, err error) int {
219 fmt.Fprintln(stderr, err)
225 // lockall acquires a dir-level lock: an exclusive flock, requested
// without LOCK_NB, on the lockprefix+"all"+locksuffix file in lockdir
// (opened with O_CREATE). The lock must be held while creating or deleting
226 // container-specific lockfiles, to avoid races during the intervals
227 // when those container-specific lockfiles are open but not locked.
//
229 // Caller releases the lock by closing the returned file.
230 func lockall() (*os.File, error) {
231 lockfile := filepath.Join(lockdir, lockprefix+"all"+locksuffix)
232 f, err := os.OpenFile(lockfile, os.O_CREATE|os.O_RDWR, 0700)
234 return nil, fmt.Errorf("open %s: %s", lockfile, err)
// No LOCK_NB flag is passed here, unlike the per-container lockfile
// lock taken in detach.
236 err = syscall.Flock(int(f.Fd()), syscall.LOCK_EX)
239 return nil, fmt.Errorf("lock %s: %s", lockfile, err)