279327ba18811ba8ad6339600cc124460f2fc35c
[arvados.git] / services / crunch-dispatch-local / crunch-dispatch-local.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package main
6
7 // Dispatcher service for Crunch that runs containers locally.
8
9 import (
10         "context"
11         "flag"
12         "fmt"
13         "log"
14         "os"
15         "os/exec"
16         "os/signal"
17         "sync"
18         "syscall"
19         "time"
20
21         "git.curoverse.com/arvados.git/sdk/go/arvados"
22         "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
23         "git.curoverse.com/arvados.git/sdk/go/dispatch"
24 )
25
// version is the version string reported by the -version flag and in the
// startup log message. It defaults to "dev".
// NOTE(review): presumably overridden at build time via -ldflags -X; confirm
// against the build scripts.
var (
	version = "dev"
)
27
28 func main() {
29         err := doMain()
30         if err != nil {
31                 log.Fatalf("%q", err)
32         }
33 }
34
// runningCmds maps a container UUID to the crunch-run command that was
// started for it. Entries are added by run once the command has started and
// removed when it exits. It is initialized in doMain.
var runningCmds map[string]*exec.Cmd

// runningCmdsMutex guards every access to runningCmds.
var runningCmdsMutex sync.Mutex

// waitGroup counts the Locked containers currently being handled by run, so
// that doMain can wait for them before returning.
var waitGroup sync.WaitGroup

// crunchRunCommand points at the value of the -crunch-run-command flag: the
// program that run executes for each container. It is set in doMain.
var crunchRunCommand *string
41
42 func doMain() error {
43         flags := flag.NewFlagSet("crunch-dispatch-local", flag.ExitOnError)
44
45         pollInterval := flags.Int(
46                 "poll-interval",
47                 10,
48                 "Interval in seconds to poll for queued containers")
49
50         crunchRunCommand = flags.String(
51                 "crunch-run-command",
52                 "/usr/bin/crunch-run",
53                 "Crunch command to run container")
54
55         getVersion := flags.Bool(
56                 "version",
57                 false,
58                 "Print version information and exit.")
59
60         // Parse args; omit the first arg which is the command name
61         flags.Parse(os.Args[1:])
62
63         // Print version information if requested
64         if *getVersion {
65                 fmt.Printf("crunch-dispatch-local %s\n", version)
66                 return nil
67         }
68
69         log.Printf("crunch-dispatch-local %s started", version)
70
71         runningCmds = make(map[string]*exec.Cmd)
72
73         arv, err := arvadosclient.MakeArvadosClient()
74         if err != nil {
75                 log.Printf("Error making Arvados client: %v", err)
76                 return err
77         }
78         arv.Retries = 25
79
80         dispatcher := dispatch.Dispatcher{
81                 Arv:          arv,
82                 RunContainer: run,
83                 PollPeriod:   time.Duration(*pollInterval) * time.Second,
84         }
85
86         ctx, cancel := context.WithCancel(context.Background())
87         err = dispatcher.Run(ctx)
88         if err != nil {
89                 return err
90         }
91
92         c := make(chan os.Signal, 1)
93         signal.Notify(c, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)
94         sig := <-c
95         log.Printf("Received %s, shutting down", sig)
96         signal.Stop(c)
97
98         cancel()
99
100         runningCmdsMutex.Lock()
101         // Finished dispatching; interrupt any crunch jobs that are still running
102         for _, cmd := range runningCmds {
103                 cmd.Process.Signal(os.Interrupt)
104         }
105         runningCmdsMutex.Unlock()
106
107         // Wait for all running crunch jobs to complete / terminate
108         waitGroup.Wait()
109
110         return nil
111 }
112
113 func startFunc(container arvados.Container, cmd *exec.Cmd) error {
114         return cmd.Start()
115 }
116
117 var startCmd = startFunc
118
119 // Run a container.
120 //
121 // If the container is Locked, start a new crunch-run process and wait until
122 // crunch-run completes.  If the priority is set to zero, set an interrupt
123 // signal to the crunch-run process.
124 //
125 // If the container is in any other state, or is not Complete/Cancelled after
126 // crunch-run terminates, mark the container as Cancelled.
127 func run(dispatcher *dispatch.Dispatcher,
128         container arvados.Container,
129         status <-chan arvados.Container) {
130
131         uuid := container.UUID
132
133         if container.State == dispatch.Locked {
134                 waitGroup.Add(1)
135
136                 cmd := exec.Command(*crunchRunCommand, uuid)
137                 cmd.Stdin = nil
138                 cmd.Stderr = os.Stderr
139                 cmd.Stdout = os.Stderr
140
141                 log.Printf("Starting container %v", uuid)
142
143                 // Add this crunch job to the list of runningCmds only if we
144                 // succeed in starting crunch-run.
145
146                 runningCmdsMutex.Lock()
147                 if err := startCmd(container, cmd); err != nil {
148                         runningCmdsMutex.Unlock()
149                         log.Printf("Error starting %v for %v: %q", *crunchRunCommand, uuid, err)
150                         dispatcher.UpdateState(uuid, dispatch.Cancelled)
151                 } else {
152                         runningCmds[uuid] = cmd
153                         runningCmdsMutex.Unlock()
154
155                         // Need to wait for crunch-run to exit
156                         done := make(chan struct{})
157
158                         go func() {
159                                 if _, err := cmd.Process.Wait(); err != nil {
160                                         log.Printf("Error while waiting for crunch job to finish for %v: %q", uuid, err)
161                                 }
162                                 log.Printf("sending done")
163                                 done <- struct{}{}
164                         }()
165
166                 Loop:
167                         for {
168                                 select {
169                                 case <-done:
170                                         break Loop
171                                 case c := <-status:
172                                         // Interrupt the child process if priority changes to 0
173                                         if (c.State == dispatch.Locked || c.State == dispatch.Running) && c.Priority == 0 {
174                                                 log.Printf("Sending SIGINT to pid %d to cancel container %v", cmd.Process.Pid, uuid)
175                                                 cmd.Process.Signal(os.Interrupt)
176                                         }
177                                 }
178                         }
179                         close(done)
180
181                         log.Printf("Finished container run for %v", uuid)
182
183                         // Remove the crunch job from runningCmds
184                         runningCmdsMutex.Lock()
185                         delete(runningCmds, uuid)
186                         runningCmdsMutex.Unlock()
187                 }
188                 waitGroup.Done()
189         }
190
191         // If the container is not finalized, then change it to "Cancelled".
192         err := dispatcher.Arv.Get("containers", uuid, nil, &container)
193         if err != nil {
194                 log.Printf("Error getting final container state: %v", err)
195         }
196         if container.State == dispatch.Locked || container.State == dispatch.Running {
197                 log.Printf("After %s process termination, container state for %v is %q.  Updating it to %q",
198                         *crunchRunCommand, container.State, uuid, dispatch.Cancelled)
199                 dispatcher.UpdateState(uuid, dispatch.Cancelled)
200         }
201
202         // drain any subsequent status changes
203         for range status {
204         }
205
206         log.Printf("Finalized container %v", uuid)
207 }