8 "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
9 "git.curoverse.com/arvados.git/sdk/go/keepclient"
10 "git.curoverse.com/arvados.git/sdk/go/manifest"
11 "github.com/curoverse/dockerclient"
24 // IArvadosClient is the minimal set of Arvados API methods used by crunch-run.
// NewContainerRunner accepts this interface rather than a concrete client.
25 type IArvadosClient interface {
// Create is used in this file to create "collections" records (the output
// and log collections); the API response is decoded into output.
26 Create(resourceType string, parameters arvadosclient.Dict, output interface{}) error
// Get is used in this file to fetch "collections" and "containers" records
// identified by the uuid argument into output.
27 Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
// Update is used in this file to change the state and results of a
// "containers" record; callers here pass nil for output.
28 Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) (err error)
31 // ErrCancelled is the error returned when the container is cancelled.
// Run compares the error from StartContainer against this sentinel.
32 var ErrCancelled = errors.New("Cancelled")
34 // IKeepClient is the minimal set of Keep API methods used by crunch-run.
35 type IKeepClient interface {
// PutHB is not called in the visible part of this file; presumably it stores
// buf under hash in Keep — confirm against the keepclient package.
36 PutHB(hash string, buf []byte) (string, int, error)
// ManifestFileReader opens filename within manifest m for reading. LoadImage
// uses it to stream the Docker image tarball out of Keep.
37 ManifestFileReader(m manifest.Manifest, filename string) (keepclient.ReadCloserWithLen, error)
40 // Mount describes the mount points to create inside the container.
// (The struct header line is not visible in this excerpt; the lines below are
// its fields.)
// Kind selects the mount type; SetupMounts handles "collection" and "tmp" and
// rejects anything else with an "Unknown mount kind" error.
42 Kind string `json:"kind"`
43 Writable bool `json:"writable"`
// PortableDataHash and UUID identify the collection for a "collection" mount.
// SetupMounts returns an error when both are set.
44 PortableDataHash string `json:"portable_data_hash"`
45 UUID string `json:"uuid"`
// DeviceType is not read anywhere in the visible part of this file.
46 DeviceType string `json:"device_type"`
49 // CollectionRecord is the collection record returned by the API server.
// Only the fields crunch-run reads are declared.
50 type CollectionRecord struct {
51 ManifestText string `json:"manifest_text"`
52 PortableDataHash string `json:"portable_data_hash"`
55 // ContainerRecord is the container record returned by the API server.
// Run populates it with a Get on "containers" using the UUID field.
56 type ContainerRecord struct {
57 UUID string `json:"uuid"`
// Command becomes the Docker container's Cmd in StartContainer.
58 Command []string `json:"command"`
// ContainerImage identifies the collection that LoadImage fetches to find
// the Docker image tarball.
59 ContainerImage string `json:"container_image"`
// Cwd becomes the container's working directory unless it is ".".
60 Cwd string `json:"cwd"`
// Environment entries are passed to the container as KEY=value strings.
61 Environment map[string]string `json:"environment"`
// Mounts maps a bind path inside the container to its Mount description.
62 Mounts map[string]Mount `json:"mounts"`
// OutputPath is the bind path whose host directory becomes HostOutputDir.
63 OutputPath string `json:"output_path"`
64 Priority int `json:"priority"`
65 RuntimeConstraints map[string]interface{} `json:"runtime_constraints"`
66 State string `json:"state"`
67 Output string `json:"output"`
70 // NewLogWriter is a factory function to create a new log writer.
// The name argument labels the stream; this file uses "crunch-run",
// "arv-mount", "stdout" and "stderr". NewContainerRunner installs
// ContainerRunner.NewArvLogWriter as the default implementation.
71 type NewLogWriter func(name string) io.WriteCloser
// RunArvMount is a factory function that starts arv-mount with the given
// argument list and returns the resulting command. SetupMounts calls it;
// NewContainerRunner installs ContainerRunner.ArvMountCmd as the default.
73 type RunArvMount func([]string) (*exec.Cmd, error)
// MkTempDir is a factory function that creates a temporary directory and
// returns its path. It has the signature of ioutil.TempDir, which
// NewContainerRunner installs as the default.
75 type MkTempDir func(string, string) (string, error)
77 // ThinDockerClient is the minimal Docker client interface used by crunch-run.
// The method set mirrors the dockerclient package; main passes a
// *dockerclient.DockerClient to NewContainerRunner as this interface.
78 type ThinDockerClient interface {
// StopContainer is called from the signal handler with a timeout of 10.
79 StopContainer(id string, timeout int) error
// InspectImage is used by LoadImage to check the local Docker image store.
80 InspectImage(id string) (*dockerclient.ImageInfo, error)
// LoadImage is fed the image tarball read from Keep.
81 LoadImage(reader io.Reader) error
82 CreateContainer(config *dockerclient.ContainerConfig, name string, authConfig *dockerclient.AuthConfig) (string, error)
83 StartContainer(id string, config *dockerclient.HostConfig) error
// AttachContainer returns the multiplexed stream consumed by
// ProcessDockerAttach.
84 AttachContainer(id string, options *dockerclient.AttachOptions) (io.ReadCloser, error)
// Wait returns a channel that WaitFinish reads the container result from.
85 Wait(id string) <-chan dockerclient.WaitResult
// RemoveImage is not called in the visible part of this file.
86 RemoveImage(name string, force bool) ([]*dockerclient.ImageDelete, error)
// NOTE(review): several lines of this struct (including the end of the
// sentence below and some fields) are not visible in this excerpt.
89 // ContainerRunner is the main stateful struct used for a single execution of a
91 type ContainerRunner struct {
92 Docker ThinDockerClient
93 ArvClient IArvadosClient
// Embedded Docker container configuration; LoadImage sets Image and
// StartContainer sets Cmd, WorkingDir, Env and NetworkDisabled on it.
96 dockerclient.ContainerConfig
// loggingDone is created by AttachStreams; ProcessDockerAttach sends true on
// it and closes it once the container streams have been drained.
100 loggingDone chan bool
101 CrunchLog *ThrottledLogger
102 Stdout *ThrottledLogger
103 Stderr *ThrottledLogger
104 LogCollection *CollectionWriter
// CleanupTempDir lists directories that CleanupDirs removes with
// os.RemoveAll.
111 CleanupTempDir []string
// CancelLock is held by the signal handler while it marks the runner
// cancelled, and by StartContainer for its whole duration.
114 CancelLock sync.Mutex
116 SigChan chan os.Signal
// ArvMountExit receives the result of waiting on the arv-mount process and is
// then closed.
117 ArvMountExit chan error
121 // SetupSignals sets up signal handling to gracefully terminate the underlying
122 // Docker container and update state when receiving a TERM, INT or QUIT signal.
123 func (runner *ContainerRunner) SetupSignals() {
// Buffer of one so a signal delivered before the goroutine is receiving is
// not dropped by signal.Notify.
124 runner.SigChan = make(chan os.Signal, 1)
125 signal.Notify(runner.SigChan, syscall.SIGTERM)
126 signal.Notify(runner.SigChan, syscall.SIGINT)
127 signal.Notify(runner.SigChan, syscall.SIGQUIT)
// The handler goroutine takes the channel as a receive-only parameter.
129 go func(sig <-chan os.Signal) {
// NOTE(review): runner.Cancelled is read here before CancelLock is acquired
// on the next line; the check and the write are not under the same lock.
131 if !runner.Cancelled {
132 runner.CancelLock.Lock()
133 runner.Cancelled = true
// Only stop the container if one has already been created.
134 if runner.ContainerID != "" {
// NOTE(review): the error returned by StopContainer is discarded.
135 runner.Docker.StopContainer(runner.ContainerID, 10)
137 runner.CancelLock.Unlock()
143 // LoadImage determines the docker image id from the container record and
144 // checks if it is available in the local Docker image store. If not, it loads
145 // the image from Keep.
// On success the image id is stored in runner.ContainerConfig.Image.
146 func (runner *ContainerRunner) LoadImage() (err error) {
148 runner.CrunchLog.Printf("Fetching Docker image from collection '%s'", runner.ContainerRecord.ContainerImage)
150 var collection CollectionRecord
151 err = runner.ArvClient.Get("collections", runner.ContainerRecord.ContainerImage, nil, &collection)
153 return fmt.Errorf("While getting container image collection: %v", err)
// NOTE(review): this local variable shadows the imported manifest package for
// the rest of the function.
155 manifest := manifest.Manifest{Text: collection.ManifestText}
156 var img, imageID string
157 for ms := range manifest.StreamIter() {
// NOTE(review): index 0 is used without a visible length check — confirm a
// stream can never have zero file segments here.
158 img = ms.FileStreamSegments[0].Name
159 if !strings.HasSuffix(img, ".tar") {
160 return fmt.Errorf("First file in the container image collection does not end in .tar")
// The image id is the file name with the four-character ".tar" suffix removed.
162 imageID = img[:len(img)-4]
165 runner.CrunchLog.Printf("Using Docker image id '%s'", imageID)
// Ask Docker whether it already has the image.
167 _, err = runner.Docker.InspectImage(imageID)
169 runner.CrunchLog.Print("Loading Docker image from keep")
171 var readCloser io.ReadCloser
172 readCloser, err = runner.Kc.ManifestFileReader(manifest, img)
174 return fmt.Errorf("While creating ManifestFileReader for container image: %v", err)
// NOTE(review): no Close call on readCloser is visible in this excerpt.
177 err = runner.Docker.LoadImage(readCloser)
179 return fmt.Errorf("While loading container image into Docker: %v", err)
182 runner.CrunchLog.Print("Docker image is available")
185 runner.ContainerConfig.Image = imageID
// ArvMountCmd builds an "arv-mount" command from the given arguments and
// tracks its exit through runner.ArvMountExit. NewContainerRunner installs it
// as the default RunArvMount implementation.
// NOTE(review): much of this function is not visible in this excerpt; the
// comments below describe only the visible lines.
190 func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string) (c *exec.Cmd, err error) {
191 c = exec.Command("arv-mount", arvMountCmd...)
// A throttled logger writing to the "arv-mount" log stream.
192 nt := NewThrottledLogger(runner.NewLogWriter("arv-mount"))
201 statReadme := make(chan bool)
202 runner.ArvMountExit = make(chan error)
// Check for the by_id/README file under the mount point after a 100ms pause;
// presumably this is repeated in a loop until the mount is ready — confirm.
207 time.Sleep(100 * time.Millisecond)
208 _, err = os.Stat(fmt.Sprintf("%s/by_id/README", runner.ArvMountPoint))
// Publish the process exit status, then close the channel.
218 runner.ArvMountExit <- c.Wait()
219 close(runner.ArvMountExit)
// arv-mount exited: clear runner.ArvMount so CleanupDirs skips the unmount.
225 case err := <-runner.ArvMountExit:
226 runner.ArvMount = nil
// SetupMounts creates the keep mount point, translates the container record's
// mounts into Docker bind strings (runner.Binds) and arv-mount arguments,
// determines runner.HostOutputDir, and starts arv-mount via runner.RunArvMount.
// NOTE(review): several conditions in this function are not visible in this
// excerpt; the comments below describe only the visible lines.
234 func (runner *ContainerRunner) SetupMounts() (err error) {
235 runner.ArvMountPoint, err = runner.MkTempDir("", "keep")
237 return fmt.Errorf("While creating keep mount temp dir: %v", err)
// Register the mount point for removal by CleanupDirs.
240 runner.CleanupTempDir = append(runner.CleanupTempDir, runner.ArvMountPoint)
244 arvMountCmd := []string{"--foreground", "--allow-other", "--read-write"}
// Host paths of collection mounts, checked after arv-mount has started.
245 collectionPaths := []string{}
// bind is the path inside the container; mnt describes what to mount there.
248 for bind, mnt := range runner.ContainerRecord.Mounts {
249 if mnt.Kind == "collection" {
251 if mnt.UUID != "" && mnt.PortableDataHash != "" {
252 return fmt.Errorf("Cannot specify both 'uuid' and 'portable_data_hash' for a collection mount")
256 return fmt.Errorf("Writing to existing collections currently not permitted.")
// Existing collections are reached through the by_id directory of the mount.
259 src = fmt.Sprintf("%s/by_id/%s", runner.ArvMountPoint, mnt.UUID)
260 } else if mnt.PortableDataHash != "" {
262 return fmt.Errorf("Can never write to a collection specified by portable data hash")
264 src = fmt.Sprintf("%s/by_id/%s", runner.ArvMountPoint, mnt.PortableDataHash)
// Otherwise a numbered tmpN directory is requested from arv-mount with
// --mount-tmp.
266 src = fmt.Sprintf("%s/tmp%d", runner.ArvMountPoint, tmpcount)
267 arvMountCmd = append(arvMountCmd, "--mount-tmp")
268 arvMountCmd = append(arvMountCmd, fmt.Sprintf("tmp%d", tmpcount))
272 if bind == runner.ContainerRecord.OutputPath {
273 runner.HostOutputDir = src
// Bind strings without a suffix are read-write; ":ro" marks read-only.
275 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s", src, bind))
277 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s:ro", src, bind))
279 collectionPaths = append(collectionPaths, src)
280 } else if mnt.Kind == "tmp" {
// A "tmp" mount at the output path is backed by a host temporary directory.
281 if bind == runner.ContainerRecord.OutputPath {
282 runner.HostOutputDir, err = runner.MkTempDir("", "")
284 return fmt.Errorf("While creating mount temp dir: %v", err)
286 st, staterr := os.Stat(runner.HostOutputDir)
288 return fmt.Errorf("While Stat on temp dir: %v", staterr)
// Add the setgid bit and rwx for user, group and other to the existing mode.
290 err = os.Chmod(runner.HostOutputDir, st.Mode()|os.ModeSetgid|0777)
292 return fmt.Errorf("While Chmod temp dir: %v", err)
294 runner.CleanupTempDir = append(runner.CleanupTempDir, runner.HostOutputDir)
295 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s", runner.HostOutputDir, bind))
// Other "tmp" mounts are passed with only the container path and no host
// source.
297 runner.Binds = append(runner.Binds, bind)
300 return fmt.Errorf("Unknown mount kind '%s'", mnt.Kind)
304 if runner.HostOutputDir == "" {
305 return fmt.Errorf("Output path does not correspond to a writable mount point")
// by_id is mounted either by portable data hash or by id; the condition that
// chooses between the two is not visible in this excerpt.
309 arvMountCmd = append(arvMountCmd, "--mount-by-pdh", "by_id")
311 arvMountCmd = append(arvMountCmd, "--mount-by-id", "by_id")
// The mount point is the final arv-mount argument.
313 arvMountCmd = append(arvMountCmd, runner.ArvMountPoint)
315 runner.ArvMount, err = runner.RunArvMount(arvMountCmd)
317 return fmt.Errorf("While trying to start arv-mount: %v", err)
// Check each collection path now that arv-mount has been started.
320 for _, p := range collectionPaths {
323 return fmt.Errorf("While checking that input files exist: %v", err)
// ProcessDockerAttach reads the multiplexed Docker attach stream from
// containerReader, copies each frame's payload to runner.Stdout or
// runner.Stderr, then closes both loggers and signals runner.loggingDone.
// AttachStreams runs it in its own goroutine.
330 func (runner *ContainerRunner) ProcessDockerAttach(containerReader io.Reader) {
331 // Handle docker log protocol
332 // https://docs.docker.com/engine/reference/api/docker_remote_api_v1.15/#attach-to-a-container
// Each frame starts with an 8-byte header.
334 header := make([]byte, 8)
336 _, readerr := io.ReadAtLeast(containerReader, header, 8)
// Bytes 4..7 of the header hold the payload size as a big-endian 32-bit value.
339 readsize := int64(header[7]) | (int64(header[6]) << 8) | (int64(header[5]) << 16) | (int64(header[4]) << 24)
// The payload goes to stdout or stderr; the selecting condition is not
// visible in this excerpt (presumably the stream type in header[0] — confirm).
342 _, readerr = io.CopyN(runner.Stdout, containerReader, readsize)
345 _, readerr = io.CopyN(runner.Stderr, containerReader, readsize)
// io.EOF is the normal end of stream; anything else is logged.
350 if readerr != io.EOF {
351 runner.CrunchLog.Printf("While reading docker logs: %v", readerr)
354 closeerr := runner.Stdout.Close()
356 runner.CrunchLog.Printf("While closing stdout logs: %v", closeerr)
359 closeerr = runner.Stderr.Close()
361 runner.CrunchLog.Printf("While closing stderr logs: %v", closeerr)
// Signal that logging is finished. loggingDone is unbuffered, so this send
// blocks until a receiver is ready.
364 runner.loggingDone <- true
365 close(runner.loggingDone)
371 // AttachStreams connects the docker container stdout and stderr logs to the
372 // Arvados logger which logs to Keep and the API server logs table.
// It creates runner.loggingDone, runner.Stdout and runner.Stderr, and starts
// ProcessDockerAttach in a goroutine to consume the attached stream.
373 func (runner *ContainerRunner) AttachStreams() (err error) {
375 runner.CrunchLog.Print("Attaching container streams")
377 var containerReader io.Reader
378 containerReader, err = runner.Docker.AttachContainer(runner.ContainerID,
379 &dockerclient.AttachOptions{Stream: true, Stdout: true, Stderr: true})
381 return fmt.Errorf("While attaching container stdout/stderr streams: %v", err)
// Unbuffered: ProcessDockerAttach blocks on its send until someone receives.
384 runner.loggingDone = make(chan bool)
386 runner.Stdout = NewThrottledLogger(runner.NewLogWriter("stdout"))
387 runner.Stderr = NewThrottledLogger(runner.NewLogWriter("stderr"))
389 go runner.ProcessDockerAttach(containerReader)
394 // StartContainer creates the container and runs it.
// It holds runner.CancelLock for its whole duration, so the signal handler
// cannot mark the runner cancelled or stop the container midway through.
395 func (runner *ContainerRunner) StartContainer() (err error) {
396 runner.CrunchLog.Print("Creating Docker container")
398 runner.CancelLock.Lock()
399 defer runner.CancelLock.Unlock()
// A cancellation that arrived before the lock was taken is handled here;
// the branch body is not visible (presumably returns ErrCancelled — confirm).
401 if runner.Cancelled {
405 runner.ContainerConfig.Cmd = runner.ContainerRecord.Command
// A Cwd of "." leaves the working directory unset.
406 if runner.ContainerRecord.Cwd != "." {
407 runner.ContainerConfig.WorkingDir = runner.ContainerRecord.Cwd
409 for k, v := range runner.ContainerRecord.Environment {
410 runner.ContainerConfig.Env = append(runner.ContainerConfig.Env, k+"="+v)
412 runner.ContainerConfig.NetworkDisabled = true
413 runner.ContainerID, err = runner.Docker.CreateContainer(&runner.ContainerConfig, "", nil)
415 return fmt.Errorf("While creating container: %v", err)
// Docker's log driver is set to "none"; output is read through the attached
// streams set up below.
417 hostConfig := &dockerclient.HostConfig{Binds: runner.Binds,
418 LogConfig: dockerclient.LogConfig{Type: "none"}}
// Streams are attached before the container is started.
420 err = runner.AttachStreams()
425 runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
426 err = runner.Docker.StartContainer(runner.ContainerID, hostConfig)
428 return fmt.Errorf("While starting container: %v", err)
434 // WaitFinish waits for the container to terminate, captures the exit code, and
435 // waits for the stdout/stderr logging to complete.
436 func (runner *ContainerRunner) WaitFinish() error {
437 runner.CrunchLog.Print("Waiting for container to finish")
// Wait returns a channel that yields the container's result.
439 result := runner.Docker.Wait(runner.ContainerID)
442 return fmt.Errorf("While waiting for container to finish: %v", wr.Error)
// ExitCode is stored as a pointer; UpdateContainerRecordComplete treats nil
// as "no exit code available".
444 runner.ExitCode = &wr.ExitCode
446 // wait for stdout/stderr to complete
452 // CaptureOutput records the container's output as a collection and stores its portable data hash in runner.OutputPDH.
// The manifest is taken either from a tree upload of runner.HostOutputDir or
// from the ".arvados#collection" metafile found in that directory.
453 func (runner *ContainerRunner) CaptureOutput() error {
// The bodies of the next two checks are not visible in this excerpt.
454 if runner.finalState != "Complete" {
458 if runner.HostOutputDir == "" {
462 _, err := os.Stat(runner.HostOutputDir)
464 return fmt.Errorf("While checking host output path: %v", err)
467 var manifestText string
// Stat the ".arvados#collection" metafile in the output directory; the FUSE
// branch below opens and decodes it.
469 collectionMetafile := fmt.Sprintf("%s/.arvados#collection", runner.HostOutputDir)
470 _, err = os.Stat(collectionMetafile)
// Upload the directory tree to Keep and use the resulting manifest.
473 cw := CollectionWriter{runner.Kc, nil, sync.Mutex{}}
474 manifestText, err = cw.WriteTree(runner.HostOutputDir, runner.CrunchLog.Logger)
476 return fmt.Errorf("While uploading output files: %v", err)
479 // FUSE mount directory
480 file, openerr := os.Open(collectionMetafile)
// NOTE(review): this message formats err, but the error from os.Open above is
// held in openerr — this looks like the wrong variable; confirm and fix.
482 return fmt.Errorf("While opening FUSE metafile: %v", err)
// The metafile is JSON; only manifest_text is used from it.
486 rec := CollectionRecord{}
487 err = json.NewDecoder(file).Decode(&rec)
489 return fmt.Errorf("While reading FUSE metafile: %v", err)
491 manifestText = rec.ManifestText
// Create the output collection from the manifest.
494 var response CollectionRecord
495 err = runner.ArvClient.Create("collections",
497 "collection": arvadosclient.Dict{
498 "manifest_text": manifestText}},
501 return fmt.Errorf("While creating output collection: %v", err)
504 runner.OutputPDH = new(string)
505 *runner.OutputPDH = response.PortableDataHash
// CleanupDirs unmounts the arv-mount FUSE mount, if one is still recorded in
// runner.ArvMount, and removes every directory listed in
// runner.CleanupTempDir. Failures are logged to CrunchLog, not returned.
510 func (runner *ContainerRunner) CleanupDirs() {
511 if runner.ArvMount != nil {
// Run "fusermount -z -u" on the mount point.
512 umount := exec.Command("fusermount", "-z", "-u", runner.ArvMountPoint)
513 umnterr := umount.Run()
515 runner.CrunchLog.Printf("While running fusermount: %v", umnterr)
// Block until the arv-mount process has exited.
518 mnterr := <-runner.ArvMountExit
520 runner.CrunchLog.Printf("Arv-mount exit error: %v", mnterr)
524 for _, tmpdir := range runner.CleanupTempDir {
525 rmerr := os.RemoveAll(tmpdir)
527 runner.CrunchLog.Printf("While cleaning up temporary directory %s: %v", tmpdir, rmerr)
532 // CommitLogs posts the collection containing the final container logs.
// On success the new collection's portable data hash is stored in
// runner.LogsPDH.
533 func (runner *ContainerRunner) CommitLogs() error {
// The final state is the last entry written to the crunch log before it is
// closed.
534 runner.CrunchLog.Print(runner.finalState)
535 runner.CrunchLog.Close()
537 // Closing CrunchLog above allows it to be committed to Keep at this
538 // point, but re-open crunch log with ArvClient in case there are any
539 // other further errors (such as failing to write the log to Keep!) while
541 runner.CrunchLog = NewThrottledLogger(&ArvLogWriter{runner.ArvClient, runner.ContainerRecord.UUID,
544 mt, err := runner.LogCollection.ManifestText()
546 return fmt.Errorf("While creating log manifest: %v", err)
// Create the log collection, named after the container UUID.
549 var response CollectionRecord
550 err = runner.ArvClient.Create("collections",
552 "collection": arvadosclient.Dict{
553 "name": "logs for " + runner.ContainerRecord.UUID,
554 "manifest_text": mt}},
557 return fmt.Errorf("While creating log collection: %v", err)
560 runner.LogsPDH = new(string)
561 *runner.LogsPDH = response.PortableDataHash
566 // UpdateContainerRecordRunning updates the container state to "Running"
// on the API server and returns the error from that Update call, if any.
567 func (runner *ContainerRunner) UpdateContainerRecordRunning() error {
568 return runner.ArvClient.Update("containers", runner.ContainerRecord.UUID,
569 arvadosclient.Dict{"container": arvadosclient.Dict{"state": "Running"}}, nil)
572 // UpdateContainerRecordComplete updates the container record state on API
573 // server to "Complete" or "Cancelled"
// (whatever runner.finalState holds). The log, exit code and output are
// included only when the corresponding pointer is non-nil.
574 func (runner *ContainerRunner) UpdateContainerRecordComplete() error {
575 update := arvadosclient.Dict{}
576 if runner.LogsPDH != nil {
577 update["log"] = *runner.LogsPDH
579 if runner.ExitCode != nil {
580 update["exit_code"] = *runner.ExitCode
582 if runner.OutputPDH != nil {
// NOTE(review): this stores the *string itself, while "log" and "exit_code"
// above store dereferenced values — probably should be *runner.OutputPDH for
// consistency; confirm how the Dict is encoded and inspected.
583 update["output"] = runner.OutputPDH
586 update["state"] = runner.finalState
588 return runner.ArvClient.Update("containers", runner.ContainerRecord.UUID, arvadosclient.Dict{"container": update}, nil)
591 // NewArvLogWriter creates an ArvLogWriter
// for the named stream of this container. The writer is given the file
// "<name>.txt" opened in runner.LogCollection. NewContainerRunner installs
// this method as the runner's NewLogWriter.
592 func (runner *ContainerRunner) NewArvLogWriter(name string) io.WriteCloser {
593 return &ArvLogWriter{runner.ArvClient, runner.ContainerRecord.UUID, name, runner.LogCollection.Open(name + ".txt")}
596 // Run the full container lifecycle.
// Steps (1)-(5) fetch the record, set up signals, image and mounts, start the
// container and wait for it; steps (6)-(9) capture output, clean up, commit
// logs and update the container record.
// NOTE(review): steps (6)-(9) appear textually before step (1), presumably
// inside a deferred function whose header is not visible in this excerpt.
597 func (runner *ContainerRunner) Run() (err error) {
598 runner.CrunchLog.Printf("Executing container '%s'", runner.ContainerRecord.UUID)
600 hostname, hosterr := os.Hostname()
602 runner.CrunchLog.Printf("Error getting hostname '%v'", hosterr)
604 runner.CrunchLog.Printf("Executing on host '%s'", hostname)
607 var runerr, waiterr error
611 runner.CrunchLog.Print(err)
// The final state is chosen from the Cancelled flag.
614 if runner.Cancelled {
615 runner.finalState = "Cancelled"
617 runner.finalState = "Complete"
620 // (6) capture output
621 outputerr := runner.CaptureOutput()
622 if outputerr != nil {
623 runner.CrunchLog.Print(outputerr)
626 // (7) clean up temporary directories
630 logerr := runner.CommitLogs()
632 runner.CrunchLog.Print(logerr)
635 // (9) update container record with results
636 updateerr := runner.UpdateContainerRecordComplete()
637 if updateerr != nil {
638 runner.CrunchLog.Print(updateerr)
641 runner.CrunchLog.Close()
// An else-if chain over the errors collected above; its branch bodies are not
// visible in this excerpt.
646 } else if waiterr != nil {
648 } else if logerr != nil {
650 } else if updateerr != nil {
// Fetch the container record by UUID into runner.ContainerRecord.
656 err = runner.ArvClient.Get("containers", runner.ContainerRecord.UUID, nil, &runner.ContainerRecord)
658 return fmt.Errorf("While getting container record: %v", err)
661 // (1) setup signal handling
662 runner.SetupSignals()
664 // (2) check for and/or load image
665 err = runner.LoadImage()
667 return fmt.Errorf("While loading container image: %v", err)
670 // (3) set up FUSE mount and binds
671 err = runner.SetupMounts()
673 return fmt.Errorf("While setting up mounts: %v", err)
// NOTE(review): the step label "(3)" is used twice (above and below).
676 // (3) create and start container
677 err = runner.StartContainer()
// Cancellation during start-up is distinguished from other start errors.
679 if err == ErrCancelled {
685 // (4) update container record state
686 err = runner.UpdateContainerRecordRunning()
688 runner.CrunchLog.Print(err)
691 // (5) wait for container to finish
692 waiterr = runner.WaitFinish()
697 // NewContainerRunner creates a new container runner.
// It wires in the default factories (NewArvLogWriter, ArvMountCmd,
// ioutil.TempDir), creates the log collection writer, records containerUUID
// in the container record, and opens the "crunch-run" log.
// NOTE(review): the kc parameter used below is declared on a line that is not
// visible in this excerpt.
698 func NewContainerRunner(api IArvadosClient,
700 docker ThinDockerClient,
701 containerUUID string) *ContainerRunner {
703 cr := &ContainerRunner{ArvClient: api, Kc: kc, Docker: docker}
704 cr.NewLogWriter = cr.NewArvLogWriter
705 cr.RunArvMount = cr.ArvMountCmd
706 cr.MkTempDir = ioutil.TempDir
707 cr.LogCollection = &CollectionWriter{kc, nil, sync.Mutex{}}
708 cr.ContainerRecord.UUID = containerUUID
// LogCollection and the UUID are set above before this call because
// NewArvLogWriter reads both.
709 cr.CrunchLog = NewThrottledLogger(cr.NewLogWriter("crunch-run"))
// The Immediate logger writes to stderr, prefixed with the container UUID.
710 cr.CrunchLog.Immediate = log.New(os.Stderr, containerUUID+" ", 0)
// Fragment of the program entry point (the enclosing function header is not
// visible in this excerpt). It takes the container UUID from the first
// positional argument, builds the Arvados, Keep and Docker clients, and
// passes them to NewContainerRunner. Each log.Fatalf below is prefixed with
// the container id.
717 containerId := flag.Arg(0)
719 api, err := arvadosclient.MakeArvadosClient()
721 log.Fatalf("%s: %v", containerId, err)
725 var kc *keepclient.KeepClient
726 kc, err = keepclient.MakeKeepClient(&api)
728 log.Fatalf("%s: %v", containerId, err)
// The Docker client connects to the local daemon's unix socket.
732 var docker *dockerclient.DockerClient
733 docker, err = dockerclient.NewDockerClient("unix:///var/run/docker.sock", nil)
735 log.Fatalf("%s: %v", containerId, err)
738 cr := NewContainerRunner(api, kc, docker, containerId)
742 log.Fatalf("%s: %v", containerId, err)