8 "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
9 "git.curoverse.com/arvados.git/sdk/go/keepclient"
10 "git.curoverse.com/arvados.git/sdk/go/manifest"
11 "github.com/curoverse/dockerclient"
25 // IArvadosClient is the minimal set of Arvados API methods used by crunch-run.
// ContainerRunner stores a value of this type in its ArvClient field.
26 type IArvadosClient interface {
// Create is called in this file to post the output and log collections.
27 Create(resourceType string, parameters arvadosclient.Dict, output interface{}) error
// Get is called in this file to fetch collection and container records.
28 Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
// Update is called in this file to change fields of a container record.
29 Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) (err error)
// Call is called in this file to fetch the auth record of a container.
30 Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) (err error)
33 // ErrCancelled is the error returned when the container is cancelled.
// Run compares the error returned by StartContainer against this value.
34 var ErrCancelled = errors.New("Cancelled")
36 // IKeepClient is the minimal set of Keep API methods used by crunch-run.
// ContainerRunner passes a value of this type to CollectionWriter and uses it
// to read the Docker image file.
37 type IKeepClient interface {
// NOTE(review): PutHB is not called directly in the code shown here; presumably
// it is used through CollectionWriter - confirm.
38 PutHB(hash string, buf []byte) (string, int, error)
// ManifestFileReader is called by LoadImage to open a file named in a manifest.
39 ManifestFileReader(m manifest.Manifest, filename string) (keepclient.ReadCloserWithLen, error)
42 // Mount describes the mount points to create inside the container.
// Each field is decoded from the JSON key named in its struct tag.
// Kind is compared against the values file, collection and tmp by SetupMounts.
44 Kind string `json:"kind"`
45 Writable bool `json:"writable"`
// PortableDataHash and UUID each identify a collection; SetupMounts rejects a
// collection mount that sets both of them.
46 PortableDataHash string `json:"portable_data_hash"`
47 UUID string `json:"uuid"`
// NOTE(review): DeviceType is not read anywhere in the code shown here.
48 DeviceType string `json:"device_type"`
// Path is checked against the container OutputPath when the mount is the
// stdout mount, and AttachStreams derives the stdout file location from it.
49 Path string `json:"path"`
52 // CollectionRecord is the collection record returned by the API server.
// Only the manifest text and the portable data hash are decoded.
53 type CollectionRecord struct {
54 ManifestText string `json:"manifest_text"`
55 PortableDataHash string `json:"portable_data_hash"`
// RuntimeConstraints is the type of the RuntimeConstraints field of
// ContainerRecord, decoded from the runtime_constraints JSON key.
// NOTE(review): the fields are not visible in this excerpt; StartContainer
// reads a field named API through a pointer and tests the pointed-to value.
58 type RuntimeConstraints struct {
62 // ContainerRecord is the container record returned by the API server.
// Run fills it in by fetching the containers resource with the runner UUID.
63 type ContainerRecord struct {
64 UUID string `json:"uuid"`
// Command is copied to the Docker container config Cmd by StartContainer.
65 Command []string `json:"command"`
// ContainerImage identifies the collection that LoadImage fetches to find the
// Docker image file.
66 ContainerImage string `json:"container_image"`
// Cwd is copied to the Docker working directory unless it is a single dot.
67 Cwd string `json:"cwd"`
// Environment entries are appended to the Docker config Env as key=value.
68 Environment map[string]string `json:"environment"`
// Mounts maps a bind target (or the special key stdout) to its Mount.
69 Mounts map[string]Mount `json:"mounts"`
// OutputPath is the bind target whose mount becomes HostOutputDir.
70 OutputPath string `json:"output_path"`
71 Priority int `json:"priority"`
72 RuntimeConstraints RuntimeConstraints `json:"runtime_constraints"`
73 State string `json:"state"`
74 Output string `json:"output"`
77 // APIClientAuthorization is an arvados#api_client_authorization resource.
// ContainerToken decodes the container auth response into this type and keeps
// the APIToken value.
78 type APIClientAuthorization struct {
79 UUID string `json:"uuid"`
80 APIToken string `json:"api_token"`
83 // NewLogWriter is a factory function to create a new log writer.
// NewContainerRunner installs NewArvLogWriter as the default; the name argument
// is the log stream name, such as stdout, stderr, arv-mount or crunch-run.
84 type NewLogWriter func(name string) io.WriteCloser
// RunArvMount is the function type used to launch arv-mount with the given
// arguments and API token. NewContainerRunner installs ArvMountCmd as the
// default, and SetupMounts stores the returned command in ArvMount.
86 type RunArvMount func(args []string, tok string) (*exec.Cmd, error)
// MkTempDir is the function type used to create temporary directories.
// NewContainerRunner installs ioutil.TempDir as the default, so the two string
// arguments are the parent directory and the name prefix.
88 type MkTempDir func(string, string) (string, error)
90 // ThinDockerClient is the minimal Docker client interface used by crunch-run.
91 type ThinDockerClient interface {
// StopContainer is called by the signal handler with a timeout argument of 10.
92 StopContainer(id string, timeout int) error
// InspectImage is called by LoadImage to look up the image id locally.
93 InspectImage(id string) (*dockerclient.ImageInfo, error)
// LoadImage is called by LoadImage with a reader over the image file from Keep.
94 LoadImage(reader io.Reader) error
// CreateContainer returns the new container id, stored in ContainerID.
95 CreateContainer(config *dockerclient.ContainerConfig, name string, authConfig *dockerclient.AuthConfig) (string, error)
96 StartContainer(id string, config *dockerclient.HostConfig) error
// AttachContainer returns the combined stream read by ProcessDockerAttach.
97 AttachContainer(id string, options *dockerclient.AttachOptions) (io.ReadCloser, error)
// Wait returns a channel that WaitFinish reads the container result from.
98 Wait(id string) <-chan dockerclient.WaitResult
// NOTE(review): RemoveImage is not called anywhere in the code shown here.
99 RemoveImage(name string, force bool) ([]*dockerclient.ImageDelete, error)
102 // ContainerRunner is the main stateful struct used for a single execution of a
104 type ContainerRunner struct {
// NOTE(review): several fields used by the methods below (for example Kc,
// ContainerRecord, ContainerID, Cancelled, token and finalState) are declared
// on lines that are not visible in this excerpt.
105 Docker ThinDockerClient
106 ArvClient IArvadosClient
// The embedded Docker config is filled in by LoadImage (Image) and
// StartContainer (Cmd, WorkingDir, Env, NetworkDisabled).
109 dockerclient.ContainerConfig
// loggingDone receives true and is then closed by ProcessDockerAttach.
114 loggingDone chan bool
115 CrunchLog *ThrottledLogger
// Stdout is either a throttled logger or a file under HostOutputDir; see
// AttachStreams.
116 Stdout io.WriteCloser
117 Stderr *ThrottledLogger
// LogCollection backs the log writers and supplies the manifest in CommitLogs.
118 LogCollection *CollectionWriter
// CleanupTempDir lists the directories that CleanupDirs removes.
125 CleanupTempDir []string
// CancelLock is held by the signal handler and by StartContainer.
128 CancelLock sync.Mutex
// SigChan is registered for SIGTERM, SIGINT and SIGQUIT by SetupSignals.
130 SigChan chan os.Signal
// ArvMountExit carries the result of waiting on the arv-mount process.
131 ArvMountExit chan error
135 // SetupSignals sets up signal handling to gracefully terminate the underlying
136 // Docker container and update state when receiving a TERM, INT or QUIT signal.
137 func (runner *ContainerRunner) SetupSignals() {
// The channel is buffered with room for one pending signal.
138 runner.SigChan = make(chan os.Signal, 1)
139 signal.Notify(runner.SigChan, syscall.SIGTERM)
140 signal.Notify(runner.SigChan, syscall.SIGINT)
141 signal.Notify(runner.SigChan, syscall.SIGQUIT)
// The handler runs in its own goroutine and gets the receive side of SigChan.
143 go func(sig <-chan os.Signal) {
// NOTE(review): Cancelled is read here before CancelLock is taken on the next
// line, so this check is not protected by the lock - confirm that is intended.
145 if !runner.Cancelled {
146 runner.CancelLock.Lock()
147 runner.Cancelled = true
// Only stop the container when a container id has been recorded.
// NOTE(review): the error returned by StopContainer is discarded.
148 if runner.ContainerID != "" {
149 runner.Docker.StopContainer(runner.ContainerID, 10)
151 runner.CancelLock.Unlock()
157 // LoadImage determines the docker image id from the container record and
158 // checks if it is available in the local Docker image store. If not, it loads
159 // the image from Keep.
// On success the image id is stored in the embedded Docker container config.
160 func (runner *ContainerRunner) LoadImage() (err error) {
162 runner.CrunchLog.Printf("Fetching Docker image from collection '%s'", runner.ContainerRecord.ContainerImage)
// Fetch the collection record named by ContainerImage to get its manifest.
164 var collection CollectionRecord
165 err = runner.ArvClient.Get("collections", runner.ContainerRecord.ContainerImage, nil, &collection)
167 return fmt.Errorf("While getting container image collection: %v", err)
// NOTE(review): this local variable shadows the imported manifest package for
// the rest of the function.
169 manifest := manifest.Manifest{Text: collection.ManifestText}
170 var img, imageID string
// Take the name of the first file segment of the stream as the image file.
// NOTE(review): no length check on FileStreamSegments is visible in this
// excerpt before index 0 is used - confirm a stream cannot be empty here.
171 for ms := range manifest.StreamIter() {
172 img = ms.FileStreamSegments[0].Name
173 if !strings.HasSuffix(img, ".tar") {
174 return fmt.Errorf("First file in the container image collection does not end in .tar")
// The image id is the file name with its four character suffix removed.
176 imageID = img[:len(img)-4]
179 runner.CrunchLog.Printf("Using Docker image id '%s'", imageID)
// Ask Docker whether the image is already present locally.
181 _, err = runner.Docker.InspectImage(imageID)
183 runner.CrunchLog.Print("Loading Docker image from keep")
// Stream the image file from Keep into Docker.
// NOTE(review): no Close call on readCloser is visible in this excerpt.
185 var readCloser io.ReadCloser
186 readCloser, err = runner.Kc.ManifestFileReader(manifest, img)
188 return fmt.Errorf("While creating ManifestFileReader for container image: %v", err)
191 err = runner.Docker.LoadImage(readCloser)
193 return fmt.Errorf("While loading container image into Docker: %v", err)
196 runner.CrunchLog.Print("Docker image is available")
// Record the image id so that CreateContainer uses it.
199 runner.ContainerConfig.Image = imageID
// ArvMountCmd builds an arv-mount command with the given arguments and the
// given API token in its environment, and returns the command.
// NewContainerRunner installs it as the default RunArvMount implementation.
// NOTE(review): the lines that start the process and wire its output are not
// visible in this excerpt.
204 func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string, token string) (c *exec.Cmd, err error) {
205 c = exec.Command("arv-mount", arvMountCmd...)
207 // Copy our environment, but override ARVADOS_API_TOKEN with
208 // the container auth token.
210 for _, s := range os.Environ() {
211 if !strings.HasPrefix(s, "ARVADOS_API_TOKEN=") {
212 c.Env = append(c.Env, s)
215 c.Env = append(c.Env, "ARVADOS_API_TOKEN="+token)
// Throttled logger for the arv-mount log stream.
217 nt := NewThrottledLogger(runner.NewLogWriter("arv-mount"))
// Both channels are unbuffered.
226 statReadme := make(chan bool)
227 runner.ArvMountExit = make(chan error)
// Check for the by_id/README file under the mount point after a 100 ms sleep;
// presumably this runs in a polling loop that signals statReadme - confirm.
232 time.Sleep(100 * time.Millisecond)
233 _, err = os.Stat(fmt.Sprintf("%s/by_id/README", runner.ArvMountPoint))
// Deliver the exit status of the process on ArvMountExit, then close it.
243 runner.ArvMountExit <- c.Wait()
244 close(runner.ArvMountExit)
// If the process exit arrives here, ArvMount is cleared.
250 case err := <-runner.ArvMountExit:
251 runner.ArvMount = nil
// SetupMounts creates the keep mount point, translates the mounts of the
// container record into Docker bind strings and arv-mount arguments, starts
// arv-mount through RunArvMount, and records the host output directory.
// It returns an error for unsupported mount combinations or when no mount
// provides the output path.
259 func (runner *ContainerRunner) SetupMounts() (err error) {
// Temporary directory that arv-mount will be mounted on.
260 runner.ArvMountPoint, err = runner.MkTempDir("", "keep")
262 return fmt.Errorf("While creating keep mount temp dir: %v", err)
// Register the mount point for removal by CleanupDirs.
265 runner.CleanupTempDir = append(runner.CleanupTempDir, runner.ArvMountPoint)
269 arvMountCmd := []string{"--foreground", "--allow-other", "--read-write"}
// Host paths of collection mounts, checked for existence after arv-mount starts.
270 collectionPaths := []string{}
// bind is the map key (a path inside the container, or the key stdout).
273 for bind, mnt := range runner.ContainerRecord.Mounts {
274 if bind == "stdout" {
275 // Is it a "file" mount kind?
276 if mnt.Kind != "file" {
277 return fmt.Errorf("Unsupported mount kind '%s' for stdout. Only 'file' is supported.", mnt.Kind)
280 // Does path start with OutputPath?
281 prefix := runner.ContainerRecord.OutputPath
282 if !strings.HasSuffix(prefix, "/") {
285 if !strings.HasPrefix(mnt.Path, prefix) {
286 return fmt.Errorf("Stdout path does not start with OutputPath: %s, %s", mnt.Path, prefix)
// Collection mounts are served from below the arv-mount mount point.
290 if mnt.Kind == "collection" {
292 if mnt.UUID != "" && mnt.PortableDataHash != "" {
293 return fmt.Errorf("Cannot specify both 'uuid' and 'portable_data_hash' for a collection mount")
297 return fmt.Errorf("Writing to existing collections currently not permitted.")
// Source path for a collection given by UUID.
300 src = fmt.Sprintf("%s/by_id/%s", runner.ArvMountPoint, mnt.UUID)
301 } else if mnt.PortableDataHash != "" {
303 return fmt.Errorf("Can never write to a collection specified by portable data hash")
// Source path for a collection given by portable data hash.
305 src = fmt.Sprintf("%s/by_id/%s", runner.ArvMountPoint, mnt.PortableDataHash)
// Otherwise use a numbered tmp directory and ask arv-mount to provide it.
// NOTE(review): tmpcount is declared and updated on lines not visible here.
307 src = fmt.Sprintf("%s/tmp%d", runner.ArvMountPoint, tmpcount)
308 arvMountCmd = append(arvMountCmd, "--mount-tmp")
309 arvMountCmd = append(arvMountCmd, fmt.Sprintf("tmp%d", tmpcount))
// A collection mounted at the output path becomes the host output directory.
313 if bind == runner.ContainerRecord.OutputPath {
314 runner.HostOutputDir = src
// Two bind forms follow; the second one appends the ro option.
316 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s", src, bind))
318 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s:ro", src, bind))
320 collectionPaths = append(collectionPaths, src)
321 } else if mnt.Kind == "tmp" {
// A tmp mount at the output path gets a host temporary directory.
322 if bind == runner.ContainerRecord.OutputPath {
323 runner.HostOutputDir, err = runner.MkTempDir("", "")
325 return fmt.Errorf("While creating mount temp dir: %v", err)
// Add the setgid bit and mode 0777 to the existing mode of the directory.
327 st, staterr := os.Stat(runner.HostOutputDir)
329 return fmt.Errorf("While Stat on temp dir: %v", staterr)
331 err = os.Chmod(runner.HostOutputDir, st.Mode()|os.ModeSetgid|0777)
333 return fmt.Errorf("While Chmod temp dir: %v", err)
335 runner.CleanupTempDir = append(runner.CleanupTempDir, runner.HostOutputDir)
336 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s", runner.HostOutputDir, bind))
// Bind with only the container path and no host source.
338 runner.Binds = append(runner.Binds, bind)
// The output path must have matched one of the mounts above.
343 if runner.HostOutputDir == "" {
344 return fmt.Errorf("Output path does not correspond to a writable mount point")
// Either option exposes the by_id directory; the condition that selects
// between them is not visible in this excerpt.
348 arvMountCmd = append(arvMountCmd, "--mount-by-pdh", "by_id")
350 arvMountCmd = append(arvMountCmd, "--mount-by-id", "by_id")
// The mount point is the last argument.
352 arvMountCmd = append(arvMountCmd, runner.ArvMountPoint)
354 token, err := runner.ContainerToken()
356 return fmt.Errorf("could not get container token: %s", err)
359 runner.ArvMount, err = runner.RunArvMount(arvMountCmd, token)
361 return fmt.Errorf("While trying to start arv-mount: %v", err)
// Verify the collection source paths; the check itself is on a line not
// visible in this excerpt.
364 for _, p := range collectionPaths {
367 return fmt.Errorf("While checking that input files exist: %v", err)
// ProcessDockerAttach reads the framed stream from an attached container,
// copies each frame payload to the Stdout or Stderr writer, and, once reading
// ends, closes both writers and signals loggingDone.
// AttachStreams starts it in its own goroutine.
374 func (runner *ContainerRunner) ProcessDockerAttach(containerReader io.Reader) {
375 // Handle docker log protocol
376 // https://docs.docker.com/engine/reference/api/docker_remote_api_v1.15/#attach-to-a-container
// Each frame starts with an 8 byte header.
378 header := make([]byte, 8)
380 _, readerr := io.ReadAtLeast(containerReader, header, 8)
// Header bytes 4 to 7 hold the payload size, most significant byte first.
383 readsize := int64(header[7]) | (int64(header[6]) << 8) | (int64(header[5]) << 16) | (int64(header[4]) << 24)
// Copy exactly readsize bytes to one of the two writers; the condition that
// selects the writer is not visible in this excerpt.
386 _, readerr = io.CopyN(runner.Stdout, containerReader, readsize)
389 _, readerr = io.CopyN(runner.Stderr, containerReader, readsize)
// Any read error other than end of stream is logged.
394 if readerr != io.EOF {
395 runner.CrunchLog.Printf("While reading docker logs: %v", readerr)
398 closeerr := runner.Stdout.Close()
400 runner.CrunchLog.Printf("While closing stdout logs: %v", closeerr)
403 closeerr = runner.Stderr.Close()
405 runner.CrunchLog.Printf("While closing stderr logs: %v", closeerr)
// loggingDone is unbuffered, so this send waits for a receiver.
408 runner.loggingDone <- true
409 close(runner.loggingDone)
415 // AttachStreams connects the docker container stdout and stderr streams to the
416 // Arvados logger which logs to Keep and the API server logs table.
// When the container record has a stdout mount, stdout is written to a file
// under HostOutputDir instead of the logger.
417 func (runner *ContainerRunner) AttachStreams() (err error) {
419 runner.CrunchLog.Print("Attaching container streams")
// NOTE(review): the result is held as io.Reader although AttachContainer
// returns io.ReadCloser, so it cannot be closed through this variable.
421 var containerReader io.Reader
422 containerReader, err = runner.Docker.AttachContainer(runner.ContainerID,
423 &dockerclient.AttachOptions{Stream: true, Stdout: true, Stderr: true})
425 return fmt.Errorf("While attaching container stdout/stderr streams: %v", err)
// Unbuffered channel that ProcessDockerAttach signals when it is finished.
428 runner.loggingDone = make(chan bool)
430 if stdoutMnt, ok := runner.ContainerRecord.Mounts["stdout"]; ok {
// Strip the OutputPath prefix; SetupMounts has checked that the mount path
// starts with it.
431 stdoutPath := stdoutMnt.Path[len(runner.ContainerRecord.OutputPath):]
432 index := strings.LastIndex(stdoutPath, "/")
// Everything before the last slash is the directory part to create.
434 subdirs := stdoutPath[:index]
436 st, err := os.Stat(runner.HostOutputDir)
438 return fmt.Errorf("While Stat on temp dir: %v", err)
// This stdoutPath is a new variable scoped to the enclosing block.
440 stdoutPath := path.Join(runner.HostOutputDir, subdirs)
441 err = os.MkdirAll(stdoutPath, st.Mode()|os.ModeSetgid|0777)
443 return fmt.Errorf("While MkdirAll %q: %v", stdoutPath, err)
447 stdoutFile, err := os.Create(path.Join(runner.HostOutputDir, stdoutPath))
449 return fmt.Errorf("While creating stdout file: %v", err)
451 runner.Stdout = stdoutFile
// Without a stdout mount, stdout goes to a throttled logger.
453 runner.Stdout = NewThrottledLogger(runner.NewLogWriter("stdout"))
455 runner.Stderr = NewThrottledLogger(runner.NewLogWriter("stderr"))
// Demultiplex the attached stream in the background.
457 go runner.ProcessDockerAttach(containerReader)
462 // StartContainer creates the container and runs it.
// It holds CancelLock for the whole call, fills in the Docker config from the
// container record, attaches the output streams and then starts the container.
463 func (runner *ContainerRunner) StartContainer() (err error) {
464 runner.CrunchLog.Print("Creating Docker container")
// Held until return, so the signal handler cannot run its locked section
// while the container is being created and started.
466 runner.CancelLock.Lock()
467 defer runner.CancelLock.Unlock()
// NOTE(review): the body of this check is not visible; presumably it returns
// ErrCancelled, which Run tests for - confirm.
469 if runner.Cancelled {
473 runner.ContainerConfig.Cmd = runner.ContainerRecord.Command
// A Cwd of a single dot leaves the Docker working directory unset.
474 if runner.ContainerRecord.Cwd != "." {
475 runner.ContainerConfig.WorkingDir = runner.ContainerRecord.Cwd
478 for k, v := range runner.ContainerRecord.Environment {
479 runner.ContainerConfig.Env = append(runner.ContainerConfig.Env, k+"="+v)
// When the API runtime constraint is set and true, pass the container token
// and the API host settings of this process into the container environment.
481 if wantAPI := runner.ContainerRecord.RuntimeConstraints.API; wantAPI != nil && *wantAPI {
482 tok, err := runner.ContainerToken()
486 runner.ContainerConfig.Env = append(runner.ContainerConfig.Env,
487 "ARVADOS_API_TOKEN="+tok,
488 "ARVADOS_API_HOST="+os.Getenv("ARVADOS_API_HOST"),
489 "ARVADOS_API_HOST_INSECURE="+os.Getenv("ARVADOS_API_HOST_INSECURE"),
493 runner.ContainerConfig.NetworkDisabled = true
// Create the container with an empty name and no auth config.
494 runner.ContainerID, err = runner.Docker.CreateContainer(&runner.ContainerConfig, "", nil)
496 return fmt.Errorf("While creating container: %v", err)
// Host config carries the binds from SetupMounts and a log type of none.
498 hostConfig := &dockerclient.HostConfig{Binds: runner.Binds,
499 LogConfig: dockerclient.LogConfig{Type: "none"}}
// Streams are attached before the container is started.
501 err = runner.AttachStreams()
506 runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
507 err = runner.Docker.StartContainer(runner.ContainerID, hostConfig)
509 return fmt.Errorf("While starting container: %v", err)
515 // WaitFinish waits for the container to terminate, captures the exit code, and
516 // closes the stdout/stderr logging.
517 func (runner *ContainerRunner) WaitFinish() error {
518 runner.CrunchLog.Print("Waiting for container to finish")
// Wait returns a channel; the receive into wr is on a line not visible here.
520 result := runner.Docker.Wait(runner.ContainerID)
523 return fmt.Errorf("While waiting for container to finish: %v", wr.Error)
// ExitCode is a pointer, so a nil value means no exit code was captured.
525 runner.ExitCode = &wr.ExitCode
527 // wait for stdout/stderr to complete
533 // CaptureOutput saves the container output as a collection and records its portable data hash in OutputPDH.
// The manifest comes either from uploading the files under HostOutputDir or,
// when HostOutputDir contains a .arvados#collection file, from that file.
// Unmounting and temporary directory removal are done by CleanupDirs.
534 func (runner *ContainerRunner) CaptureOutput() error {
// NOTE(review): the bodies of the next two checks are not visible; presumably
// they return early without capturing output - confirm.
535 if runner.finalState != "Complete" {
539 if runner.HostOutputDir == "" {
543 _, err := os.Stat(runner.HostOutputDir)
545 return fmt.Errorf("While checking host output path: %v", err)
548 var manifestText string
// Look for the collection metafile in the output directory.
550 collectionMetafile := fmt.Sprintf("%s/.arvados#collection", runner.HostOutputDir)
551 _, err = os.Stat(collectionMetafile)
// Regular directory: upload the tree with a fresh CollectionWriter.
554 cw := CollectionWriter{runner.Kc, nil, sync.Mutex{}}
555 manifestText, err = cw.WriteTree(runner.HostOutputDir, runner.CrunchLog.Logger)
557 return fmt.Errorf("While uploading output files: %v", err)
560 // FUSE mount directory
561 file, openerr := os.Open(collectionMetafile)
// NOTE(review): the open error is stored in openerr, but the message below
// formats err, which still holds the result of the Stat call above - this
// looks like it should format openerr.
563 return fmt.Errorf("While opening FUSE metafile: %v", err)
// Decode the metafile as a collection record and take its manifest text.
567 rec := CollectionRecord{}
568 err = json.NewDecoder(file).Decode(&rec)
570 return fmt.Errorf("While reading FUSE metafile: %v", err)
572 manifestText = rec.ManifestText
// Create the output collection from the manifest text.
575 var response CollectionRecord
576 err = runner.ArvClient.Create("collections",
578 "collection": arvadosclient.Dict{
579 "manifest_text": manifestText}},
582 return fmt.Errorf("While creating output collection: %v", err)
// Keep the portable data hash of the new collection.
585 runner.OutputPDH = new(string)
586 *runner.OutputPDH = response.PortableDataHash
// CleanupDirs unmounts the arv-mount mount point when arv-mount was started,
// waits for the arv-mount process to exit, and removes every directory listed
// in CleanupTempDir. Failures are logged to CrunchLog; nothing is returned.
591 func (runner *ContainerRunner) CleanupDirs() {
592 if runner.ArvMount != nil {
593 umount := exec.Command("fusermount", "-z", "-u", runner.ArvMountPoint)
594 umnterr := umount.Run()
596 runner.CrunchLog.Printf("While running fusermount: %v", umnterr)
// Receive the exit status that ArvMountCmd sends on ArvMountExit.
599 mnterr := <-runner.ArvMountExit
601 runner.CrunchLog.Printf("Arv-mount exit error: %v", mnterr)
605 for _, tmpdir := range runner.CleanupTempDir {
606 rmerr := os.RemoveAll(tmpdir)
608 runner.CrunchLog.Printf("While cleaning up temporary directory %s: %v", tmpdir, rmerr)
613 // CommitLogs posts the collection containing the final container logs.
// On success the portable data hash of that collection is stored in LogsPDH.
614 func (runner *ContainerRunner) CommitLogs() error {
// The final state is the last entry written to the original crunch log.
615 runner.CrunchLog.Print(runner.finalState)
616 runner.CrunchLog.Close()
618 // Closing CrunchLog above allows it to be committed to Keep at this
619 // point, but re-open crunch log with ArvClient in case there are any
620 // other further errors (such as failing to write the log to Keep!) while
622 runner.CrunchLog = NewThrottledLogger(&ArvLogWriter{runner.ArvClient, runner.ContainerRecord.UUID,
// Manifest text covering the log files written so far.
625 mt, err := runner.LogCollection.ManifestText()
627 return fmt.Errorf("While creating log manifest: %v", err)
// Create the log collection, named after the container UUID.
630 var response CollectionRecord
631 err = runner.ArvClient.Create("collections",
633 "collection": arvadosclient.Dict{
634 "name": "logs for " + runner.ContainerRecord.UUID,
635 "manifest_text": mt}},
638 return fmt.Errorf("While creating log collection: %v", err)
641 runner.LogsPDH = new(string)
642 *runner.LogsPDH = response.PortableDataHash
647 // UpdateContainerRecordRunning updates the container state to "Running"
// on the API server and returns the error from that update call.
648 func (runner *ContainerRunner) UpdateContainerRecordRunning() error {
649 return runner.ArvClient.Update("containers", runner.ContainerRecord.UUID,
650 arvadosclient.Dict{"container": arvadosclient.Dict{"state": "Running"}}, nil)
653 // ContainerToken returns the api_token the container (and any
654 // arv-mount processes) are allowed to use.
// The token is fetched from the API server on first use and cached in the
// runner for later calls.
655 func (runner *ContainerRunner) ContainerToken() (string, error) {
// Return the cached token when one is already stored.
656 if runner.token != "" {
657 return runner.token, nil
// Request the auth record of this container.
660 var auth APIClientAuthorization
661 err := runner.ArvClient.Call("GET", "containers", runner.ContainerRecord.UUID, "auth", nil, &auth)
665 runner.token = auth.APIToken
666 return runner.token, nil
669 // UpdateContainerRecordComplete updates the container record state on API
670 // server to "Complete" or "Cancelled"
// The log, exit code and output fields are included only when the matching
// runner pointer is non-nil; the state is always set to finalState.
671 func (runner *ContainerRunner) UpdateContainerRecordComplete() error {
672 update := arvadosclient.Dict{}
673 if runner.LogsPDH != nil {
674 update["log"] = *runner.LogsPDH
676 if runner.ExitCode != nil {
677 update["exit_code"] = *runner.ExitCode
// NOTE(review): unlike log and exit_code above, output is assigned the pointer
// itself rather than the value it points to - confirm that is intended.
679 if runner.OutputPDH != nil {
680 update["output"] = runner.OutputPDH
683 update["state"] = runner.finalState
685 return runner.ArvClient.Update("containers", runner.ContainerRecord.UUID, arvadosclient.Dict{"container": update}, nil)
688 // NewArvLogWriter creates an ArvLogWriter
// for the given log name. The writer is built from the API client, the
// container UUID, the name, and a file opened in LogCollection whose name is
// the log name followed by the .txt extension.
689 func (runner *ContainerRunner) NewArvLogWriter(name string) io.WriteCloser {
690 return &ArvLogWriter{runner.ArvClient, runner.ContainerRecord.UUID, name, runner.LogCollection.Open(name + ".txt")}
693 // Run executes the full container lifecycle.
// It fetches the container record, sets up signals, image and mounts, starts
// the container and waits for it, then captures output, commits logs and
// updates the container record with the final state.
// NOTE(review): the finalisation steps appear first in the source; presumably
// they sit inside a deferred function whose opening line is not visible in
// this excerpt - confirm.
694 func (runner *ContainerRunner) Run() (err error) {
695 runner.CrunchLog.Printf("Executing container '%s'", runner.ContainerRecord.UUID)
697 hostname, hosterr := os.Hostname()
699 runner.CrunchLog.Printf("Error getting hostname '%v'", hosterr)
701 runner.CrunchLog.Printf("Executing on host '%s'", hostname)
704 var runerr, waiterr error
708 runner.CrunchLog.Print(err)
// The final state depends on whether the signal handler set Cancelled.
711 if runner.Cancelled {
712 runner.finalState = "Cancelled"
714 runner.finalState = "Complete"
717 // (6) capture output
718 outputerr := runner.CaptureOutput()
719 if outputerr != nil {
720 runner.CrunchLog.Print(outputerr)
723 // (7) clean up temporary directories
727 logerr := runner.CommitLogs()
729 runner.CrunchLog.Print(logerr)
732 // (9) update container record with results
733 updateerr := runner.UpdateContainerRecordComplete()
734 if updateerr != nil {
735 runner.CrunchLog.Print(updateerr)
738 runner.CrunchLog.Close()
// The error to report is chosen from the collected errors in this order.
743 } else if waiterr != nil {
745 } else if logerr != nil {
747 } else if updateerr != nil {
// Load the container record for the UUID given to NewContainerRunner.
753 err = runner.ArvClient.Get("containers", runner.ContainerRecord.UUID, nil, &runner.ContainerRecord)
755 return fmt.Errorf("While getting container record: %v", err)
758 // (1) setup signal handling
759 runner.SetupSignals()
761 // (2) check for and/or load image
762 err = runner.LoadImage()
764 return fmt.Errorf("While loading container image: %v", err)
767 // (3) set up FUSE mount and binds
768 err = runner.SetupMounts()
770 return fmt.Errorf("While setting up mounts: %v", err)
// NOTE(review): step number (3) is used twice in these comments.
773 // (3) create and start container
774 err = runner.StartContainer()
// Cancellation during start is distinguished from other start errors.
776 if err == ErrCancelled {
782 // (4) update container record state
783 err = runner.UpdateContainerRecordRunning()
// A failure to mark the container as running is logged.
785 runner.CrunchLog.Print(err)
788 // (5) wait for container to finish
789 waiterr = runner.WaitFinish()
794 // NewContainerRunner creates a new container runner.
// It wires in the API, Keep and Docker clients, installs the default log
// writer, arv-mount launcher and temp dir factory, and opens the crunch-run
// log for the given container UUID.
795 func NewContainerRunner(api IArvadosClient,
797 docker ThinDockerClient,
798 containerUUID string) *ContainerRunner {
800 cr := &ContainerRunner{ArvClient: api, Kc: kc, Docker: docker}
// Default implementations of the three function-typed fields.
801 cr.NewLogWriter = cr.NewArvLogWriter
802 cr.RunArvMount = cr.ArvMountCmd
803 cr.MkTempDir = ioutil.TempDir
// LogCollection must be set before the first log writer is created, because
// NewArvLogWriter opens a file in it.
804 cr.LogCollection = &CollectionWriter{kc, nil, sync.Mutex{}}
805 cr.ContainerRecord.UUID = containerUUID
806 cr.CrunchLog = NewThrottledLogger(cr.NewLogWriter("crunch-run"))
// Immediate is a logger on standard error prefixed with the container UUID.
807 cr.CrunchLog.Immediate = log.New(os.Stderr, containerUUID+" ", 0)
// NOTE(review): the enclosing function header is not visible in this excerpt;
// presumably this is the body of main - confirm.
// The first positional argument is the container UUID.
814 containerId := flag.Arg(0)
// Build the Arvados API client; failures are fatal and tagged with the UUID.
816 api, err := arvadosclient.MakeArvadosClient()
818 log.Fatalf("%s: %v", containerId, err)
// Build the Keep client on top of the API client.
822 var kc *keepclient.KeepClient
823 kc, err = keepclient.MakeKeepClient(&api)
825 log.Fatalf("%s: %v", containerId, err)
// Connect to the Docker daemon over its local unix socket.
829 var docker *dockerclient.DockerClient
830 docker, err = dockerclient.NewDockerClient("unix:///var/run/docker.sock", nil)
832 log.Fatalf("%s: %v", containerId, err)
835 cr := NewContainerRunner(api, kc, docker, containerId)
839 log.Fatalf("%s: %v", containerId, err)