8 "git.curoverse.com/arvados.git/lib/crunchstat"
9 "git.curoverse.com/arvados.git/sdk/go/arvados"
10 "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
11 "git.curoverse.com/arvados.git/sdk/go/keepclient"
12 "git.curoverse.com/arvados.git/sdk/go/manifest"
13 "github.com/curoverse/dockerclient"
29 // IArvadosClient is the minimal set of Arvados API methods used by crunch-run.
30 type IArvadosClient interface {
31 Create(resourceType string, parameters arvadosclient.Dict, output interface{}) error
32 Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
33 Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
34 Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) error
35 Discovery(key string) (interface{}, error)
38 // ErrCancelled is the sentinel error value used to report that the container
// was cancelled. It is created once with errors.New, so it can be compared
// against directly.
39 var ErrCancelled = errors.New("Cancelled")
41 // IKeepClient is the minimal set of Keep API methods used by crunch-run.
42 type IKeepClient interface {
43 PutHB(hash string, buf []byte) (string, int, error)
44 ManifestFileReader(m manifest.Manifest, filename string) (keepclient.Reader, error)
47 // NewLogWriter is the type of a factory function that creates a log writer
// for the log stream with the given name. Within this file it is called with
// names such as "crunch-run", "stdout", "stderr", "crunchstat" and "arv-mount".
// The result is an io.WriteCloser, so the stream can be closed when it ends.
48 type NewLogWriter func(name string) io.WriteCloser
// RunArvMount is the type of the function used to launch arv-mount. It
// receives the arv-mount command line arguments and the API token the process
// should run with, and returns the resulting command. NewContainerRunner
// installs ContainerRunner.ArvMountCmd as the default implementation.
50 type RunArvMount func(args []string, tok string) (*exec.Cmd, error)
// MkTempDir is the type of the function used to create temporary directories.
// Its signature matches ioutil.TempDir, which NewContainerRunner installs as
// the default: the arguments are the parent directory and a name prefix, and
// the result is the path of the newly created directory.
52 type MkTempDir func(string, string) (string, error)
54 // ThinDockerClient is the minimal Docker client interface used by crunch-run.
55 type ThinDockerClient interface {
56 StopContainer(id string, timeout int) error
57 InspectImage(id string) (*dockerclient.ImageInfo, error)
58 LoadImage(reader io.Reader) error
59 CreateContainer(config *dockerclient.ContainerConfig, name string, authConfig *dockerclient.AuthConfig) (string, error)
60 StartContainer(id string, config *dockerclient.HostConfig) error
61 AttachContainer(id string, options *dockerclient.AttachOptions) (io.ReadCloser, error)
62 Wait(id string) <-chan dockerclient.WaitResult
63 RemoveImage(name string, force bool) ([]*dockerclient.ImageDelete, error)
66 // ContainerRunner is the main stateful struct used for a single execution of a
68 type ContainerRunner struct {
69 Docker ThinDockerClient
70 ArvClient IArvadosClient
73 dockerclient.ContainerConfig
74 dockerclient.HostConfig
80 CrunchLog *ThrottledLogger
82 Stderr *ThrottledLogger
83 LogCollection *CollectionWriter
90 CleanupTempDir []string
95 SigChan chan os.Signal
96 ArvMountExit chan error
98 trashLifetime time.Duration
100 statLogger io.WriteCloser
101 statReporter *crunchstat.Reporter
102 statInterval time.Duration
104 // What we expect the container's cgroup parent to be.
105 expectCgroupParent string
106 // What we tell docker to use as the container's cgroup
107 // parent. Note: Ideally we would use the same field for both
108 // expectCgroupParent and setCgroupParent, and just make it
109 // default to "docker". However, when using docker < 1.10 with
110 // systemd, specifying a non-empty cgroup parent (even the
111 // default value "docker") hits a docker bug
112 // (https://github.com/docker/docker/issues/17126). Using two
113 // separate fields makes it possible to use the "expect cgroup
114 // parent to be X" feature even on sites where the "specify
115 // cgroup parent" feature breaks.
116 setCgroupParent string
119 // SetupSignals sets up signal handling to gracefully terminate the underlying
120 // Docker container and update state when receiving a TERM, INT or QUIT signal.
121 func (runner *ContainerRunner) SetupSignals() {
122 runner.SigChan = make(chan os.Signal, 1)
123 signal.Notify(runner.SigChan, syscall.SIGTERM)
124 signal.Notify(runner.SigChan, syscall.SIGINT)
125 signal.Notify(runner.SigChan, syscall.SIGQUIT)
127 go func(sig <-chan os.Signal) {
129 if !runner.Cancelled {
130 runner.CancelLock.Lock()
131 runner.Cancelled = true
132 if runner.ContainerID != "" {
133 runner.Docker.StopContainer(runner.ContainerID, 10)
135 runner.CancelLock.Unlock()
141 // LoadImage determines the docker image id from the container record and
142 // checks if it is available in the local Docker image store. If not, it loads
143 // the image from Keep.
144 func (runner *ContainerRunner) LoadImage() (err error) {
146 runner.CrunchLog.Printf("Fetching Docker image from collection '%s'", runner.Container.ContainerImage)
148 var collection arvados.Collection
149 err = runner.ArvClient.Get("collections", runner.Container.ContainerImage, nil, &collection)
151 return fmt.Errorf("While getting container image collection: %v", err)
153 manifest := manifest.Manifest{Text: collection.ManifestText}
154 var img, imageID string
155 for ms := range manifest.StreamIter() {
156 img = ms.FileStreamSegments[0].Name
157 if !strings.HasSuffix(img, ".tar") {
158 return fmt.Errorf("First file in the container image collection does not end in .tar")
160 imageID = img[:len(img)-4]
163 runner.CrunchLog.Printf("Using Docker image id '%s'", imageID)
165 _, err = runner.Docker.InspectImage(imageID)
167 runner.CrunchLog.Print("Loading Docker image from keep")
169 var readCloser io.ReadCloser
170 readCloser, err = runner.Kc.ManifestFileReader(manifest, img)
172 return fmt.Errorf("While creating ManifestFileReader for container image: %v", err)
175 err = runner.Docker.LoadImage(readCloser)
177 return fmt.Errorf("While loading container image into Docker: %v", err)
180 runner.CrunchLog.Print("Docker image is available")
183 runner.ContainerConfig.Image = imageID
// ArvMountCmd builds an "arv-mount" command with the given arguments. The
// command gets a copy of the current process environment in which any
// ARVADOS_API_TOKEN entry is replaced by token. The result of waiting for the
// command is sent on runner.ArvMountExit, which is then closed.
//
// NOTE(review): parts of this function are not visible in this listing. It
// looks like it polls for <ArvMountPoint>/by_id/README every 100ms to detect
// that the mount is ready, and returns an error if arv-mount exits first --
// confirm against the full source.
188 func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string, token string) (c *exec.Cmd, err error) {
189 c = exec.Command("arv-mount", arvMountCmd...)
191 // Copy our environment, but override ARVADOS_API_TOKEN with
192 // the container auth token.
194 for _, s := range os.Environ() {
195 if !strings.HasPrefix(s, "ARVADOS_API_TOKEN=") {
196 c.Env = append(c.Env, s)
199 c.Env = append(c.Env, "ARVADOS_API_TOKEN="+token)
// arv-mount's own output goes to a throttled logger for the "arv-mount" stream.
201 nt := NewThrottledLogger(runner.NewLogWriter("arv-mount"))
// statReadme presumably signals that the README became visible; its use is
// not shown here -- TODO confirm.
210 statReadme := make(chan bool)
211 runner.ArvMountExit = make(chan error)
216 time.Sleep(100 * time.Millisecond)
217 _, err = os.Stat(fmt.Sprintf("%s/by_id/README", runner.ArvMountPoint))
// Deliver the process exit status exactly once, then close the channel.
227 runner.ArvMountExit <- c.Wait()
228 close(runner.ArvMountExit)
234 case err := <-runner.ArvMountExit:
235 runner.ArvMount = nil
// SetupArvMountPoint makes sure runner.ArvMountPoint is set. When it is empty,
// a new temporary directory is requested from runner.MkTempDir with prefix as
// the name prefix, and the result (and any error) of that call is stored in
// runner.ArvMountPoint and err.
243 func (runner *ContainerRunner) SetupArvMountPoint(prefix string) (err error) {
244 if runner.ArvMountPoint == "" {
245 runner.ArvMountPoint, err = runner.MkTempDir("", prefix)
// SetupMounts prepares everything the container needs on the filesystem side:
//
//   - creates the keep mount point (registered in CleanupTempDir) and builds
//     the arv-mount command line, adding --file-cache when KeepCacheRAM > 0;
//   - walks runner.Container.Mounts and translates each entry into a docker
//     bind in runner.Binds ("collection", "tmp" and "json" kinds are handled
//     in the visible code), rejecting unsupported combinations with an error;
//   - validates a "stdout" mount: it must be of kind "file" and its path must
//     lie under the container's OutputPath;
//   - requires that some mount provides a writable output directory
//     (runner.HostOutputDir);
//   - when the container wants API access and no mount was given for
//     /etc/arvados/ca-certificates.crt, binds a file from
//     arvadosclient.CertFiles at that location read-only;
//   - starts arv-mount through runner.RunArvMount using the container token,
//     then checks the collection paths collected along the way.
//
// NOTE(review): several guard conditions and declarations (e.g. binds, src,
// tmpcount) are not visible in this listing; the summary above covers only
// what the visible lines show.
250 func (runner *ContainerRunner) SetupMounts() (err error) {
251 err = runner.SetupArvMountPoint("keep")
253 return fmt.Errorf("While creating keep mount temp dir: %v", err)
256 runner.CleanupTempDir = append(runner.CleanupTempDir, runner.ArvMountPoint)
260 arvMountCmd := []string{"--foreground", "--allow-other", "--read-write"}
262 if runner.Container.RuntimeConstraints.KeepCacheRAM > 0 {
263 arvMountCmd = append(arvMountCmd, "--file-cache", fmt.Sprintf("%d", runner.Container.RuntimeConstraints.KeepCacheRAM))
// Host paths of collection mounts; checked after arv-mount has started.
266 collectionPaths := []string{}
// Cleared below when the container record mounts the CA certificates itself.
268 needCertMount := true
271 for bind, _ := range runner.Container.Mounts {
272 binds = append(binds, bind)
276 for _, bind := range binds {
277 mnt := runner.Container.Mounts[bind]
278 if bind == "stdout" {
279 // Is it a "file" mount kind?
280 if mnt.Kind != "file" {
281 return fmt.Errorf("Unsupported mount kind '%s' for stdout. Only 'file' is supported.", mnt.Kind)
284 // Does path start with OutputPath?
285 prefix := runner.Container.OutputPath
286 if !strings.HasSuffix(prefix, "/") {
289 if !strings.HasPrefix(mnt.Path, prefix) {
290 return fmt.Errorf("Stdout path does not start with OutputPath: %s, %s", mnt.Path, prefix)
294 if bind == "/etc/arvados/ca-certificates.crt" {
295 needCertMount = false
298 if strings.HasPrefix(bind, runner.Container.OutputPath+"/") && bind != runner.Container.OutputPath+"/" {
299 if mnt.Kind != "collection" {
300 return fmt.Errorf("Only mount points of kind 'collection' are supported underneath the output_path: %v", bind)
305 case mnt.Kind == "collection":
307 if mnt.UUID != "" && mnt.PortableDataHash != "" {
308 return fmt.Errorf("Cannot specify both 'uuid' and 'portable_data_hash' for a collection mount")
312 return fmt.Errorf("Writing to existing collections currently not permitted.")
315 src = fmt.Sprintf("%s/by_id/%s", runner.ArvMountPoint, mnt.UUID)
316 } else if mnt.PortableDataHash != "" {
318 return fmt.Errorf("Can never write to a collection specified by portable data hash")
320 src = fmt.Sprintf("%s/by_id/%s", runner.ArvMountPoint, mnt.PortableDataHash)
322 src = fmt.Sprintf("%s/tmp%d", runner.ArvMountPoint, tmpcount)
323 arvMountCmd = append(arvMountCmd, "--mount-tmp")
324 arvMountCmd = append(arvMountCmd, fmt.Sprintf("tmp%d", tmpcount))
328 if bind == runner.Container.OutputPath {
329 runner.HostOutputDir = src
330 } else if strings.HasPrefix(bind, runner.Container.OutputPath+"/") {
331 return fmt.Errorf("Writable mount points are not permitted underneath the output_path: %v", bind)
333 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s", src, bind))
335 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s:ro", src, bind))
337 collectionPaths = append(collectionPaths, src)
339 case mnt.Kind == "tmp" && bind == runner.Container.OutputPath:
340 runner.HostOutputDir, err = runner.MkTempDir("", "")
342 return fmt.Errorf("While creating mount temp dir: %v", err)
344 st, staterr := os.Stat(runner.HostOutputDir)
346 return fmt.Errorf("While Stat on temp dir: %v", staterr)
// Open the output directory to all users and set the setgid bit on it.
348 err = os.Chmod(runner.HostOutputDir, st.Mode()|os.ModeSetgid|0777)
350 return fmt.Errorf("While Chmod temp dir: %v", err)
352 runner.CleanupTempDir = append(runner.CleanupTempDir, runner.HostOutputDir)
353 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s", runner.HostOutputDir, bind))
355 case mnt.Kind == "tmp":
356 runner.Binds = append(runner.Binds, bind)
358 case mnt.Kind == "json":
359 jsondata, err := json.Marshal(mnt.Content)
361 return fmt.Errorf("encoding json data: %v", err)
363 // Create a tempdir with a single file
364 // (instead of just a tempfile): this way we
365 // can ensure the file is world-readable
366 // inside the container, without having to
367 // make it world-readable on the docker host.
368 tmpdir, err := runner.MkTempDir("", "")
370 return fmt.Errorf("creating temp dir: %v", err)
372 runner.CleanupTempDir = append(runner.CleanupTempDir, tmpdir)
373 tmpfn := filepath.Join(tmpdir, "mountdata.json")
374 err = ioutil.WriteFile(tmpfn, jsondata, 0644)
376 return fmt.Errorf("writing temp file: %v", err)
378 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s:ro", tmpfn, bind))
382 if runner.HostOutputDir == "" {
383 return fmt.Errorf("Output path does not correspond to a writable mount point")
386 if wantAPI := runner.Container.RuntimeConstraints.API; needCertMount && wantAPI != nil && *wantAPI {
387 for _, certfile := range arvadosclient.CertFiles {
388 _, err := os.Stat(certfile)
390 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:/etc/arvados/ca-certificates.crt:ro", certfile))
397 arvMountCmd = append(arvMountCmd, "--mount-by-pdh", "by_id")
399 arvMountCmd = append(arvMountCmd, "--mount-by-id", "by_id")
401 arvMountCmd = append(arvMountCmd, runner.ArvMountPoint)
403 token, err := runner.ContainerToken()
405 return fmt.Errorf("could not get container token: %s", err)
408 runner.ArvMount, err = runner.RunArvMount(arvMountCmd, token)
410 return fmt.Errorf("While trying to start arv-mount: %v", err)
413 for _, p := range collectionPaths {
416 return fmt.Errorf("While checking that input files exist: %v", err)
// ProcessDockerAttach reads the multiplexed docker attach stream from
// containerReader and copies each frame's payload to runner.Stdout or
// runner.Stderr. Every frame starts with an 8-byte header; the payload length
// is taken from header bytes 4-7.
//
// Once reading stops, a read error other than io.EOF is written to
// runner.CrunchLog. The function then closes runner.Stdout and runner.Stderr,
// stops the crunchstat reporter and closes its logger if one was started, and
// finally signals completion by sending on and closing runner.loggingDone.
//
// NOTE(review): how the header selects stdout versus stderr is not visible in
// this listing.
423 func (runner *ContainerRunner) ProcessDockerAttach(containerReader io.Reader) {
424 // Handle docker log protocol
425 // https://docs.docker.com/engine/reference/api/docker_remote_api_v1.15/#attach-to-a-container
427 header := make([]byte, 8)
429 _, readerr := io.ReadAtLeast(containerReader, header, 8)
// Bytes 4-7 of the header hold the payload size, most significant byte first.
432 readsize := int64(header[7]) | (int64(header[6]) << 8) | (int64(header[5]) << 16) | (int64(header[4]) << 24)
435 _, readerr = io.CopyN(runner.Stdout, containerReader, readsize)
438 _, readerr = io.CopyN(runner.Stderr, containerReader, readsize)
// io.EOF is not reported by the log call below; any other read error is.
443 if readerr != io.EOF {
444 runner.CrunchLog.Printf("While reading docker logs: %v", readerr)
447 closeerr := runner.Stdout.Close()
449 runner.CrunchLog.Printf("While closing stdout logs: %v", closeerr)
452 closeerr = runner.Stderr.Close()
454 runner.CrunchLog.Printf("While closing stderr logs: %v", closeerr)
457 if runner.statReporter != nil {
458 runner.statReporter.Stop()
459 closeerr = runner.statLogger.Close()
461 runner.CrunchLog.Printf("While closing crunchstat logs: %v", closeerr)
// Tell the waiter that all container output has been flushed.
465 runner.loggingDone <- true
466 close(runner.loggingDone)
// StartCrunchstat creates a throttled logger for the "crunchstat" log stream
// and starts a crunchstat.Reporter that writes to it. The reporter is
// configured with the container ID, the expected cgroup parent, the cgroup
// root and runner.statInterval as its polling period. Both the logger and the
// reporter are stored on the runner; ProcessDockerAttach stops the reporter
// and closes the logger.
472 func (runner *ContainerRunner) StartCrunchstat() {
473 runner.statLogger = NewThrottledLogger(runner.NewLogWriter("crunchstat"))
474 runner.statReporter = &crunchstat.Reporter{
475 CID: runner.ContainerID,
476 Logger: log.New(runner.statLogger, "", 0),
477 CgroupParent: runner.expectCgroupParent,
478 CgroupRoot: runner.cgroupRoot,
479 PollPeriod: runner.statInterval,
481 runner.statReporter.Start()
484 // AttachStreams connects the docker container stdout and stderr logs to the
485 // Arvados logger which logs to Keep and the API server logs table.
486 func (runner *ContainerRunner) AttachStreams() (err error) {
488 runner.CrunchLog.Print("Attaching container streams")
490 var containerReader io.Reader
491 containerReader, err = runner.Docker.AttachContainer(runner.ContainerID,
492 &dockerclient.AttachOptions{Stream: true, Stdout: true, Stderr: true})
494 return fmt.Errorf("While attaching container stdout/stderr streams: %v", err)
497 runner.loggingDone = make(chan bool)
499 if stdoutMnt, ok := runner.Container.Mounts["stdout"]; ok {
500 stdoutPath := stdoutMnt.Path[len(runner.Container.OutputPath):]
501 index := strings.LastIndex(stdoutPath, "/")
503 subdirs := stdoutPath[:index]
505 st, err := os.Stat(runner.HostOutputDir)
507 return fmt.Errorf("While Stat on temp dir: %v", err)
509 stdoutPath := path.Join(runner.HostOutputDir, subdirs)
510 err = os.MkdirAll(stdoutPath, st.Mode()|os.ModeSetgid|0777)
512 return fmt.Errorf("While MkdirAll %q: %v", stdoutPath, err)
516 stdoutFile, err := os.Create(path.Join(runner.HostOutputDir, stdoutPath))
518 return fmt.Errorf("While creating stdout file: %v", err)
520 runner.Stdout = stdoutFile
522 runner.Stdout = NewThrottledLogger(runner.NewLogWriter("stdout"))
524 runner.Stderr = NewThrottledLogger(runner.NewLogWriter("stderr"))
526 go runner.ProcessDockerAttach(containerReader)
531 // CreateContainer creates the docker container.
532 func (runner *ContainerRunner) CreateContainer() error {
533 runner.CrunchLog.Print("Creating Docker container")
535 runner.ContainerConfig.Cmd = runner.Container.Command
536 if runner.Container.Cwd != "." {
537 runner.ContainerConfig.WorkingDir = runner.Container.Cwd
540 for k, v := range runner.Container.Environment {
541 runner.ContainerConfig.Env = append(runner.ContainerConfig.Env, k+"="+v)
543 if wantAPI := runner.Container.RuntimeConstraints.API; wantAPI != nil && *wantAPI {
544 tok, err := runner.ContainerToken()
548 runner.ContainerConfig.Env = append(runner.ContainerConfig.Env,
549 "ARVADOS_API_TOKEN="+tok,
550 "ARVADOS_API_HOST="+os.Getenv("ARVADOS_API_HOST"),
551 "ARVADOS_API_HOST_INSECURE="+os.Getenv("ARVADOS_API_HOST_INSECURE"),
553 runner.ContainerConfig.NetworkDisabled = false
555 runner.ContainerConfig.NetworkDisabled = true
559 runner.ContainerID, err = runner.Docker.CreateContainer(&runner.ContainerConfig, "", nil)
561 return fmt.Errorf("While creating container: %v", err)
564 runner.HostConfig = dockerclient.HostConfig{
566 CgroupParent: runner.setCgroupParent,
567 LogConfig: dockerclient.LogConfig{
572 return runner.AttachStreams()
575 // StartContainer starts the docker container created by CreateContainer.
576 func (runner *ContainerRunner) StartContainer() error {
577 runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
578 err := runner.Docker.StartContainer(runner.ContainerID, &runner.HostConfig)
580 return fmt.Errorf("could not start container: %v", err)
585 // WaitFinish waits for the container to terminate, captures the exit code, and
586 // closes the stdout/stderr logging.
587 func (runner *ContainerRunner) WaitFinish() error {
588 runner.CrunchLog.Print("Waiting for container to finish")
590 result := runner.Docker.Wait(runner.ContainerID)
593 return fmt.Errorf("While waiting for container to finish: %v", wr.Error)
595 runner.ExitCode = &wr.ExitCode
597 // wait for stdout/stderr to complete
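603 // CaptureOutput sets the output: it records the portable data hash of the
// container's output collection in runner.OutputPDH. Unmounting the FUSE mount
// and deleting temporary directories is done separately by CleanupDirs.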
604 func (runner *ContainerRunner) CaptureOutput() error {
605 if runner.finalState != "Complete" {
609 if wantAPI := runner.Container.RuntimeConstraints.API; wantAPI != nil && *wantAPI {
610 // Output may have been set directly by the container, so
611 // refresh the container record to check.
612 err := runner.ArvClient.Get("containers", runner.Container.UUID,
613 nil, &runner.Container)
617 if runner.Container.Output != "" {
618 // Container output is already set.
619 runner.OutputPDH = &runner.Container.Output
624 if runner.HostOutputDir == "" {
628 _, err := os.Stat(runner.HostOutputDir)
630 return fmt.Errorf("While checking host output path: %v", err)
633 var manifestText string
635 collectionMetafile := fmt.Sprintf("%s/.arvados#collection", runner.HostOutputDir)
636 _, err = os.Stat(collectionMetafile)
639 cw := CollectionWriter{runner.Kc, nil, sync.Mutex{}}
640 manifestText, err = cw.WriteTree(runner.HostOutputDir, runner.CrunchLog.Logger)
642 return fmt.Errorf("While uploading output files: %v", err)
645 // FUSE mount directory
646 file, openerr := os.Open(collectionMetafile)
648 return fmt.Errorf("While opening FUSE metafile: %v", err)
652 var rec arvados.Collection
653 err = json.NewDecoder(file).Decode(&rec)
655 return fmt.Errorf("While reading FUSE metafile: %v", err)
657 manifestText = rec.ManifestText
660 // Pre-populate output from the configured mount points
662 for bind, _ := range runner.Container.Mounts {
663 binds = append(binds, bind)
667 for _, bind := range binds {
668 mnt := runner.Container.Mounts[bind]
670 bindSuffix := strings.TrimPrefix(bind, runner.Container.OutputPath)
672 if bindSuffix == bind || len(bindSuffix) <= 0 {
673 // either does not start with OutputPath or is OutputPath itself
677 if strings.HasPrefix(bindSuffix, "/") == false {
678 bindSuffix = "/" + bindSuffix
681 if mnt.ExcludeFromOutput == true {
685 idx := strings.Index(mnt.PortableDataHash, "/")
687 mnt.Path = mnt.PortableDataHash[idx:]
688 mnt.PortableDataHash = mnt.PortableDataHash[0:idx]
691 // append to manifest_text
692 m, err := runner.getCollectionManifestForPath(mnt, bindSuffix)
697 manifestText = manifestText + m
701 var response arvados.Collection
702 err = runner.ArvClient.Create("collections",
704 "collection": arvadosclient.Dict{
705 "trash_at": time.Now().Add(runner.trashLifetime).Format(time.RFC3339),
706 "name": "output for " + runner.Container.UUID,
707 "manifest_text": manifestText}},
710 return fmt.Errorf("While creating output collection: %v", err)
712 runner.OutputPDH = &response.PortableDataHash
// outputCollections caches collection records fetched by
// getCollectionManifestForPath, keyed by portable data hash, so that each
// collection is requested from the API server at most once.
// NOTE(review): this is package-level state with no locking visible here --
// confirm it is only accessed from a single goroutine.
716 var outputCollections = make(map[string]arvados.Collection)
718 // Fetch the collection for the mnt.PortableDataHash
719 // Return the manifest_text fragment corresponding to the specified mnt.Path
720 // after making any required updates.
722 // If mnt.Path is not specified,
723 // return the entire manifest_text after replacing any "." with bindSuffix
724 // If mnt.Path corresponds to one stream,
725 // return the manifest_text for that stream after replacing that stream name with bindSuffix
726 // Otherwise, check if a filename in any one stream is being sought. Return the manifest_text
727 // for that stream after replacing stream name with bindSuffix minus the last word
728 // and the file name with last word of the bindSuffix
729 // Allowed path examples:
732 // "path":"/subdir1/subdir2"
733 // "path":"/subdir/filename" etc
734 func (runner *ContainerRunner) getCollectionManifestForPath(mnt arvados.Mount, bindSuffix string) (string, error) {
735 collection := outputCollections[mnt.PortableDataHash]
736 if collection.PortableDataHash == "" {
737 err := runner.ArvClient.Get("collections", mnt.PortableDataHash, nil, &collection)
739 return "", fmt.Errorf("While getting collection for %v: %v", mnt.PortableDataHash, err)
741 outputCollections[mnt.PortableDataHash] = collection
744 if collection.ManifestText == "" {
745 runner.CrunchLog.Printf("No manifest text for collection %v", collection.PortableDataHash)
749 manifest := manifest.Manifest{Text: collection.ManifestText}
750 manifestText := manifest.NormalizedManifestForPath(mnt.Path)
752 if manifestText == "" {
753 // It could be denormalized manifest
754 mntPath := strings.Trim(mnt.Path, "/")
755 manifestText = strings.Replace(collection.ManifestText, "./", "."+bindSuffix+"/", -1)
756 manifestText = strings.Replace(manifestText, ". ", "."+bindSuffix+" ", -1)
758 for _, stream := range strings.Split(manifestText, "\n") {
759 if strings.Index(stream, mntPath) == -1 {
763 for _, token := range strings.Split(manifestText, " ") {
764 if strings.Index(token, ":") == -1 {
765 wanted += " " + token
766 } else if strings.Index(token, ":"+mntPath) >= 0 {
767 wanted += " " + token + "\n"
775 if mnt.Path == "" || mnt.Path == "/" {
776 // no path specified; return the entire manifest text after making adjustments
777 manifestText = strings.Replace(manifestText, "./", "."+bindSuffix+"/", -1)
778 manifestText = strings.Replace(manifestText, ". ", "."+bindSuffix+" ", -1)
780 // either a single stream or file from a stream is being sought
781 bindIdx := strings.LastIndex(bindSuffix, "/")
782 var bindSubdir, bindFileName string
784 bindSubdir = "." + bindSuffix[0:bindIdx]
785 bindFileName = bindSuffix[bindIdx+1:]
788 if strings.HasSuffix(mntPath, "/") {
789 mntPath = mntPath[0 : len(mntPath)-1]
791 pathIdx := strings.LastIndex(mntPath, "/")
792 var pathSubdir, pathFileName string
794 pathSubdir = "." + mntPath[0:pathIdx]
795 pathFileName = mntPath[pathIdx+1:]
798 if strings.Index(manifestText, "."+mntPath+" ") != -1 {
799 // path refers to this complete stream
800 manifestText = strings.Replace(manifestText, "."+mntPath, "."+bindSuffix, -1)
802 // look for a matching file in this stream
803 manifestText = strings.Replace(manifestText, ":"+pathFileName, ":"+bindFileName, -1)
804 manifestText = strings.Replace(manifestText, pathSubdir, bindSubdir, -1)
808 if manifestText == "" {
809 runner.CrunchLog.Printf("No manifest segment found for bind '%v' with path '%v'", bindSuffix, mnt.Path)
812 return manifestText, nil
// loadDiscoveryVars reads "defaultTrashLifetime" from the API discovery
// document and stores it, interpreted as a number of seconds, in
// runner.trashLifetime. The value is type-asserted to float64 without a check,
// so a value of any other type causes a panic. The log.Fatalf call terminates
// the process when the lookup fails (its guarding condition is not visible in
// this listing).
815 func (runner *ContainerRunner) loadDiscoveryVars() {
816 tl, err := runner.ArvClient.Discovery("defaultTrashLifetime")
818 log.Fatalf("getting defaultTrashLifetime from discovery document: %s", err)
820 runner.trashLifetime = time.Duration(tl.(float64)) * time.Second
// CleanupDirs releases the filesystem resources created for the run. If an
// arv-mount process was started, it unmounts runner.ArvMountPoint with
// "fusermount -z -u" and waits for arv-mount to exit. It then removes every
// directory listed in runner.CleanupTempDir. Failures are written to
// runner.CrunchLog; the function itself returns nothing.
823 func (runner *ContainerRunner) CleanupDirs() {
824 if runner.ArvMount != nil {
825 umount := exec.Command("fusermount", "-z", "-u", runner.ArvMountPoint)
826 umnterr := umount.Run()
828 runner.CrunchLog.Printf("While running fusermount: %v", umnterr)
// Block until arv-mount has exited; ArvMountCmd sends the result of c.Wait()
// on this channel.
831 mnterr := <-runner.ArvMountExit
833 runner.CrunchLog.Printf("Arv-mount exit error: %v", mnterr)
837 for _, tmpdir := range runner.CleanupTempDir {
838 rmerr := os.RemoveAll(tmpdir)
840 runner.CrunchLog.Printf("While cleaning up temporary directory %s: %v", tmpdir, rmerr)
845 // CommitLogs posts the collection containing the final container logs.
846 func (runner *ContainerRunner) CommitLogs() error {
847 runner.CrunchLog.Print(runner.finalState)
848 runner.CrunchLog.Close()
850 // Closing CrunchLog above allows it to be committed to Keep at this
851 // point, but re-open crunch log with ArvClient in case there are any
852 // other further errors (such as failing to write the log to Keep!) while
854 runner.CrunchLog = NewThrottledLogger(&ArvLogWriter{runner.ArvClient, runner.Container.UUID,
857 if runner.LogsPDH != nil {
858 // If we have already assigned something to LogsPDH,
859 // we must be closing the re-opened log, which won't
860 // end up getting attached to the container record and
861 // therefore doesn't need to be saved as a collection
862 // -- it exists only to send logs to other channels.
866 mt, err := runner.LogCollection.ManifestText()
868 return fmt.Errorf("While creating log manifest: %v", err)
871 var response arvados.Collection
872 err = runner.ArvClient.Create("collections",
874 "collection": arvadosclient.Dict{
875 "trash_at": time.Now().Add(runner.trashLifetime).Format(time.RFC3339),
876 "name": "logs for " + runner.Container.UUID,
877 "manifest_text": mt}},
880 return fmt.Errorf("While creating log collection: %v", err)
882 runner.LogsPDH = &response.PortableDataHash
886 // UpdateContainerRunning updates the container state to "Running"
887 func (runner *ContainerRunner) UpdateContainerRunning() error {
888 runner.CancelLock.Lock()
889 defer runner.CancelLock.Unlock()
890 if runner.Cancelled {
893 return runner.ArvClient.Update("containers", runner.Container.UUID,
894 arvadosclient.Dict{"container": arvadosclient.Dict{"state": "Running"}}, nil)
897 // ContainerToken returns the api_token the container (and any
898 // arv-mount processes) are allowed to use.
899 func (runner *ContainerRunner) ContainerToken() (string, error) {
900 if runner.token != "" {
901 return runner.token, nil
904 var auth arvados.APIClientAuthorization
905 err := runner.ArvClient.Call("GET", "containers", runner.Container.UUID, "auth", nil, &auth)
909 runner.token = auth.APIToken
910 return runner.token, nil
913 // UpdateContainerFinal updates the container record state on API
914 // server to runner.finalState (e.g. "Complete" or "Cancelled")
915 func (runner *ContainerRunner) UpdateContainerFinal() error {
916 update := arvadosclient.Dict{}
917 update["state"] = runner.finalState
918 if runner.LogsPDH != nil {
919 update["log"] = *runner.LogsPDH
921 if runner.finalState == "Complete" {
922 if runner.ExitCode != nil {
923 update["exit_code"] = *runner.ExitCode
925 if runner.OutputPDH != nil {
926 update["output"] = *runner.OutputPDH
929 return runner.ArvClient.Update("containers", runner.Container.UUID, arvadosclient.Dict{"container": update}, nil)
932 // IsCancelled returns the value of Cancelled, with goroutine safety.
933 func (runner *ContainerRunner) IsCancelled() bool {
934 runner.CancelLock.Lock()
935 defer runner.CancelLock.Unlock()
936 return runner.Cancelled
939 // NewArvLogWriter creates an ArvLogWriter
940 func (runner *ContainerRunner) NewArvLogWriter(name string) io.WriteCloser {
941 return &ArvLogWriter{runner.ArvClient, runner.Container.UUID, name, runner.LogCollection.Open(name + ".txt")}
944 // Run the full container lifecycle.
945 func (runner *ContainerRunner) Run() (err error) {
946 runner.CrunchLog.Printf("Executing container '%s'", runner.Container.UUID)
948 hostname, hosterr := os.Hostname()
950 runner.CrunchLog.Printf("Error getting hostname '%v'", hosterr)
952 runner.CrunchLog.Printf("Executing on host '%s'", hostname)
955 // Clean up temporary directories _after_ finalizing
956 // everything (if we've made any by then)
957 defer runner.CleanupDirs()
959 runner.finalState = "Queued"
962 // checkErr prints e (unless it's nil) and sets err to
963 // e (unless err is already non-nil). Thus, if err
964 // hasn't already been assigned when Run() returns,
965 // this cleanup func will cause Run() to return the
966 // first non-nil error that is passed to checkErr().
967 checkErr := func(e error) {
971 runner.CrunchLog.Print(e)
977 // Log the error encountered in Run(), if any
980 if runner.finalState == "Queued" {
981 runner.CrunchLog.Close()
982 runner.UpdateContainerFinal()
986 if runner.IsCancelled() {
987 runner.finalState = "Cancelled"
988 // but don't return yet -- we still want to
989 // capture partial output and write logs
992 checkErr(runner.CaptureOutput())
993 checkErr(runner.CommitLogs())
994 checkErr(runner.UpdateContainerFinal())
996 // The real log is already closed, but then we opened
997 // a new one in case we needed to log anything while
999 runner.CrunchLog.Close()
1002 err = runner.ArvClient.Get("containers", runner.Container.UUID, nil, &runner.Container)
1004 err = fmt.Errorf("While getting container record: %v", err)
1008 // setup signal handling
1009 runner.SetupSignals()
1011 // check for and/or load image
1012 err = runner.LoadImage()
1014 runner.finalState = "Cancelled"
1015 err = fmt.Errorf("While loading container image: %v", err)
1019 // set up FUSE mount and binds
1020 err = runner.SetupMounts()
1022 runner.finalState = "Cancelled"
1023 err = fmt.Errorf("While setting up mounts: %v", err)
1027 err = runner.CreateContainer()
1032 runner.StartCrunchstat()
1034 if runner.IsCancelled() {
1038 err = runner.UpdateContainerRunning()
1042 runner.finalState = "Cancelled"
1044 err = runner.StartContainer()
1049 err = runner.WaitFinish()
1051 runner.finalState = "Complete"
1056 // NewContainerRunner creates a new container runner.
1057 func NewContainerRunner(api IArvadosClient,
1059 docker ThinDockerClient,
1060 containerUUID string) *ContainerRunner {
1062 cr := &ContainerRunner{ArvClient: api, Kc: kc, Docker: docker}
1063 cr.NewLogWriter = cr.NewArvLogWriter
1064 cr.RunArvMount = cr.ArvMountCmd
1065 cr.MkTempDir = ioutil.TempDir
1066 cr.LogCollection = &CollectionWriter{kc, nil, sync.Mutex{}}
1067 cr.Container.UUID = containerUUID
1068 cr.CrunchLog = NewThrottledLogger(cr.NewLogWriter("crunch-run"))
1069 cr.CrunchLog.Immediate = log.New(os.Stderr, containerUUID+" ", 0)
1070 cr.loadDiscoveryVars()
1075 statInterval := flag.Duration("crunchstat-interval", 10*time.Second, "sampling period for periodic resource usage reporting")
1076 cgroupRoot := flag.String("cgroup-root", "/sys/fs/cgroup", "path to sysfs cgroup tree")
1077 cgroupParent := flag.String("cgroup-parent", "docker", "name of container's parent cgroup (ignored if -cgroup-parent-subsystem is used)")
1078 cgroupParentSubsystem := flag.String("cgroup-parent-subsystem", "", "use current cgroup for given subsystem as parent cgroup for container")
1079 caCertsPath := flag.String("ca-certs", "", "Path to TLS root certificates")
1082 containerId := flag.Arg(0)
1084 if *caCertsPath != "" {
1085 arvadosclient.CertFiles = []string{*caCertsPath}
1088 api, err := arvadosclient.MakeArvadosClient()
1090 log.Fatalf("%s: %v", containerId, err)
1094 var kc *keepclient.KeepClient
1095 kc, err = keepclient.MakeKeepClient(api)
1097 log.Fatalf("%s: %v", containerId, err)
1101 var docker *dockerclient.DockerClient
1102 docker, err = dockerclient.NewDockerClient("unix:///var/run/docker.sock", nil)
1104 log.Fatalf("%s: %v", containerId, err)
1107 cr := NewContainerRunner(api, kc, docker, containerId)
1108 cr.statInterval = *statInterval
1109 cr.cgroupRoot = *cgroupRoot
1110 cr.expectCgroupParent = *cgroupParent
1111 if *cgroupParentSubsystem != "" {
1112 p := findCgroup(*cgroupParentSubsystem)
1113 cr.setCgroupParent = p
1114 cr.expectCgroupParent = p
1119 log.Fatalf("%s: %v", containerId, err)