From: Tom Clegg Date: Mon, 18 Oct 2021 19:30:29 +0000 (-0400) Subject: 16347: Change default to enable local keepstore when possible. X-Git-Tag: 2.4.0~191^2~7 X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/53b2e5895715c73febffb563ebc89153339e02ab 16347: Change default to enable local keepstore when possible. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml index 67286edfe3..c863bbcbce 100644 --- a/lib/config/config.default.yml +++ b/lib/config/config.default.yml @@ -920,11 +920,17 @@ Clusters: # # A zero value disables this feature. # - # Note that when this feature is enabled, the entire cluster - # configuration file, including the system root token, is copied - # to the worker node and held in memory for the duration of the - # container. - LocalKeepBlobBuffersPerVCPU: 0 + # In order for this feature to be activated, no volume may use + # AccessViaHosts, and each volume must have Replication higher + # than Collections.DefaultReplication. If these requirements are + # not satisfied, the feature is disabled automatically + # regardless of the value given here. + # + # Note that when this configuration is enabled, the entire + # cluster configuration file, including the system root token, + # is copied to the worker node and held in memory for the + # duration of the container. + LocalKeepBlobBuffersPerVCPU: 1 # When running a dedicated keepstore process for a container # (see LocalKeepBlobBuffersPerVCPU), write keepstore log diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go index d2a68f29fd..4742c64058 100644 --- a/lib/config/generated_config.go +++ b/lib/config/generated_config.go @@ -926,11 +926,17 @@ Clusters: # # A zero value disables this feature. # - # Note that when this feature is enabled, the entire cluster - # configuration file, including the system root token, is copied - # to the worker node and held in memory for the duration of the - # container. - LocalKeepBlobBuffersPerVCPU: 0 + # In order for this feature to be activated, no volume may use + # AccessViaHosts, and each volume must have Replication higher + # than Collections.DefaultReplication. If these requirements are + # not satisfied, the feature is disabled automatically + # regardless of the value given here. + # + # Note that when this configuration is enabled, the entire + # cluster configuration file, including the system root token, + # is copied to the worker node and held in memory for the + # duration of the container. + LocalKeepBlobBuffersPerVCPU: 1 # When running a dedicated keepstore process for a container # (see LocalKeepBlobBuffersPerVCPU), write keepstore log diff --git a/lib/config/load.go b/lib/config/load.go index a7419331f0..956a47b1a4 100644 --- a/lib/config/load.go +++ b/lib/config/load.go @@ -298,7 +298,6 @@ func (ldr *Loader) Load() (*arvados.Config, error) { ldr.checkEnum("Containers.LocalKeepLogsToContainerLog", cc.Containers.LocalKeepLogsToContainerLog, "none", "all", "errors"), ldr.checkEmptyKeepstores(cc), ldr.checkUnlistedKeepstores(cc), - ldr.checkLocalKeepstoreVolumes(cc), ldr.checkStorageClasses(cc), // TODO: check non-empty Rendezvous on // services other than Keepstore @@ -372,18 +371,6 @@ cluster: return nil } -func (ldr *Loader) checkLocalKeepstoreVolumes(cc arvados.Cluster) error { - if cc.Containers.LocalKeepBlobBuffersPerVCPU < 1 { - return nil - } - for _, vol := range cc.Volumes { - if len(vol.AccessViaHosts) == 0 { - return nil - } - } - return fmt.Errorf("LocalKeepBlobBuffersPerVCPU is %d, but no volumes would be accessible from a worker instance", cc.Containers.LocalKeepBlobBuffersPerVCPU) -} - func (ldr *Loader) checkStorageClasses(cc arvados.Cluster) error { classOnVolume := map[string]bool{} for volid, vol := range cc.Volumes { diff --git a/lib/crunchrun/crunchrun.go b/lib/crunchrun/crunchrun.go index c9456ccc74..ba5673f917 100644 --- a/lib/crunchrun/crunchrun.go +++ b/lib/crunchrun/crunchrun.go @@ -1776,11 +1776,18 @@ func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, s } if keepstore == nil { - // Nothing is written to keepstoreLogbuf, no need to - // call SetWriter. + // Log explanation (if any) for why we're not running + // a local keepstore. + var buf bytes.Buffer + keepstoreLogbuf.SetWriter(&buf) + if buf.Len() > 0 { + cr.CrunchLog.Printf("%s", strings.TrimSpace(buf.String())) + } } else if logWhat := conf.Cluster.Containers.LocalKeepLogsToContainerLog; logWhat == "none" { + cr.CrunchLog.Printf("using local keepstore process (pid %d) at %s", keepstore.Process.Pid, os.Getenv("ARVADOS_KEEP_SERVICES")) keepstoreLogbuf.SetWriter(io.Discard) } else { + cr.CrunchLog.Printf("using local keepstore process (pid %d) at %s, writing logs to keepstore.txt in log collection", keepstore.Process.Pid, os.Getenv("ARVADOS_KEEP_SERVICES")) logwriter, err := cr.NewLogWriter("keepstore") if err != nil { log.Print(err) @@ -1896,6 +1903,16 @@ func startLocalKeepstore(configData ConfigData, logbuf io.Writer) (*exec.Cmd, er if configData.Cluster == nil || configData.KeepBuffers < 1 { return nil, nil } + for uuid, vol := range configData.Cluster.Volumes { + if len(vol.AccessViaHosts) > 0 { + fmt.Fprintf(logbuf, "not starting a local keepstore process because a volume (%s) uses AccessViaHosts\n", uuid) + return nil, nil + } + if !vol.ReadOnly && vol.Replication < configData.Cluster.Collections.DefaultReplication { + fmt.Fprintf(logbuf, "not starting a local keepstore process because a writable volume (%s) has replication less than Collections.DefaultReplication (%d < %d)\n", uuid, vol.Replication, configData.Cluster.Collections.DefaultReplication) + return nil, nil + } + } // Rather than have an alternate way to tell keepstore how // many buffers to use when starting it this way, we just