1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
24 "git.arvados.org/arvados.git/sdk/go/arvados"
25 "github.com/ghodss/yaml"
26 "github.com/imdario/mergo"
27 "github.com/prometheus/client_golang/prometheus"
28 "github.com/sirupsen/logrus"
29 "golang.org/x/crypto/ssh"
30 "golang.org/x/sys/unix"
// DefaultYAML holds the contents of config.default.yml, embedded at
// build time by the go:embed directive below.
33 //go:embed config.default.yml
34 var DefaultYAML []byte
// ErrNoClustersDefined is the error Load returns when the loaded
// config data defines no clusters.
36 var ErrNoClustersDefined = errors.New("config does not define any clusters")
// Logger receives the warnings emitted while loading and checking
// config. Some call sites check it for nil before use.
40 Logger logrus.FieldLogger
41 SkipDeprecated bool // Don't load deprecated config keys
42 SkipLegacy bool // Don't load legacy config files
43 SkipAPICalls bool // Don't do checks that call RailsAPI/controller
// Legacy crunch-dispatch-slurm config file path; bound to the
// -legacy-crunch-dispatch-slurm-config flag by SetupFlags.
48 CrunchDispatchSlurmPath string
// Legacy keep-balance config file path; bound to the
// -legacy-keepbalance-config flag by SetupFlags.
52 KeepBalancePath string
55 // UTC time for configdata: either the modtime of the file we
56 // read configdata from, or the time when we read configdata
58 sourceTimestamp time.Time
59 // UTC time when configdata was read.
60 loadTimestamp time.Time
63 // NewLoader returns a new Loader with Stdin and Logger set to the
64 // given values, and all config paths set to their default values.
65 func NewLoader(stdin io.Reader, logger logrus.FieldLogger) *Loader {
66 ldr := &Loader{Stdin: stdin, Logger: logger}
67 // Calling SetupFlags on a throwaway FlagSet has the side
68 // effect of assigning default values to the configurable
70 ldr.SetupFlags(flag.NewFlagSet("", flag.ContinueOnError))
74 // SetupFlags configures a flagset so arguments like -config X can be
75 // used to change the loader's Path fields.
77 // ldr := NewLoader(os.Stdin, logrus.New())
78 // flagset := flag.NewFlagSet("", flag.ContinueOnError)
79 // ldr.SetupFlags(flagset)
80 // // ldr.Path == "/etc/arvados/config.yml"
81 // flagset.Parse([]string{"-config", "/tmp/c.yaml"})
82 // // ldr.Path == "/tmp/c.yaml"
//
// Registering a flag also stores its default value in the bound
// field, which is what NewLoader relies on.
83 func (ldr *Loader) SetupFlags(flagset *flag.FlagSet) {
// Primary cluster config file; default is arvados.DefaultConfigFile.
84 flagset.StringVar(&ldr.Path, "config", arvados.DefaultConfigFile, "Site configuration `file` (default may be overridden by setting an ARVADOS_CONFIG environment variable)")
// One flag per legacy per-component config file, each bound to the
// matching ldr.*Path field with a package-level default path.
86 flagset.StringVar(&ldr.KeepstorePath, "legacy-keepstore-config", defaultKeepstoreConfigPath, "Legacy keepstore configuration `file`")
87 flagset.StringVar(&ldr.KeepWebPath, "legacy-keepweb-config", defaultKeepWebConfigPath, "Legacy keep-web configuration `file`")
88 flagset.StringVar(&ldr.CrunchDispatchSlurmPath, "legacy-crunch-dispatch-slurm-config", defaultCrunchDispatchSlurmConfigPath, "Legacy crunch-dispatch-slurm configuration `file`")
89 flagset.StringVar(&ldr.WebsocketPath, "legacy-ws-config", defaultWebsocketConfigPath, "Legacy arvados-ws configuration `file`")
90 flagset.StringVar(&ldr.KeepproxyPath, "legacy-keepproxy-config", defaultKeepproxyConfigPath, "Legacy keepproxy configuration `file`")
91 flagset.StringVar(&ldr.GitHttpdPath, "legacy-git-httpd-config", defaultGitHttpdConfigPath, "Legacy arvados-git-httpd configuration `file`")
92 flagset.StringVar(&ldr.KeepBalancePath, "legacy-keepbalance-config", defaultKeepBalanceConfigPath, "Legacy keep-balance configuration `file`")
// -skip-legacy lets the command line turn off legacy config loading.
93 flagset.BoolVar(&ldr.SkipLegacy, "skip-legacy", false, "Don't load legacy config files")
97 // MungeLegacyConfigArgs checks args for a -config flag whose argument
98 // is a regular file (or a symlink to one), but doesn't have a
99 // top-level "Clusters" key and therefore isn't a valid cluster
100 // configuration file. If it finds such a flag, it replaces -config
101 // with legacyConfigArg (e.g., "-legacy-keepstore-config").
103 // This is used by programs that still need to accept "-config" as a
104 // way to specify a per-component config file until their config has
107 // If any errors are encountered while reading or parsing a config
108 // file, the given args are not munged. We presume the same errors
109 // will be encountered again and reported later on when trying to load
110 // cluster configuration from the same file, regardless of which
111 // struct we end up using.
//
// The caller's args slice is not modified; edits are made to a copy.
// As a side effect, the loader's legacy path fields for components
// other than the one named by legacyConfigArg are set to "".
112 func (ldr *Loader) MungeLegacyConfigArgs(lgr logrus.FieldLogger, args []string, legacyConfigArg string) []string {
// Work on a copy so the input slice stays untouched.
113 munged := append([]string(nil), args...)
114 for i := 0; i < len(args); i++ {
// Only "-config" and "-config=VALUE" are of interest; one leading
// "-" is stripped before comparing the flag name.
115 if !strings.HasPrefix(args[i], "-") || strings.SplitN(strings.TrimPrefix(args[i], "-"), "=", 2)[0] != "config" {
// The operand is either the text after "=" or the following
// argument, provided that argument does not itself start with "-".
119 if strings.Contains(args[i], "=") {
120 operand = strings.SplitN(args[i], "=", 2)[1]
121 } else if i+1 < len(args) && !strings.HasPrefix(args[i+1], "-") {
// os.Stat follows symlinks, so a symlink to a regular file passes.
127 if fi, err := os.Stat(operand); err != nil || !fi.Mode().IsRegular() {
130 f, err := os.Open(operand)
135 buf, err := ioutil.ReadAll(f)
139 var cfg arvados.Config
140 err = yaml.Unmarshal(buf, &cfg)
// A file that parses but defines no clusters is treated as a legacy
// per-component config file.
144 if len(cfg.Clusters) == 0 {
145 lgr.Warnf("%s is not a cluster config file -- interpreting %s as %s (please migrate your config!)", operand, "-config", legacyConfigArg)
// operand == args[i] presumably means the operand was a separate
// argument, so the flag itself sits at i-1; otherwise the flag and
// operand share one "flag=value" argument.
146 if operand == args[i] {
147 munged[i-1] = legacyConfigArg
149 munged[i] = legacyConfigArg + "=" + operand
154 // Disable legacy config loading for components other than the
155 // one that was specified
156 if legacyConfigArg != "-legacy-keepstore-config" {
157 ldr.KeepstorePath = ""
159 if legacyConfigArg != "-legacy-crunch-dispatch-slurm-config" {
160 ldr.CrunchDispatchSlurmPath = ""
162 if legacyConfigArg != "-legacy-ws-config" {
163 ldr.WebsocketPath = ""
165 if legacyConfigArg != "-legacy-keepweb-config" {
168 if legacyConfigArg != "-legacy-keepproxy-config" {
169 ldr.KeepproxyPath = ""
171 if legacyConfigArg != "-legacy-git-httpd-config" {
172 ldr.GitHttpdPath = ""
174 if legacyConfigArg != "-legacy-keepbalance-config" {
175 ldr.KeepBalancePath = ""
// loadBytes reads raw config data and returns it together with two
// UTC timestamps. loadTime is taken from time.Now() at the start of
// the call. sourceTime equals loadTime when the data is read from
// ldr.Stdin, and is the file's modification time when the data is
// read from the file at path.
181 func (ldr *Loader) loadBytes(path string) (buf []byte, sourceTime, loadTime time.Time, err error) {
182 loadTime = time.Now().UTC()
// Stdin case: there is no file modtime, so the read time stands in.
184 buf, err = ioutil.ReadAll(ldr.Stdin)
185 sourceTime = loadTime
188 f, err := os.Open(path)
// NOTE(review): fi is presumably the result of f.Stat() -- confirm.
197 sourceTime = fi.ModTime().UTC()
198 buf, err = ioutil.ReadAll(f)
// Load builds an arvados.Config from the loader's config data.
//
// If ldr.configdata has not been read yet, it is read via
// loadBytes(ldr.Path) and the source/load timestamps are recorded.
// Defaults from DefaultYAML are instantiated once per cluster ID found
// in the config data, the loaded config is merged over them, and the
// merged map is transcoded through JSON into arvados.Config. A list of
// load functions then runs against the config, followed by
// per-cluster preprocessing and validation checks.
//
// Returns ErrNoClustersDefined when the config data defines no
// clusters; other failures are returned as errors describing the step
// that failed.
202 func (ldr *Loader) Load() (*arvados.Config, error) {
203 if ldr.configdata == nil {
204 buf, sourceTime, loadTime, err := ldr.loadBytes(ldr.Path)
209 ldr.sourceTimestamp = sourceTime
210 ldr.loadTimestamp = loadTime
213 // FIXME: We should reject YAML if the same key is used twice
214 // in a map/object, like {foo: bar, foo: baz}. Maybe we'll get
215 // this fixed free when we upgrade ghodss/yaml to a version
216 // that uses go-yaml v3.
218 // Load the config into a dummy map to get the cluster ID
219 // keys, discarding the values; then set up defaults for each
220 // cluster ID; then load the real config on top of the
223 Clusters map[string]struct{}
225 err := yaml.Unmarshal(ldr.configdata, &dummy)
229 if len(dummy.Clusters) == 0 {
230 return nil, ErrNoClustersDefined
233 // We can't merge deep structs here; instead, we unmarshal the
234 // default & loaded config files into generic maps, merge
235 // those, and then json-encode+decode the result into the
236 // config struct type.
237 var merged map[string]interface{}
238 for id := range dummy.Clusters {
239 var src map[string]interface{}
// The default YAML uses " xxxxx:" as a placeholder cluster key; swap
// in the real cluster ID before parsing.
240 err = yaml.Unmarshal(bytes.Replace(DefaultYAML, []byte(" xxxxx:"), []byte(" "+id+":"), -1), &src)
242 return nil, fmt.Errorf("loading defaults for %s: %s", id, err)
244 err = mergo.Merge(&merged, src, mergo.WithOverride)
246 return nil, fmt.Errorf("merging defaults for %s: %s", id, err)
249 var src map[string]interface{}
250 err = yaml.Unmarshal(ldr.configdata, &src)
252 return nil, fmt.Errorf("loading config data: %s", err)
// Unknown-key warnings are produced against the defaults before the
// SAMPLE entries are stripped from them.
254 ldr.logExtraKeys(merged, src, "")
255 removeSampleKeys(merged)
256 // We merge the loaded config into the default, overriding any existing keys.
257 // Make sure we do not override a default with a key that has a 'null' value.
259 err = mergo.Merge(&merged, src, mergo.WithOverride)
261 return nil, fmt.Errorf("merging config data: %s", err)
264 // map[string]interface{} => json => arvados.Config
265 var cfg arvados.Config
// The JSON is streamed from an encoder to a decoder rather than
// buffered; pr and pw are presumably the two ends of an io.Pipe.
269 errEnc = json.NewEncoder(pw).Encode(merged)
272 err = json.NewDecoder(pr).Decode(&cfg)
277 return nil, fmt.Errorf("transcoding config data: %s", err)
// Collect the post-processing steps to run, in order.
280 var loadFuncs []func(*arvados.Config) error
281 if !ldr.SkipDeprecated {
282 loadFuncs = append(loadFuncs,
283 ldr.applyDeprecatedConfig,
284 ldr.applyDeprecatedVolumeDriverParameters,
288 // legacy file is required when either:
289 // * a non-default location was specified
290 // * no primary config was loaded, and this is the
291 // legacy config file for the current component
292 loadFuncs = append(loadFuncs,
293 ldr.loadOldEnvironmentVariables,
294 ldr.loadOldKeepstoreConfig,
295 ldr.loadOldKeepWebConfig,
296 ldr.loadOldCrunchDispatchSlurmConfig,
297 ldr.loadOldWebsocketConfig,
298 ldr.loadOldKeepproxyConfig,
299 ldr.loadOldGitHttpdConfig,
300 ldr.loadOldKeepBalanceConfig,
303 loadFuncs = append(loadFuncs,
304 ldr.setImplicitStorageClasses,
305 ldr.setLoopbackInstanceType,
307 for _, f := range loadFuncs {
314 // Preprocess/automate some configs
// cc is a copy of the map value, so it is stored back into
// cfg.Clusters after modification.
315 for id, cc := range cfg.Clusters {
316 ldr.autofillPreemptible("Clusters."+id, &cc)
// NOTE(review): Count == 3 matches tokens with four "/"-separated
// fields, while checkToken's V2 form splits "v2/.../secret" into three
// fields (two slashes) -- confirm which token shape is intended here.
318 if strings.Count(cc.Users.AnonymousUserToken, "/") == 3 {
319 // V2 token, strip it to just a secret
320 tmp := strings.Split(cc.Users.AnonymousUserToken, "/")
321 cc.Users.AnonymousUserToken = tmp[2]
324 cfg.Clusters[id] = cc
327 // Check for known mistakes
328 for id, cc := range cfg.Clusters {
329 for remote := range cc.RemoteClusters {
// "*" and "SAMPLE" are not real cluster IDs, so they get special
// handling instead of the ID format check.
330 if remote == "*" || remote == "SAMPLE" {
333 err = ldr.checkClusterID(fmt.Sprintf("Clusters.%s.RemoteClusters.%s", id, remote), remote, true)
// All checks in the slice literal are evaluated before the loop
// inspects any of their results.
338 for _, err = range []error{
339 ldr.checkClusterID(fmt.Sprintf("Clusters.%s", id), id, false),
340 ldr.checkClusterID(fmt.Sprintf("Clusters.%s.Login.LoginCluster", id), cc.Login.LoginCluster, true),
341 ldr.checkToken(fmt.Sprintf("Clusters.%s.ManagementToken", id), cc.ManagementToken, true, false),
342 ldr.checkToken(fmt.Sprintf("Clusters.%s.SystemRootToken", id), cc.SystemRootToken, true, false),
343 ldr.checkToken(fmt.Sprintf("Clusters.%s.Users.AnonymousUserToken", id), cc.Users.AnonymousUserToken, false, true),
344 ldr.checkToken(fmt.Sprintf("Clusters.%s.Collections.BlobSigningKey", id), cc.Collections.BlobSigningKey, true, false),
345 checkKeyConflict(fmt.Sprintf("Clusters.%s.PostgreSQL.Connection", id), cc.PostgreSQL.Connection),
346 ldr.checkEnum("Containers.LocalKeepLogsToContainerLog", cc.Containers.LocalKeepLogsToContainerLog, "none", "all", "errors"),
347 ldr.checkEmptyKeepstores(cc),
348 ldr.checkUnlistedKeepstores(cc),
349 ldr.checkLocalKeepBlobBuffers(cc),
350 ldr.checkStorageClasses(cc),
351 ldr.checkCUDAVersions(cc),
352 // TODO: check non-empty Rendezvous on
353 // services other than Keepstore
// Record where the config came from: source timestamp and the hex
// SHA-256 of the raw config data.
360 cfg.SourceTimestamp = ldr.sourceTimestamp
361 cfg.SourceSHA256 = fmt.Sprintf("%x", sha256.Sum256(ldr.configdata))
365 var acceptableClusterIDRe = regexp.MustCompile(`^[a-z0-9]{5}$`)
367 func (ldr *Loader) checkClusterID(label, clusterID string, emptyStringOk bool) error {
368 if emptyStringOk && clusterID == "" {
370 } else if !acceptableClusterIDRe.MatchString(clusterID) {
371 return fmt.Errorf("%s: cluster ID should be 5 lowercase alphanumeric characters", label)
376 var acceptableTokenRe = regexp.MustCompile(`^[a-zA-Z0-9]+$`)
377 var acceptableTokenLength = 32
379 func (ldr *Loader) checkToken(label, token string, mandatory bool, acceptV2 bool) error {
382 // when a token is not mandatory, the acceptable length and content is only checked if its length is non-zero
385 if ldr.Logger != nil {
386 ldr.Logger.Warnf("%s: secret token is not set (use %d+ random characters from a-z, A-Z, 0-9)", label, acceptableTokenLength)
389 } else if !acceptableTokenRe.MatchString(token) {
391 return fmt.Errorf("%s: unacceptable characters in token (only a-z, A-Z, 0-9 are acceptable)", label)
393 // Test for a proper V2 token
394 tmp := strings.SplitN(token, "/", 3)
396 return fmt.Errorf("%s: unacceptable characters in token (only a-z, A-Z, 0-9 are acceptable)", label)
398 if !strings.HasPrefix(token, "v2/") {
399 return fmt.Errorf("%s: unacceptable characters in token (only a-z, A-Z, 0-9 are acceptable)", label)
401 if !acceptableTokenRe.MatchString(tmp[2]) {
402 return fmt.Errorf("%s: unacceptable characters in V2 token secret (only a-z, A-Z, 0-9 are acceptable)", label)
404 if len(tmp[2]) < acceptableTokenLength {
405 ldr.Logger.Warnf("%s: secret is too short (should be at least %d characters)", label, acceptableTokenLength)
407 } else if len(token) < acceptableTokenLength {
408 if ldr.Logger != nil {
409 ldr.Logger.Warnf("%s: token is too short (should be at least %d characters)", label, acceptableTokenLength)
415 func (ldr *Loader) checkEnum(label, value string, accepted ...string) error {
416 for _, s := range accepted {
421 return fmt.Errorf("%s: unacceptable value %q: must be one of %q", label, value, accepted)
// setLoopbackInstanceType fills in an implicit "localhost" instance
// type for clusters that enable CloudVMs with the "loopback" driver
// and configure no InstanceTypes. Configuring more than one
// InstanceType with the loopback driver is an error.
424 func (ldr *Loader) setLoopbackInstanceType(cfg *arvados.Config) error {
425 for id, cc := range cfg.Clusters {
// Only clusters using the loopback cloud driver are affected.
426 if !cc.Containers.CloudVMs.Enable || cc.Containers.CloudVMs.Driver != "loopback" {
// Exactly one configured instance type is the non-error,
// nothing-to-fill-in case.
429 if len(cc.InstanceTypes) == 1 {
432 if len(cc.InstanceTypes) > 1 {
433 return fmt.Errorf("Clusters.%s.InstanceTypes: cannot use multiple InstanceTypes with loopback driver", id)
435 // No InstanceTypes configured. Fill in implicit
// The implicit type is sized from the host: total RAM, the size of
// the filesystem holding os.TempDir(), and the CPU count.
437 hostram, err := getHostRAM()
441 scratch, err := getFilesystemSize(os.TempDir())
445 cc.InstanceTypes = arvados.InstanceTypeMap{"localhost": {
447 ProviderType: "localhost",
448 VCPUs: runtime.NumCPU(),
451 IncludedScratch: scratch,
// cc is a copy of the map value; store the modified copy back.
454 cfg.Clusters[id] = cc
459 func getFilesystemSize(path string) (arvados.ByteSize, error) {
460 var stat unix.Statfs_t
461 err := unix.Statfs(path, &stat)
465 return arvados.ByteSize(stat.Blocks * uint64(stat.Bsize)), nil
468 var reMemTotal = regexp.MustCompile(`(^|\n)MemTotal: *(\d+) kB\n`)
470 func getHostRAM() (arvados.ByteSize, error) {
471 buf, err := os.ReadFile("/proc/meminfo")
475 m := reMemTotal.FindSubmatch(buf)
477 return 0, errors.New("error parsing /proc/meminfo: no MemTotal")
479 kb, err := strconv.ParseInt(string(m[2]), 10, 64)
481 return 0, fmt.Errorf("error parsing /proc/meminfo: %q: %w", m[2], err)
483 return arvados.ByteSize(kb) * 1024, nil
// setImplicitStorageClasses supplies a "default" storage class for
// clusters whose config mentions no storage classes at all: every
// volume is assigned {"default": true} and the cluster gets a single
// "default" class marked Default.
486 func (ldr *Loader) setImplicitStorageClasses(cfg *arvados.Config) error {
488 for id, cc := range cfg.Clusters {
// An explicit cluster-level StorageClasses section, or any volume
// listing StorageClasses, counts as explicit configuration.
489 if len(cc.StorageClasses) > 0 {
492 for _, vol := range cc.Volumes {
493 if len(vol.StorageClasses) > 0 {
497 // No explicit StorageClasses config info at all; fill
498 // in implicit defaults.
// Note that id here is the volume key and shadows the cluster id
// for the duration of this inner loop.
499 for id, vol := range cc.Volumes {
500 vol.StorageClasses = map[string]bool{"default": true}
503 cc.StorageClasses = map[string]arvados.StorageClassConfig{"default": {Default: true}}
// cc is a copy of the map value; store the modified copy back.
504 cfg.Clusters[id] = cc
// checkLocalKeepBlobBuffers warns when
// Containers.LocalKeepBlobBuffersPerVCPU is set but the volume
// configuration means it will not be used: a volume uses
// AccessViaHosts, or a writable volume has Replication below
// Collections.DefaultReplication.
//
// NOTE(review): ldr.Logger is used here without the nil check that
// checkToken and logExtraKeys perform -- confirm a nil Logger cannot
// reach this point.
509 func (ldr *Loader) checkLocalKeepBlobBuffers(cc arvados.Cluster) error {
510 kbb := cc.Containers.LocalKeepBlobBuffersPerVCPU
514 for uuid, vol := range cc.Volumes {
515 if len(vol.AccessViaHosts) > 0 {
516 ldr.Logger.Warnf("LocalKeepBlobBuffersPerVCPU is %d but will not be used because at least one volume (%s) uses AccessViaHosts -- suggest changing to 0", kbb, uuid)
// Read-only volumes are exempt from the replication comparison.
519 if !vol.ReadOnly && vol.Replication < cc.Collections.DefaultReplication {
520 ldr.Logger.Warnf("LocalKeepBlobBuffersPerVCPU is %d but will not be used because at least one volume (%s) has lower replication than DefaultReplication (%d < %d) -- suggest changing to 0", kbb, uuid, vol.Replication, cc.Collections.DefaultReplication)
527 func (ldr *Loader) checkStorageClasses(cc arvados.Cluster) error {
528 classOnVolume := map[string]bool{}
529 for volid, vol := range cc.Volumes {
530 if len(vol.StorageClasses) == 0 {
531 return fmt.Errorf("%s: volume has no StorageClasses listed", volid)
533 for classid := range vol.StorageClasses {
534 if _, ok := cc.StorageClasses[classid]; !ok {
535 return fmt.Errorf("%s: volume refers to storage class %q that is not defined in StorageClasses", volid, classid)
537 classOnVolume[classid] = true
541 for classid, sc := range cc.StorageClasses {
542 if !classOnVolume[classid] && len(cc.Volumes) > 0 {
543 ldr.Logger.Warnf("there are no volumes providing storage class %q", classid)
550 return fmt.Errorf("there is no default storage class (at least one entry in StorageClasses must have Default: true)")
555 func (ldr *Loader) checkCUDAVersions(cc arvados.Cluster) error {
556 for _, it := range cc.InstanceTypes {
557 if it.CUDA.DeviceCount == 0 {
561 _, err := strconv.ParseFloat(it.CUDA.DriverVersion, 64)
563 return fmt.Errorf("InstanceType %q has invalid CUDA.DriverVersion %q, expected format X.Y (%v)", it.Name, it.CUDA.DriverVersion, err)
565 _, err = strconv.ParseFloat(it.CUDA.HardwareCapability, 64)
567 return fmt.Errorf("InstanceType %q has invalid CUDA.HardwareCapability %q, expected format X.Y (%v)", it.Name, it.CUDA.HardwareCapability, err)
// checkKeyConflict reports an error if two keys of m differ only in
// letter case. label identifies the config entry in the error, and the
// conflicting key is reported in lowercase.
func checkKeyConflict(label string, m map[string]string) error {
	seen := make(map[string]bool, len(m))
	for key := range m {
		lower := strings.ToLower(key)
		if seen[lower] {
			return fmt.Errorf("%s: multiple entries for %q (fix by using same capitalization as default/example file)", label, lower)
		}
		seen[lower] = true
	}
	return nil
}
// removeNullKeys deletes every entry of m whose value is nil, and
// does the same recursively inside any nested map[string]interface{}
// values.
func removeNullKeys(m map[string]interface{}) {
	for key, val := range m {
		switch sub := val.(type) {
		case nil:
			delete(m, key)
		case map[string]interface{}:
			if sub != nil {
				removeNullKeys(sub)
			}
		}
	}
}
// removeSampleKeys deletes the "SAMPLE" entry from m and, recursively,
// from every nested map[string]interface{} value.
func removeSampleKeys(m map[string]interface{}) {
	delete(m, "SAMPLE")
	for _, val := range m {
		sub, isMap := val.(map[string]interface{})
		if !isMap || sub == nil {
			continue
		}
		removeSampleKeys(sub)
	}
}
// logExtraKeys walks the supplied config map alongside the expected
// (default) map and logs a warning for each supplied key that has no
// counterpart in expected. prefix is the dotted path of the maps being
// compared and is prepended to key names in the warnings. Nested maps
// are compared recursively.
605 func (ldr *Loader) logExtraKeys(expected, supplied map[string]interface{}, prefix string) {
// Without a logger there is nowhere to report to.
606 if ldr.Logger == nil {
609 for k, vsupp := range supplied {
611 // entry will be dropped in removeSampleKeys anyway
614 vexp, ok := expected[k]
// When the expected map has a SAMPLE entry, its keys are user-chosen
// names, so any supplied key is compared against the SAMPLE template.
615 if expected["SAMPLE"] != nil {
616 // use the SAMPLE entry's keys as the
617 // "expected" map when checking vsupp
619 vexp = expected["SAMPLE"]
621 // check for a case-insensitive match
623 for ek := range expected {
624 if strings.EqualFold(k, ek) {
625 hint = " (perhaps you meant " + ek + "?)"
626 // If we don't delete this, it
627 // will end up getting merged,
629 // merging/overriding the
635 ldr.Logger.Warnf("deprecated or unknown config entry: %s%s%s", prefix, k, hint)
// Recurse only when both sides are maps; a supplied map where a
// non-map is expected is warned about here.
638 if vsupp, ok := vsupp.(map[string]interface{}); !ok {
639 // if vsupp is a map but vexp isn't map, this
640 // will be caught elsewhere; see TestBadType.
642 } else if vexp, ok := vexp.(map[string]interface{}); !ok {
643 ldr.Logger.Warnf("unexpected object in config entry: %s%s", prefix, k)
645 ldr.logExtraKeys(vexp, vsupp, prefix+k+".")
650 func (ldr *Loader) autofillPreemptible(label string, cc *arvados.Cluster) {
651 if factor := cc.Containers.PreemptiblePriceFactor; factor > 0 {
652 for name, it := range cc.InstanceTypes {
654 it.Preemptible = true
655 it.Price = it.Price * factor
656 it.Name = name + ".preemptible"
657 if it2, exists := cc.InstanceTypes[it.Name]; exists && it2 != it {
658 ldr.Logger.Warnf("%s.InstanceTypes[%s]: already exists, so not automatically adding a preemptible variant of %s", label, it.Name, name)
661 cc.InstanceTypes[it.Name] = it
668 // RegisterMetrics registers metrics showing the timestamp and content
669 // hash of the currently loaded config.
671 // Must not be called more than once for a given registry. Must not be
672 // called before Load(). Metrics are not updated by subsequent calls
//
// Two gauge vectors are registered, each with a single "sha256" label
// whose value is the hex SHA-256 of the loaded config data.
// reg.MustRegister panics if registration fails.
674 func (ldr *Loader) RegisterMetrics(reg *prometheus.Registry) {
675 hash := fmt.Sprintf("%x", sha256.Sum256(ldr.configdata))
676 vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
677 Namespace: "arvados",
679 Name: "source_timestamp_seconds",
680 Help: "Timestamp of config file when it was loaded.",
681 }, []string{"sha256"})
// UnixNano()/1e9 gives seconds since the epoch with a fractional part.
682 vec.WithLabelValues(hash).Set(float64(ldr.sourceTimestamp.UnixNano()) / 1e9)
683 reg.MustRegister(vec)
685 vec = prometheus.NewGaugeVec(prometheus.GaugeOpts{
686 Namespace: "arvados",
688 Name: "load_timestamp_seconds",
689 Help: "Time when config file was loaded.",
690 }, []string{"sha256"})
691 vec.WithLabelValues(hash).Set(float64(ldr.loadTimestamp.UnixNano()) / 1e9)
692 reg.MustRegister(vec)
695 // Load an SSH private key from the given confvalue, which is either
696 // the literal key or an absolute path to a file containing the key.
697 func LoadSSHKey(confvalue string) (ssh.Signer, error) {
698 if fnm := strings.TrimPrefix(confvalue, "file://"); fnm != confvalue && strings.HasPrefix(fnm, "/") {
699 keydata, err := os.ReadFile(fnm)
703 return ssh.ParsePrivateKey(keydata)
705 return ssh.ParsePrivateKey([]byte(confvalue))