17244: Use cpu.max to get available fraction of CPUs.
author Tom Clegg <tom@curii.com>
Mon, 14 Aug 2023 19:09:27 +0000 (15:09 -0400)
committer Tom Clegg <tom@curii.com>
Mon, 14 Aug 2023 20:34:07 +0000 (16:34 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

12 files changed:
lib/crunchstat/crunchstat.go
lib/crunchstat/crunchstat_test.go
tools/crunchstat-summary/tests/container_9tee4-dz642-lymtndkpy39eibk.txt.gz.report
tools/crunchstat-summary/tests/container_request_9tee4-xvhdp-kk0ja1cl8b2kr1y-crunchstat.txt.gz
tools/crunchstat-summary/tests/container_request_9tee4-xvhdp-kk0ja1cl8b2kr1y-crunchstat.txt.gz.report
tools/crunchstat-summary/tests/container_request_9tee4-xvhdp-kk0ja1cl8b2kr1y.txt.gz.report
tools/crunchstat-summary/tests/logfile_20151204190335.txt.gz
tools/crunchstat-summary/tests/logfile_20151204190335.txt.gz.report
tools/crunchstat-summary/tests/logfile_20151210063411.txt.gz
tools/crunchstat-summary/tests/logfile_20151210063411.txt.gz.report
tools/crunchstat-summary/tests/logfile_20151210063439.txt.gz
tools/crunchstat-summary/tests/logfile_20151210063439.txt.gz.report

index 7e6a180b516b8a5d808c5f476cb42c1d2381056d..bbd0a7fd2f0acae244857bb1eb466cc2fe2f0246 100644 (file)
@@ -71,6 +71,7 @@ type Reporter struct {
 
        // available cgroup hierarchies
        statFiles struct {
+               cpuMax            string // v2
                cpusetCpus        string // v1,v2 (via /proc/$PID/cpuset)
                cpuacctStat       string // v1 (via /proc/$PID/cgroup => cpuacct)
                cpuStat           string // v2
@@ -275,6 +276,7 @@ func (r *Reporter) findStatFiles() {
                pathkey  string
                file     string
        }{
+               {&r.statFiles.cpuMax, "unified", "cpu.max"},
                {&r.statFiles.cpusetCpus, "cpuset", "cpuset.cpus.effective"},
                {&r.statFiles.cpusetCpus, "cpuset", "cpuset.cpus"},
                {&r.statFiles.cpuacctStat, "cpuacct", "cpuacct.stat"},
@@ -352,7 +354,7 @@ func (r *Reporter) reportMemoryMax(logger logPrinter, source, statName string, v
 
 func (r *Reporter) LogMaxima(logger logPrinter, memLimits map[string]int64) {
        if r.lastCPUSample.hasData {
-               logger.Printf("Total CPU usage was %f user and %f sys on %d CPUs",
+               logger.Printf("Total CPU usage was %f user and %f sys on %.2f CPUs",
                        r.lastCPUSample.user, r.lastCPUSample.sys, r.lastCPUSample.cpus)
        }
        for disk, sample := range r.lastDiskIOSample {
@@ -733,27 +735,54 @@ type cpuSample struct {
        sampleTime time.Time
        user       float64
        sys        float64
-       cpus       int64
+       cpus       float64
 }
 
-// Return the number of CPUs available in the container. Return 0 if
-// we can't figure out the real number of CPUs.
-func (r *Reporter) getCPUCount() int64 {
-       buf, err := fs.ReadFile(r.FS, r.statFiles.cpusetCpus)
-       if err != nil {
-               return 0
+// Return the number of virtual CPUs available in the container. This
+// can be based on a scheduling ratio (which is not necessarily a
+// whole number) or a restricted set of accessible CPUs.
+//
+// Return the number of host processors based on /proc/cpuinfo if
+// cgroupfs doesn't reveal anything.
+//
+// Return 0 if even that doesn't work.
+func (r *Reporter) getCPUCount() float64 {
+       if buf, err := fs.ReadFile(r.FS, r.statFiles.cpuMax); err == nil {
+               // cpu.max looks like "150000 100000" if CPU usage is
+               // restricted to 150% (docker run --cpus=1.5), or "max
+               // 100000\n" if not.
+               var max, period int64
+               if _, err := fmt.Sscanf(string(buf), "%d %d", &max, &period); err == nil {
+                       return float64(max) / float64(period)
+               }
+       }
+       if buf, err := fs.ReadFile(r.FS, r.statFiles.cpusetCpus); err == nil {
+               // cpuset.cpus looks like "0,4-7\n" if only CPUs
+               // 0,4,5,6,7 are available.
+               cpus := 0
+               for _, v := range bytes.Split(buf, []byte{','}) {
+                       var min, max int
+                       n, _ := fmt.Sscanf(string(v), "%d-%d", &min, &max)
+                       if n == 2 {
+                               cpus += (max - min) + 1
+                       } else {
+                               cpus++
+                       }
+               }
+               return float64(cpus)
        }
-       cpus := int64(0)
-       for _, v := range bytes.Split(buf, []byte{','}) {
-               var min, max int64
-               n, _ := fmt.Sscanf(string(v), "%d-%d", &min, &max)
-               if n == 2 {
-                       cpus += (max - min) + 1
-               } else {
-                       cpus++
+       if buf, err := fs.ReadFile(r.FS, "proc/cpuinfo"); err == nil {
+               // cpuinfo has a line like "processor\t: 0\n" for each
+               // CPU.
+               cpus := 0
+               for _, line := range bytes.Split(buf, []byte{'\n'}) {
+                       if bytes.HasPrefix(line, []byte("processor\t:")) {
+                               cpus++
+                       }
                }
+               return float64(cpus)
        }
-       return cpus
+       return 0
 }
 
 func (r *Reporter) doCPUStats() {
@@ -809,7 +838,7 @@ func (r *Reporter) doCPUStats() {
                        nextSample.user-r.lastCPUSample.user,
                        nextSample.sys-r.lastCPUSample.sys)
        }
-       r.Logger.Printf("cpu %.4f user %.4f sys %d cpus%s\n",
+       r.Logger.Printf("cpu %.4f user %.4f sys %.2f cpus%s\n",
                nextSample.user, nextSample.sys, nextSample.cpus, delta)
        r.lastCPUSample = nextSample
 }
@@ -900,8 +929,10 @@ func (r *Reporter) dumpSourceFiles(destdir string) error {
        todo := []string{
                fmt.Sprintf("proc/%d/cgroup", r.pid),
                fmt.Sprintf("proc/%d/cpuset", r.pid),
+               "proc/cpuinfo",
                "proc/mounts",
                "proc/self/smaps",
+               r.statFiles.cpuMax,
                r.statFiles.cpusetCpus,
                r.statFiles.cpuacctStat,
                r.statFiles.cpuStat,
index c369152b827a101addbd42d0c0f7d05e298d5b25..1bd5933ac1b3cfc6e973f27c9b5b492340a2ff5e 100644 (file)
@@ -85,7 +85,7 @@ func (s *suite) TestReportCurrent(c *C) {
                `(?ms).*rss.*`,
                `(?ms).*net:.*`,
                `(?ms).*blkio:.*`,
-               `(?ms).* [\d.]+ user [\d.]+ sys ` + fmt.Sprintf("%d", runtime.NumCPU()) + ` cpus -- .*`,
+               `(?ms).* [\d.]+ user [\d.]+ sys ` + fmt.Sprintf("%.2f", float64(runtime.NumCPU())) + ` cpus -- .*`,
        }
        for deadline := time.Now().Add(4 * time.Second); !c.Failed(); time.Sleep(time.Millisecond) {
                done := true
@@ -154,7 +154,7 @@ func (s *suite) TestAllTestdata(c *C) {
                c.Check(logs, Matches, `(?ms).* \d\d+ rss\\n.*`)
                c.Check(logs, Matches, `(?ms).*blkio:\d+:\d+ \d+ write \d+ read\\n.*`)
                c.Check(logs, Matches, `(?ms).*net:\S+ \d+ tx \d+ rx\\n.*`)
-               c.Check(logs, Matches, `(?ms).* [\d.]+ user [\d.]+ sys [2-9]\d* cpus.*`)
+               c.Check(logs, Matches, `(?ms).* [\d.]+ user [\d.]+ sys [2-9]\d*\.\d\d cpus.*`)
        }
 }
 
index 5152e577f5c5a17f3ef57b0c644592f5de14fcb6..868f07b684eedad0544723ab50cf5b90a86329bd 100644 (file)
@@ -1,7 +1,7 @@
 category       metric  task_max        task_max_rate   job_total
 blkio:0:0      read    0       0       0
 blkio:0:0      write   0       0       0
-cpu    cpus    20      -       -
+cpu    cpus    20.00   -       -
 cpu    sys     0.39    0.04    0.39
 cpu    user    2.06    0.20    2.06
 cpu    user+sys        2.45    0.24    2.45
index fc01ce9a8f124e2fe3d88ef20394e966700b2326..680af69470362595a9dd9e4bb6b75bcc04b0494a 100644 (file)
Binary files a/tools/crunchstat-summary/tests/container_request_9tee4-xvhdp-kk0ja1cl8b2kr1y-crunchstat.txt.gz and b/tools/crunchstat-summary/tests/container_request_9tee4-xvhdp-kk0ja1cl8b2kr1y-crunchstat.txt.gz differ
index b17c7005936cee279c69537cb94251845250d9cf..87db98bb37cc468c645b3ce9af03f78e5d024b3e 100644 (file)
@@ -1,5 +1,5 @@
 category       metric  task_max        task_max_rate   job_total
-cpu    cpus    20      -       -
+cpu    cpus    20.00   -       -
 cpu    sys     0.39    0.04    0.39
 cpu    user    2.06    0.20    2.06
 cpu    user+sys        2.45    0.24    2.45
index 5152e577f5c5a17f3ef57b0c644592f5de14fcb6..868f07b684eedad0544723ab50cf5b90a86329bd 100644 (file)
@@ -1,7 +1,7 @@
 category       metric  task_max        task_max_rate   job_total
 blkio:0:0      read    0       0       0
 blkio:0:0      write   0       0       0
-cpu    cpus    20      -       -
+cpu    cpus    20.00   -       -
 cpu    sys     0.39    0.04    0.39
 cpu    user    2.06    0.20    2.06
 cpu    user+sys        2.45    0.24    2.45
index 0042cc59bde2b889a532cb452ee9ca604adbe8f7..bfdcdff26fe32977746dfa9e4d6593d909db4085 100644 (file)
Binary files a/tools/crunchstat-summary/tests/logfile_20151204190335.txt.gz and b/tools/crunchstat-summary/tests/logfile_20151204190335.txt.gz differ
index 1fb56c7beba7a2e7345bc4580fd2e3d7962d6d1f..173e93fe2522bfbd131fe449da183fcc61b4e1fb 100644 (file)
@@ -1,7 +1,7 @@
 category       metric  task_max        task_max_rate   job_total
 blkio:0:0      read    0       0       0
 blkio:0:0      write   0       0       0
-cpu    cpus    8       -       -
+cpu    cpus    8.00    -       -
 cpu    sys     1.92    0.04    1.92
 cpu    user    3.83    0.09    3.83
 cpu    user+sys        5.75    0.13    5.75
index 78afb98de1c6861f3f8471a955714da63eeec4a6..17af5351081215d0014f1e97ee4db0d20d91092a 100644 (file)
Binary files a/tools/crunchstat-summary/tests/logfile_20151210063411.txt.gz and b/tools/crunchstat-summary/tests/logfile_20151210063411.txt.gz differ
index f567233fb7d6e0cd6fdd3716b1d7dd237a8824f6..b31a055e9f457ef42ad805a8518c09f067601e73 100644 (file)
@@ -1,5 +1,5 @@
 category       metric  task_max        task_max_rate   job_total
-cpu    cpus    8       -       -
+cpu    cpus    8.00    -       -
 cpu    sys     0       -       0.00
 cpu    user    0       -       0.00
 cpu    user+sys        0       -       0.00
index 49018f7e25513cae6dbc85ea2fcd23d88d26d428..8826f70470e482caa8f47cf9b8004e7a38e9fa63 100644 (file)
Binary files a/tools/crunchstat-summary/tests/logfile_20151210063439.txt.gz and b/tools/crunchstat-summary/tests/logfile_20151210063439.txt.gz differ
index ab0febbefa83fcddfe0519c3744f826be50eecc0..9ddf5acc3292ec0a353606a4469a4dd125c8242f 100644 (file)
@@ -1,5 +1,5 @@
 category       metric  task_max        task_max_rate   job_total
-cpu    cpus    8       -       -
+cpu    cpus    8.00    -       -
 cpu    sys     0       -       0.00
 cpu    user    0       -       0.00
 cpu    user+sys        0       -       0.00