20953: Adds TLS certificate expiration red/yellow/green light on main dashboard
[arvados.git] / tools / keep-exercise / keep-exercise.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 // Testing tool for Keep services.
6 //
7 // keepexercise helps measure throughput and test reliability under
8 // various usage patterns.
9 //
10 // By default, it reads and writes blocks containing 2^26 NUL
11 // bytes. This generates network traffic without consuming much disk
12 // space.
13 //
14 // For a more realistic test, enable -vary-request. Warning: this will
15 // fill your storage volumes with random data if you leave it running,
16 // which can cost you money or leave you with too little room for
17 // useful data.
18 package main
19
20 import (
21         "bufio"
22         "context"
23         "crypto/rand"
24         "encoding/binary"
25         "flag"
26         "fmt"
27         "io"
28         "io/ioutil"
29         "log"
30         mathRand "math/rand"
31         "net/http"
32         "os"
33         "os/signal"
34         "strings"
35         "sync/atomic"
36         "syscall"
37         "time"
38
39         "git.arvados.org/arvados.git/lib/cmd"
40         "git.arvados.org/arvados.git/lib/config"
41         "git.arvados.org/arvados.git/sdk/go/arvados"
42         "git.arvados.org/arvados.git/sdk/go/arvadosclient"
43         "git.arvados.org/arvados.git/sdk/go/keepclient"
44 )
45
// version is the string main prints after the program name when the
// -version flag is given. Nothing in this file changes it from "dev";
// presumably release builds override it at link time (confirm against
// the packaging scripts).
var version = "dev"
47
// Command line config knobs.
//
// Every variable below is a pointer handed out by the flag package, so
// it holds only its default value until main has parsed the command
// line; the rest of the file reads the settings by dereferencing these
// pointers directly. The last argument of each call is the help text
// shown to the user.
//
// Roughly grouped:
//   - workload shape: BlockSize, ReadThreads, WriteThreads, VaryRequest,
//     VaryThread, Replicas
//   - target selection: ServiceURL, ServiceUUID (see overrideServices)
//   - run control and reporting: StatsInterval, RunTime, Repeat
//   - read source: UseIndex (needs the cluster config, see loadConfig)
//   - getVersion: print version information and exit
var (
	BlockSize     = flag.Int("block-size", keepclient.BLOCKSIZE, "bytes per read/write op")
	ReadThreads   = flag.Int("rthreads", 1, "number of concurrent readers")
	WriteThreads  = flag.Int("wthreads", 1, "number of concurrent writers")
	VaryRequest   = flag.Bool("vary-request", false, "vary the data for each request: consumes disk space, exercises write behavior")
	VaryThread    = flag.Bool("vary-thread", false, "use -wthreads different data blocks")
	Replicas      = flag.Int("replicas", 1, "replication level for writing")
	StatsInterval = flag.Duration("stats-interval", time.Second, "time interval between IO stats reports, or 0 to disable")
	ServiceURL    = flag.String("url", "", "specify scheme://host of a single keep service to exercise (instead of using all advertised services like normal clients)")
	ServiceUUID   = flag.String("uuid", "", "specify UUID of a single advertised keep service to exercise")
	getVersion    = flag.Bool("version", false, "Print version information and exit.")
	RunTime       = flag.Duration("run-time", 0, "time to run (e.g. 60s), or 0 to run indefinitely (default)")
	Repeat        = flag.Int("repeat", 1, "number of times to repeat the experiment (default 1)")
	UseIndex      = flag.Bool("use-index", false, "use the GetIndex call to get a list of blocks to read. Requires the SystemRoot token. Use this to rule out caching effects when reading.")
)
64
65 func createKeepClient(lgr *log.Logger) (kc *keepclient.KeepClient) {
66         arv, err := arvadosclient.MakeArvadosClient()
67         if err != nil {
68                 lgr.Fatal(err)
69         }
70         kc, err = keepclient.MakeKeepClient(arv)
71         if err != nil {
72                 lgr.Fatal(err)
73         }
74         kc.Want_replicas = *Replicas
75
76         kc.HTTPClient = &http.Client{
77                 Timeout: 10 * time.Minute,
78                 // It's not safe to copy *http.DefaultTransport
79                 // because it has a mutex (which might be locked)
80                 // protecting a private map (which might not be nil).
81                 // So we build our own, using the Go 1.12 default
82                 // values.
83                 Transport: &http.Transport{
84                         TLSClientConfig: arvadosclient.MakeTLSConfig(arv.ApiInsecure),
85                 },
86         }
87         overrideServices(kc, lgr)
88         return kc
89 }
90
91 func main() {
92         if ok, code := cmd.ParseFlags(flag.CommandLine, os.Args[0], os.Args[1:], "", os.Stderr); !ok {
93                 os.Exit(code)
94         } else if *getVersion {
95                 fmt.Printf("%s %s\n", os.Args[0], version)
96                 return
97         }
98
99         lgr := log.New(os.Stderr, "", log.LstdFlags)
100
101         if *ReadThreads > 0 && *WriteThreads == 0 && !*UseIndex {
102                 lgr.Fatal("At least one write thread is required if rthreads is non-zero and -use-index is not enabled")
103         }
104
105         if *ReadThreads == 0 && *WriteThreads == 0 {
106                 lgr.Fatal("Nothing to do!")
107         }
108
109         kc := createKeepClient(lgr)
110
111         // When UseIndex is set, we need a KeepClient with SystemRoot powers to get
112         // the block index from the Keepstore. We use the SystemRootToken from
113         // the Arvados config.yml for that.
114         var cluster *arvados.Cluster
115         if *ReadThreads > 0 && *UseIndex {
116                 cluster = loadConfig(lgr)
117                 kc.Arvados.ApiToken = cluster.SystemRootToken
118         }
119
120         ctx, cancel := context.WithCancel(context.Background())
121         defer cancel()
122         sigChan := make(chan os.Signal, 1)
123         signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
124         go func() {
125                 <-sigChan
126                 // FIXME
127                 //fmt.Print("\r") // Suppress the ^C print
128                 cancel()
129         }()
130
131         csvHeader := "Timestamp,Elapsed,Read (bytes),Avg Read Speed (MiB/s),Peak Read Speed (MiB/s),Written (bytes),Avg Write Speed (MiB/s),Peak Write Speed (MiB/s),Errors,ReadThreads,WriteThreads,VaryRequest,VaryThread,BlockSize,Replicas,StatsInterval,ServiceURL,ServiceUUID,UseIndex,RunTime,Repeat"
132         var summary string
133
134         var nextBufs []chan []byte
135         for i := 0; i < *WriteThreads; i++ {
136                 nextBuf := make(chan []byte, 1)
137                 nextBufs = append(nextBufs, nextBuf)
138                 go makeBufs(nextBuf, i, lgr)
139         }
140
141         for i := 0; i < *Repeat && ctx.Err() == nil; i++ {
142                 summary = runExperiment(ctx, cluster, kc, nextBufs, summary, csvHeader, lgr)
143                 lgr.Printf("*************************** experiment %d complete ******************************\n", i)
144                 summary += fmt.Sprintf(",%d\n", i)
145         }
146
147         lgr.Println("Summary:")
148         lgr.Println()
149         fmt.Println()
150         fmt.Println(csvHeader + ",Experiment")
151         fmt.Println(summary)
152 }
153
154 func runExperiment(ctx context.Context, cluster *arvados.Cluster, kc *keepclient.KeepClient, nextBufs []chan []byte, summary string, csvHeader string, lgr *log.Logger) (newSummary string) {
155         // Send 1234 to bytesInChan when we receive 1234 bytes from keepstore.
156         var bytesInChan = make(chan uint64)
157         var bytesOutChan = make(chan uint64)
158         // Send struct{}{} to errorsChan when an error happens.
159         var errorsChan = make(chan struct{})
160
161         var nextLocator atomic.Value
162         // when UseIndex is set, this channel is used instead of nextLocator
163         var indexLocatorChan = make(chan string, 2)
164
165         newSummary = summary
166
167         // Start warmup
168         ready := make(chan struct{})
169         var warmup bool
170         if *ReadThreads > 0 {
171                 warmup = true
172                 if !*UseIndex {
173                         lgr.Printf("Start warmup phase, waiting for 1 available block before reading starts\n")
174                 } else {
175                         lgr.Printf("Start warmup phase, waiting for block index before reading starts\n")
176                 }
177         }
178         if warmup && !*UseIndex {
179                 go func() {
180                         locator, _, err := kc.PutB(<-nextBufs[0])
181                         if err != nil {
182                                 lgr.Print(err)
183                                 errorsChan <- struct{}{}
184                         }
185                         nextLocator.Store(locator)
186                         lgr.Println("Warmup complete!")
187                         close(ready)
188                 }()
189         } else if warmup && *UseIndex {
190                 // Get list of blocks to read
191                 go getIndexLocators(ctx, cluster, kc, indexLocatorChan, lgr)
192                 select {
193                 case <-ctx.Done():
194                         return
195                 case <-indexLocatorChan:
196                         lgr.Println("Warmup complete!")
197                         close(ready)
198                 }
199         } else {
200                 close(ready)
201         }
202         select {
203         case <-ctx.Done():
204                 return
205         case <-ready:
206         }
207
208         // Warmup complete
209         ctx, cancel := context.WithDeadline(ctx, time.Now().Add(*RunTime))
210         defer cancel()
211
212         for i := 0; i < *WriteThreads; i++ {
213                 go doWrites(ctx, kc, nextBufs[i], &nextLocator, bytesOutChan, errorsChan, lgr)
214         }
215         if *UseIndex {
216                 for i := 0; i < *ReadThreads; i++ {
217                         go doReads(ctx, kc, nil, indexLocatorChan, bytesInChan, errorsChan, lgr)
218                 }
219         } else {
220                 for i := 0; i < *ReadThreads; i++ {
221                         go doReads(ctx, kc, &nextLocator, nil, bytesInChan, errorsChan, lgr)
222                 }
223         }
224
225         t0 := time.Now()
226         var tickChan <-chan time.Time
227         if *StatsInterval > 0 {
228                 tickChan = time.NewTicker(*StatsInterval).C
229         }
230         var bytesIn uint64
231         var bytesOut uint64
232         var errors uint64
233         var rateIn, rateOut float64
234         var maxRateIn, maxRateOut float64
235         var exit, printCsv bool
236         csv := log.New(os.Stdout, "", 0)
237         csv.Println()
238         csv.Println(csvHeader)
239         for {
240                 select {
241                 case <-ctx.Done():
242                         printCsv = true
243                         exit = true
244                 case <-tickChan:
245                         printCsv = true
246                 case i := <-bytesInChan:
247                         bytesIn += i
248                 case o := <-bytesOutChan:
249                         bytesOut += o
250                 case <-errorsChan:
251                         errors++
252                 }
253                 if printCsv {
254                         elapsed := time.Since(t0)
255                         rateIn = float64(bytesIn) / elapsed.Seconds() / 1048576
256                         if rateIn > maxRateIn {
257                                 maxRateIn = rateIn
258                         }
259                         rateOut = float64(bytesOut) / elapsed.Seconds() / 1048576
260                         if rateOut > maxRateOut {
261                                 maxRateOut = rateOut
262                         }
263                         line := fmt.Sprintf("%v,%v,%v,%.1f,%.1f,%v,%.1f,%.1f,%d,%d,%d,%t,%t,%d,%d,%s,%s,%s,%t,%s,%d",
264                                 time.Now().Format("2006/01/02 15:04:05"),
265                                 elapsed,
266                                 bytesIn, rateIn, maxRateIn,
267                                 bytesOut, rateOut, maxRateOut,
268                                 errors,
269                                 *ReadThreads,
270                                 *WriteThreads,
271                                 *VaryRequest,
272                                 *VaryThread,
273                                 *BlockSize,
274                                 *Replicas,
275                                 *StatsInterval,
276                                 *ServiceURL,
277                                 *ServiceUUID,
278                                 *UseIndex,
279                                 *RunTime,
280                                 *Repeat,
281                         )
282                         csv.Println(line)
283                         if exit {
284                                 newSummary += line
285                                 return
286                         }
287                         printCsv = false
288                 }
289         }
290 }
291
292 func makeBufs(nextBuf chan<- []byte, threadID int, lgr *log.Logger) {
293         buf := make([]byte, *BlockSize)
294         if *VaryThread {
295                 binary.PutVarint(buf, int64(threadID))
296         }
297         randSize := 524288
298         if randSize > *BlockSize {
299                 randSize = *BlockSize
300         }
301         for {
302                 if *VaryRequest {
303                         rnd := make([]byte, randSize)
304                         if _, err := io.ReadFull(rand.Reader, rnd); err != nil {
305                                 lgr.Fatal(err)
306                         }
307                         buf = append(rnd, buf[randSize:]...)
308                 }
309                 nextBuf <- buf
310         }
311 }
312
313 func doWrites(ctx context.Context, kc *keepclient.KeepClient, nextBuf <-chan []byte, nextLocator *atomic.Value, bytesOutChan chan<- uint64, errorsChan chan<- struct{}, lgr *log.Logger) {
314         for ctx.Err() == nil {
315                 //lgr.Printf("%s nextbuf %s, waiting for nextBuf\n",nextBuf,time.Now())
316                 buf := <-nextBuf
317                 //lgr.Printf("%s nextbuf %s, done waiting for nextBuf\n",nextBuf,time.Now())
318                 locator, _, err := kc.PutB(buf)
319                 if err != nil {
320                         lgr.Print(err)
321                         errorsChan <- struct{}{}
322                         continue
323                 }
324                 bytesOutChan <- uint64(len(buf))
325                 nextLocator.Store(locator)
326         }
327 }
328
329 func getIndexLocators(ctx context.Context, cluster *arvados.Cluster, kc *keepclient.KeepClient, indexLocatorChan chan<- string, lgr *log.Logger) {
330         if ctx.Err() != nil {
331                 return
332         }
333         locatorsMap := make(map[string]bool)
334         var locators []string
335         var count int64
336         for uuid := range kc.LocalRoots() {
337                 reader, err := kc.GetIndex(uuid, "")
338                 if err != nil {
339                         lgr.Fatalf("Error getting index: %s\n", err)
340                 }
341                 scanner := bufio.NewScanner(reader)
342                 for scanner.Scan() {
343                         locatorsMap[strings.Split(scanner.Text(), " ")[0]] = true
344                         count++
345                 }
346         }
347         for l := range locatorsMap {
348                 locators = append(locators, l)
349         }
350         lgr.Printf("Found %d locators\n", count)
351         lgr.Printf("Found %d locators (deduplicated)\n", len(locators))
352         if len(locators) < 1 {
353                 lgr.Fatal("Error: no locators found. The keepstores do not seem to contain any data. Remove the -use-index cli argument.")
354         }
355
356         mathRand.Seed(time.Now().UnixNano())
357         mathRand.Shuffle(len(locators), func(i, j int) { locators[i], locators[j] = locators[j], locators[i] })
358
359         for _, locator := range locators {
360                 // We need the Collections.BlobSigningKey to sign our block requests. This requires access to /etc/arvados/config.yml
361                 signedLocator := arvados.SignLocator(locator, kc.Arvados.ApiToken, time.Now().Local().Add(1*time.Hour), cluster.Collections.BlobSigningTTL.Duration(), []byte(cluster.Collections.BlobSigningKey))
362                 select {
363                 case <-ctx.Done():
364                         return
365                 case indexLocatorChan <- signedLocator:
366                 }
367         }
368         lgr.Fatal("Error: ran out of locators to read!")
369 }
370
371 func loadConfig(lgr *log.Logger) (cluster *arvados.Cluster) {
372         loader := config.NewLoader(os.Stdin, nil)
373         loader.SkipLegacy = true
374
375         cfg, err := loader.Load()
376         if err != nil {
377                 lgr.Fatal(err)
378         }
379         cluster, err = cfg.GetCluster("")
380         if err != nil {
381                 lgr.Fatal(err)
382         }
383         return
384 }
385
386 func doReads(ctx context.Context, kc *keepclient.KeepClient, nextLocator *atomic.Value, indexLocatorChan <-chan string, bytesInChan chan<- uint64, errorsChan chan<- struct{}, lgr *log.Logger) {
387         for ctx.Err() == nil {
388                 var locator string
389                 if indexLocatorChan != nil {
390                         select {
391                         case <-ctx.Done():
392                                 return
393                         case locator = <-indexLocatorChan:
394                         }
395                 } else {
396                         locator = nextLocator.Load().(string)
397                 }
398                 rdr, size, url, err := kc.Get(locator)
399                 if err != nil {
400                         lgr.Print(err)
401                         errorsChan <- struct{}{}
402                         continue
403                 }
404                 n, err := io.Copy(ioutil.Discard, rdr)
405                 rdr.Close()
406                 if n != size || err != nil {
407                         lgr.Printf("Got %d bytes (expected %d) from %s: %v", n, size, url, err)
408                         errorsChan <- struct{}{}
409                         continue
410                         // Note we don't count the bytes received in
411                         // partial/corrupt responses: we are measuring
412                         // throughput, not resource consumption.
413                 }
414                 bytesInChan <- uint64(n)
415         }
416 }
417
418 func overrideServices(kc *keepclient.KeepClient, lgr *log.Logger) {
419         roots := make(map[string]string)
420         if *ServiceURL != "" {
421                 roots["zzzzz-bi6l4-000000000000000"] = *ServiceURL
422         } else if *ServiceUUID != "" {
423                 for uuid, url := range kc.GatewayRoots() {
424                         if uuid == *ServiceUUID {
425                                 roots[uuid] = url
426                                 break
427                         }
428                 }
429                 if len(roots) == 0 {
430                         lgr.Fatalf("Service %q was not in list advertised by API %+q", *ServiceUUID, kc.GatewayRoots())
431                 }
432         } else {
433                 return
434         }
435         kc.SetServiceRoots(roots, roots, roots)
436 }