16585: fix memory leak when -repeat is specified and -rthreads is not
[arvados.git] / tools / keep-exercise / keep-exercise.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 // Testing tool for Keep services.
6 //
7 // keepexercise helps measure throughput and test reliability under
8 // various usage patterns.
9 //
10 // By default, it reads and writes blocks containing 2^26 NUL
11 // bytes. This generates network traffic without consuming much disk
12 // space.
13 //
14 // For a more realistic test, enable -vary-request. Warning: this will
15 // fill your storage volumes with random data if you leave it running,
16 // which can cost you money or leave you with too little room for
17 // useful data.
18 //
19 package main
20
21 import (
22         "bufio"
23         "context"
24         "crypto/rand"
25         "encoding/binary"
26         "flag"
27         "fmt"
28         "io"
29         "io/ioutil"
30         "log"
31         mathRand "math/rand"
32         "net/http"
33         "os"
34         "os/signal"
35         "strings"
36         "sync/atomic"
37         "syscall"
38         "time"
39
40         "git.arvados.org/arvados.git/lib/config"
41         "git.arvados.org/arvados.git/sdk/go/arvados"
42         "git.arvados.org/arvados.git/sdk/go/arvadosclient"
43         "git.arvados.org/arvados.git/sdk/go/keepclient"
44 )
45
46 var version = "dev"
47
48 // Command line config knobs
49 var (
50         BlockSize     = flag.Int("block-size", keepclient.BLOCKSIZE, "bytes per read/write op")
51         ReadThreads   = flag.Int("rthreads", 1, "number of concurrent readers")
52         WriteThreads  = flag.Int("wthreads", 1, "number of concurrent writers")
53         VaryRequest   = flag.Bool("vary-request", false, "vary the data for each request: consumes disk space, exercises write behavior")
54         VaryThread    = flag.Bool("vary-thread", false, "use -wthreads different data blocks")
55         Replicas      = flag.Int("replicas", 1, "replication level for writing")
56         StatsInterval = flag.Duration("stats-interval", time.Second, "time interval between IO stats reports, or 0 to disable")
57         ServiceURL    = flag.String("url", "", "specify scheme://host of a single keep service to exercise (instead of using all advertised services like normal clients)")
58         ServiceUUID   = flag.String("uuid", "", "specify UUID of a single advertised keep service to exercise")
59         getVersion    = flag.Bool("version", false, "Print version information and exit.")
60         RunTime       = flag.Duration("run-time", 0, "time to run (e.g. 60s), or 0 to run indefinitely (default)")
61         Repeat        = flag.Int("repeat", 1, "number of times to repeat the experiment (default 1)")
62         UseIndex      = flag.Bool("useIndex", false, "use the GetIndex call to get a list of blocks to read. Requires the SystemRoot token. Use this to rule out caching effects when reading.")
63 )
64
65 func createKeepClient(stderr *log.Logger) (kc *keepclient.KeepClient) {
66         arv, err := arvadosclient.MakeArvadosClient()
67         if err != nil {
68                 stderr.Fatal(err)
69         }
70         kc, err = keepclient.MakeKeepClient(arv)
71         if err != nil {
72                 stderr.Fatal(err)
73         }
74         kc.Want_replicas = *Replicas
75
76         kc.HTTPClient = &http.Client{
77                 Timeout: 10 * time.Minute,
78                 // It's not safe to copy *http.DefaultTransport
79                 // because it has a mutex (which might be locked)
80                 // protecting a private map (which might not be nil).
81                 // So we build our own, using the Go 1.12 default
82                 // values.
83                 Transport: &http.Transport{
84                         TLSClientConfig: arvadosclient.MakeTLSConfig(arv.ApiInsecure),
85                 },
86         }
87         overrideServices(kc, stderr)
88         return kc
89 }
90
91 func main() {
92         flag.Parse()
93
94         // Print version information if requested
95         if *getVersion {
96                 fmt.Printf("keep-exercise %s\n", version)
97                 os.Exit(0)
98         }
99
100         stderr := log.New(os.Stderr, "", log.LstdFlags)
101
102         if *ReadThreads > 0 && *WriteThreads == 0 && !*UseIndex {
103                 stderr.Fatal("At least one write thread is required if rthreads is non-zero and useIndex is not enabled")
104         }
105
106         if *ReadThreads == 0 && *WriteThreads == 0 {
107                 stderr.Fatal("Nothing to do!")
108         }
109
110         kc := createKeepClient(stderr)
111
112         // When UseIndx is set, we need a KeepClient with SystemRoot powers to get
113         // the block index from the Keepstore. We use the SystemRootToken from
114         // the Arvados config.yml for that.
115         var cluster *arvados.Cluster
116         if *ReadThreads > 0 && *UseIndex {
117                 cluster = loadConfig(stderr)
118                 kc.Arvados.ApiToken = cluster.SystemRootToken
119         }
120
121         ctx, cancel := context.WithCancel(context.Background())
122         defer cancel()
123         sigChan := make(chan os.Signal, 1)
124         signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
125         go func() {
126                 <-sigChan
127                 fmt.Print("\r") // Suppress the ^C print
128                 cancel()
129         }()
130
131         csvHeader := "Timestamp,Elapsed,Read (bytes),Avg Read Speed (MiB/s),Peak Read Speed (MiB/s),Written (bytes),Avg Write Speed (MiB/s),Peak Write Speed (MiB/s),Errors,ReadThreads,WriteThreads,VaryRequest,VaryThread,BlockSize,Replicas,StatsInterval,ServiceURL,ServiceUUID,UseIndex,RunTime,Repeat"
132         var summary string
133
134         var nextBufs []chan []byte
135         for i := 0; i < *WriteThreads; i++ {
136                 nextBuf := make(chan []byte, 1)
137                 nextBufs = append(nextBufs, nextBuf)
138                 go makeBufs(nextBuf, i, stderr)
139         }
140
141         for i := 0; i < *Repeat; i++ {
142                 if ctx.Err() == nil {
143                         summary = runExperiment(ctx, cluster, kc, nextBufs, summary, csvHeader, stderr)
144                         stderr.Printf("*************************** experiment %d complete ******************************\n", i)
145                         summary += fmt.Sprintf(",%d\n", i)
146                 }
147         }
148         if ctx.Err() == nil {
149                 stderr.Println("Summary:")
150                 stderr.Println()
151                 fmt.Println()
152                 fmt.Println(csvHeader + ",Experiment")
153                 fmt.Println(summary)
154         }
155 }
156
157 func runExperiment(ctx context.Context, cluster *arvados.Cluster, kc *keepclient.KeepClient, nextBufs []chan []byte, summary string, csvHeader string, stderr *log.Logger) (newSummary string) {
158         // Send 1234 to bytesInChan when we receive 1234 bytes from keepstore.
159         var bytesInChan = make(chan uint64)
160         var bytesOutChan = make(chan uint64)
161         // Send struct{}{} to errorsChan when an error happens.
162         var errorsChan = make(chan struct{})
163
164         var nextLocator atomic.Value
165         // when UseIndex is set, this channel is used instead of nextLocator
166         var indexLocatorChan = make(chan string, 2)
167
168         newSummary = summary
169
170         // Start warmup
171         ready := make(chan struct{})
172         var warmup bool
173         if *ReadThreads > 0 {
174                 warmup = true
175                 if !*UseIndex {
176                         stderr.Printf("Start warmup phase, waiting for 1 available block before reading starts\n")
177                 } else {
178                         stderr.Printf("Start warmup phase, waiting for block index before reading starts\n")
179                 }
180         }
181         if warmup && !*UseIndex {
182                 go func() {
183                         locator, _, err := kc.PutB(<-nextBufs[0])
184                         if err != nil {
185                                 stderr.Print(err)
186                                 errorsChan <- struct{}{}
187                         }
188                         nextLocator.Store(locator)
189                         stderr.Println("Warmup complete!")
190                         close(ready)
191                 }()
192         } else if warmup && *UseIndex {
193                 // Get list of blocks to read
194                 go getIndexLocators(ctx, cluster, kc, indexLocatorChan, stderr)
195                 select {
196                 case <-ctx.Done():
197                         return
198                 case <-indexLocatorChan:
199                         stderr.Println("Warmup complete!")
200                         close(ready)
201                 }
202         } else {
203                 close(ready)
204         }
205         select {
206         case <-ctx.Done():
207                 return
208         case <-ready:
209         }
210
211         // Warmup complete
212         ctx, cancel := context.WithDeadline(ctx, time.Now().Add(*RunTime))
213         defer cancel()
214
215         for i := 0; i < *WriteThreads; i++ {
216                 go doWrites(ctx, kc, nextBufs[i], &nextLocator, bytesOutChan, errorsChan, stderr)
217         }
218         if *UseIndex {
219                 for i := 0; i < *ReadThreads; i++ {
220                         go doIndexReads(ctx, kc, cluster, indexLocatorChan, bytesInChan, errorsChan, stderr)
221                 }
222         } else {
223                 for i := 0; i < *ReadThreads; i++ {
224                         go doReads(ctx, kc, &nextLocator, bytesInChan, errorsChan, stderr)
225                 }
226         }
227
228         t0 := time.Now()
229         var tickChan <-chan time.Time
230         if *StatsInterval > 0 {
231                 tickChan = time.NewTicker(*StatsInterval).C
232         }
233         var bytesIn uint64
234         var bytesOut uint64
235         var errors uint64
236         var rateIn, rateOut float64
237         var maxRateIn, maxRateOut float64
238         var exit, printCsv bool
239         csv := log.New(os.Stdout, "", 0)
240         csv.Println()
241         csv.Println(csvHeader)
242         for {
243                 select {
244                 case <-ctx.Done():
245                         printCsv = true
246                         exit = true
247                 case <-tickChan:
248                         printCsv = true
249                 case i := <-bytesInChan:
250                         bytesIn += i
251                 case o := <-bytesOutChan:
252                         bytesOut += o
253                 case <-errorsChan:
254                         errors++
255                 }
256                 if printCsv {
257                         elapsed := time.Since(t0)
258                         rateIn = float64(bytesIn) / elapsed.Seconds() / 1048576
259                         if rateIn > maxRateIn {
260                                 maxRateIn = rateIn
261                         }
262                         rateOut = float64(bytesOut) / elapsed.Seconds() / 1048576
263                         if rateOut > maxRateOut {
264                                 maxRateOut = rateOut
265                         }
266                         line := fmt.Sprintf("%v,%v,%v,%.1f,%.1f,%v,%.1f,%.1f,%d,%d,%d,%t,%t,%d,%d,%s,%s,%s,%t,%s,%d",
267                                 time.Now().Format("2006/01/02 15:04:05"),
268                                 elapsed,
269                                 bytesIn, rateIn, maxRateIn,
270                                 bytesOut, rateOut, maxRateOut,
271                                 errors,
272                                 *ReadThreads,
273                                 *WriteThreads,
274                                 *VaryRequest,
275                                 *VaryThread,
276                                 *BlockSize,
277                                 *Replicas,
278                                 *StatsInterval,
279                                 *ServiceURL,
280                                 *ServiceUUID,
281                                 *UseIndex,
282                                 *RunTime,
283                                 *Repeat,
284                         )
285                         csv.Println(line)
286                         if exit {
287                                 newSummary += line
288                                 return
289                         }
290                         printCsv = false
291                 }
292         }
293 }
294
295 func makeBufs(nextBuf chan<- []byte, threadID int, stderr *log.Logger) {
296         buf := make([]byte, *BlockSize)
297         if *VaryThread {
298                 binary.PutVarint(buf, int64(threadID))
299         }
300         randSize := 524288
301         if randSize > *BlockSize {
302                 randSize = *BlockSize
303         }
304         for {
305                 if *VaryRequest {
306                         rnd := make([]byte, randSize)
307                         if _, err := io.ReadFull(rand.Reader, rnd); err != nil {
308                                 stderr.Fatal(err)
309                         }
310                         buf = append(rnd, buf[randSize:]...)
311                 }
312                 nextBuf <- buf
313         }
314 }
315
316 func doWrites(ctx context.Context, kc *keepclient.KeepClient, nextBuf <-chan []byte, nextLocator *atomic.Value, bytesOutChan chan<- uint64, errorsChan chan<- struct{}, stderr *log.Logger) {
317         for ctx.Err() == nil {
318                 buf := <-nextBuf
319                 locator, _, err := kc.PutB(buf)
320                 if err != nil {
321                         stderr.Print(err)
322                         errorsChan <- struct{}{}
323                         continue
324                 }
325                 bytesOutChan <- uint64(len(buf))
326                 nextLocator.Store(locator)
327         }
328 }
329
330 func getIndexLocators(ctx context.Context, cluster *arvados.Cluster, kc *keepclient.KeepClient, indexLocatorChan chan<- string, stderr *log.Logger) {
331         if ctx.Err() == nil {
332                 var locators []string
333                 for uuid := range kc.LocalRoots() {
334                         reader, err := kc.GetIndex(uuid, "")
335                         if err != nil {
336                                 stderr.Fatalf("Error getting index: %s\n", err)
337                         }
338                         scanner := bufio.NewScanner(reader)
339                         for scanner.Scan() {
340                                 locators = append(locators, strings.Split(scanner.Text(), " ")[0])
341                         }
342                 }
343                 stderr.Printf("Found %d locators\n", len(locators))
344                 if len(locators) < 1 {
345                         stderr.Fatal("Error: no locators found. The keepstores do not seem to contain any data. Remove the useIndex cli argument.")
346                 }
347
348                 mathRand.Seed(time.Now().UnixNano())
349                 mathRand.Shuffle(len(locators), func(i, j int) { locators[i], locators[j] = locators[j], locators[i] })
350
351                 for _, locator := range locators {
352                         // We need the Collections.BlobSigningKey to sign our block requests. This requires access to /etc/arvados/config.yml
353                         signedLocator := arvados.SignLocator(locator, kc.Arvados.ApiToken, time.Now().Local().Add(1*time.Hour), cluster.Collections.BlobSigningTTL.Duration(), []byte(cluster.Collections.BlobSigningKey))
354                         select {
355                         case <-ctx.Done():
356                                 return
357                         case indexLocatorChan <- signedLocator:
358                         }
359                 }
360                 stderr.Fatal("Error: ran out of locators to read!")
361         }
362 }
363
364 func loadConfig(stderr *log.Logger) (cluster *arvados.Cluster) {
365         loader := config.NewLoader(os.Stdin, nil)
366         loader.SkipLegacy = true
367
368         cfg, err := loader.Load()
369         if err != nil {
370                 stderr.Fatal(err)
371         }
372         cluster, err = cfg.GetCluster("")
373         if err != nil {
374                 stderr.Fatal(err)
375         }
376         return
377 }
378
379 func doIndexReads(ctx context.Context, kc *keepclient.KeepClient, cluster *arvados.Cluster, indexLocatorChan <-chan string, bytesInChan chan<- uint64, errorsChan chan<- struct{}, stderr *log.Logger) {
380         for ctx.Err() == nil {
381                 select {
382                 case <-ctx.Done():
383                         return
384                 case locator := <-indexLocatorChan:
385                         rdr, size, url, err := kc.Get(locator)
386                         if err != nil {
387                                 stderr.Print(err)
388                                 errorsChan <- struct{}{}
389                                 continue
390                         }
391                         n, err := io.Copy(ioutil.Discard, rdr)
392                         rdr.Close()
393                         if n != size || err != nil {
394                                 stderr.Printf("Got %d bytes (expected %d) from %s: %v", n, size, url, err)
395                                 errorsChan <- struct{}{}
396                                 continue
397                                 // Note we don't count the bytes received in
398                                 // partial/corrupt responses: we are measuring
399                                 // throughput, not resource consumption.
400                         }
401                         bytesInChan <- uint64(n)
402                 }
403         }
404 }
405
406 func doReads(ctx context.Context, kc *keepclient.KeepClient, nextLocator *atomic.Value, bytesInChan chan<- uint64, errorsChan chan<- struct{}, stderr *log.Logger) {
407         var locator string
408         for ctx.Err() == nil {
409                 locator = nextLocator.Load().(string)
410                 rdr, size, url, err := kc.Get(locator)
411                 if err != nil {
412                         stderr.Print(err)
413                         errorsChan <- struct{}{}
414                         continue
415                 }
416                 n, err := io.Copy(ioutil.Discard, rdr)
417                 rdr.Close()
418                 if n != size || err != nil {
419                         stderr.Printf("Got %d bytes (expected %d) from %s: %v", n, size, url, err)
420                         errorsChan <- struct{}{}
421                         continue
422                         // Note we don't count the bytes received in
423                         // partial/corrupt responses: we are measuring
424                         // throughput, not resource consumption.
425                 }
426                 bytesInChan <- uint64(n)
427         }
428 }
429
430 func overrideServices(kc *keepclient.KeepClient, stderr *log.Logger) {
431         roots := make(map[string]string)
432         if *ServiceURL != "" {
433                 roots["zzzzz-bi6l4-000000000000000"] = *ServiceURL
434         } else if *ServiceUUID != "" {
435                 for uuid, url := range kc.GatewayRoots() {
436                         if uuid == *ServiceUUID {
437                                 roots[uuid] = url
438                                 break
439                         }
440                 }
441                 if len(roots) == 0 {
442                         stderr.Fatalf("Service %q was not in list advertised by API %+q", *ServiceUUID, kc.GatewayRoots())
443                 }
444         } else {
445                 return
446         }
447         kc.SetServiceRoots(roots, roots, roots)
448 }