Merge branch 'master' into 7167-keep-rsync
[arvados.git] / tools / keep-rsync / keep-rsync.go
index eff8b9c47a7873939adaa93424d332149be4aec3..e848de8b8de425cdf2f50bdc2025966e3f6db084 100644 (file)
@@ -1,27 +1,27 @@
 package main
 
 import (
+       "bufio"
+       "errors"
        "flag"
+       "fmt"
        "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
        "git.curoverse.com/arvados.git/sdk/go/keepclient"
        "io/ioutil"
        "log"
+       "regexp"
        "strings"
+       "time"
 )
 
 // keep-rsync arguments
 var (
-       srcConfig            map[string]string
-       destConfig           map[string]string
-       srcKeepServicesJSON  string
-       destKeepServicesJSON string
-       replications         int
-       prefix               string
+       blobSigningKey string
 )
 
 func main() {
-       var srcConfigFile string
-       var destConfigFile string
+       var srcConfigFile, dstConfigFile, srcKeepServicesJSON, dstKeepServicesJSON, prefix string
+       var replications int
 
        flag.StringVar(
                &srcConfigFile,
@@ -29,15 +29,15 @@ func main() {
                "",
                "Source configuration filename with full path that contains "+
                        "an ARVADOS_API_TOKEN which is a valid datamanager token recognized by the source keep servers, "+
-                       "ARVADOS_API_HOST, ARVADOS_API_HOST_INSECURE, and ARVADOS_BLOB_SIGNING_KEY.")
+                       "ARVADOS_API_HOST, ARVADOS_API_HOST_INSECURE, ARVADOS_EXTERNAL_CLIENT and ARVADOS_BLOB_SIGNING_KEY.")
 
        flag.StringVar(
-               &destConfigFile,
-               "dest-config-file",
+               &dstConfigFile,
+               "dst-config-file",
                "",
                "Destination configuration filename with full path that contains "+
                        "an ARVADOS_API_TOKEN which is a valid datamanager token recognized by the destination keep servers, "+
-                       "ARVADOS_API_HOST, ARVADOS_API_HOST_INSECURE, and ARVADOS_BLOB_SIGNING_KEY.")
+                       "ARVADOS_API_HOST, ARVADOS_API_HOST_INSECURE, ARVADOS_EXTERNAL_CLIENT and ARVADOS_BLOB_SIGNING_KEY.")
 
        flag.StringVar(
                &srcKeepServicesJSON,
@@ -47,16 +47,16 @@ func main() {
                        "If not provided, this list is obtained from api server configured in src-config-file.")
 
        flag.StringVar(
-               &destKeepServicesJSON,
-               "dest-keep-services-json",
+               &dstKeepServicesJSON,
+               "dst-keep-services-json",
                "",
                "An optional list of available destination keepservices. "+
-                       "If not provided, this list is obtained from api server configured in dest-config-file.")
+                       "If not provided, this list is obtained from api server configured in dst-config-file.")
 
        flag.IntVar(
                &replications,
                "replications",
-               3,
+               0,
                "Number of replications to write to the destination.")
 
        flag.StringVar(
@@ -67,75 +67,224 @@ func main() {
 
        flag.Parse()
 
-       var err error
+       srcConfig, dstConfig, err := loadConfig(srcConfigFile, dstConfigFile)
+       if err != nil {
+               log.Fatalf("Error loading configuration from files: %s", err.Error())
+       }
 
-       if srcConfigFile == "" {
-               log.Fatal("-src-config-file must be specified.")
+       // setup src and dst keepclients
+       kcSrc, kcDst, err := setupKeepClients(srcConfig, dstConfig, srcKeepServicesJSON, dstKeepServicesJSON, replications)
+       if err != nil {
+               log.Fatalf("Error configuring keep-rsync: %s", err.Error())
        }
-       srcConfig, err = readConfigFromFile(srcConfigFile)
+
+       // Copy blocks not found in dst from src
+       err = performKeepRsync(kcSrc, kcDst, prefix)
        if err != nil {
-               log.Fatal("Error reading source configuration: %s", err.Error())
+               log.Fatalf("Error while syncing data: %s", err.Error())
        }
+}
 
-       if destConfigFile == "" {
-               log.Fatal("-dest-config-file must be specified.")
+// Load src and dst config from given files
+func loadConfig(srcConfigFile, dstConfigFile string) (srcConfig, dstConfig arvadosclient.APIConfig, err error) {
+       if srcConfigFile == "" {
+               return srcConfig, dstConfig, errors.New("-src-config-file must be specified")
        }
-       destConfig, err = readConfigFromFile(destConfigFile)
+
+       srcConfig, err = readConfigFromFile(srcConfigFile)
        if err != nil {
-               log.Fatal("Error reading destination configuration: %s", err.Error())
+               return srcConfig, dstConfig, fmt.Errorf("Error reading source configuration: %v", err)
        }
 
-       err = initializeKeepRsync()
+       if dstConfigFile == "" {
+               return srcConfig, dstConfig, errors.New("-dst-config-file must be specified")
+       }
+       dstConfig, err = readConfigFromFile(dstConfigFile)
        if err != nil {
-               log.Fatal("Error configurating keep-rsync: %s", err.Error())
+               return srcConfig, dstConfig, fmt.Errorf("Error reading destination configuration: %v", err)
        }
+
+       return srcConfig, dstConfig, err
 }
 
-// Reads config from file
-func readConfigFromFile(filename string) (map[string]string, error) {
+var matchTrue = regexp.MustCompile("^(?i:1|yes|true)$")
+
+// Read config from file
+func readConfigFromFile(filename string) (arvadosclient.APIConfig, error) {
+       var config arvadosclient.APIConfig
+
        content, err := ioutil.ReadFile(filename)
        if err != nil {
-               return nil, err
+               return config, err
        }
 
-       config := make(map[string]string)
        lines := strings.Split(string(content), "\n")
        for _, line := range lines {
                if line == "" {
                        continue
                }
-               kv := strings.Split(line, "=")
-               config[kv[0]] = kv[1]
+
+               kv := strings.SplitN(line, "=", 2)
+               key := strings.TrimSpace(kv[0])
+               value := strings.TrimSpace(kv[1])
+
+               switch key {
+               case "ARVADOS_API_TOKEN":
+                       config.APIToken = value
+               case "ARVADOS_API_HOST":
+                       config.APIHost = value
+               case "ARVADOS_API_HOST_INSECURE":
+                       config.APIHostInsecure = matchTrue.MatchString(value)
+               case "ARVADOS_EXTERNAL_CLIENT":
+                       config.ExternalClient = matchTrue.MatchString(value)
+               case "ARVADOS_BLOB_SIGNING_KEY":
+                       blobSigningKey = value
+               }
        }
        return config, nil
 }
 
-// keep-rsync source and destination clients
-var (
-       arvSrc  arvadosclient.ArvadosClient
-       arvDest arvadosclient.ArvadosClient
-       kcSrc   *keepclient.KeepClient
-       kcDest  *keepclient.KeepClient
-)
-
 // Initializes keep-rsync using the config provided
-func initializeKeepRsync() (err error) {
-       arvSrc, err = arvadosclient.MakeArvadosClientWithConfig(srcConfig)
+func setupKeepClients(srcConfig, dstConfig arvadosclient.APIConfig, srcKeepServicesJSON, dstKeepServicesJSON string, replications int) (kcSrc, kcDst *keepclient.KeepClient, err error) {
+       // arvSrc from srcConfig
+       arvSrc, err := arvadosclient.New(srcConfig)
        if err != nil {
-               return
+               return kcSrc, kcDst, err
        }
 
-       arvDest, err = arvadosclient.MakeArvadosClientWithConfig(destConfig)
+       // arvDst from dstConfig
+       arvDst, err := arvadosclient.New(dstConfig)
        if err != nil {
-               return
+               return kcSrc, kcDst, err
+       }
+
+       // Get default replications value from destination, if it is not already provided
+       if replications == 0 {
+               value, err := arvDst.Discovery("defaultCollectionReplication")
+               if err == nil {
+                       replications = int(value.(float64))
+               } else {
+                       replications = 2
+               }
+       }
+
+       // if srcKeepServicesJSON is provided, use it to load services; else, use DiscoverKeepServers
+       if srcKeepServicesJSON == "" {
+               kcSrc, err = keepclient.MakeKeepClient(&arvSrc)
+               if err != nil {
+                       return nil, nil, err
+               }
+       } else {
+               kcSrc, err = keepclient.MakeKeepClientFromJSON(&arvSrc, srcKeepServicesJSON)
+               if err != nil {
+                       return kcSrc, kcDst, err
+               }
+       }
+
+       // if dstKeepServicesJSON is provided, use it to load services; else, use DiscoverKeepServers
+       if dstKeepServicesJSON == "" {
+               kcDst, err = keepclient.MakeKeepClient(&arvDst)
+               if err != nil {
+                       return kcSrc, kcDst, err
+               }
+       } else {
+               kcDst, err = keepclient.MakeKeepClientFromJSON(&arvDst, dstKeepServicesJSON)
+               if err != nil {
+                       return kcSrc, kcDst, err
+               }
        }
+       kcDst.Want_replicas = replications
 
-       kcSrc, err = keepclient.MakeKeepClient(&arvSrc)
+       return kcSrc, kcDst, nil
+}
+
+// Get unique block locators from src and dst
+// Copy any blocks missing in dst
+func performKeepRsync(kcSrc, kcDst *keepclient.KeepClient, prefix string) error {
+       // Get unique locators from src
+       srcIndex, err := getUniqueLocators(kcSrc, prefix)
+       if err != nil {
+               return err
+       }
+
+       // Get unique locators from dst
+       dstIndex, err := getUniqueLocators(kcDst, prefix)
        if err != nil {
-               return
+               return err
+       }
+
+       // Get list of locators found in src, but missing in dst
+       toBeCopied := getMissingLocators(srcIndex, dstIndex)
+
+       // Copy each missing block to dst
+       err = copyBlocksToDst(toBeCopied, kcSrc, kcDst)
+
+       return err
+}
+
+// Get list of unique locators from the specified cluster
+func getUniqueLocators(kc *keepclient.KeepClient, prefix string) (map[string]bool, error) {
+       uniqueLocators := map[string]bool{}
+
+       // Get index and dedup
+       for uuid := range kc.LocalRoots() {
+               reader, err := kc.GetIndex(uuid, prefix)
+               if err != nil {
+                       return uniqueLocators, err
+               }
+               scanner := bufio.NewScanner(reader)
+               for scanner.Scan() {
+                       uniqueLocators[strings.Split(scanner.Text(), " ")[0]] = true
+               }
+       }
+
+       return uniqueLocators, nil
+}
+
+// Get list of locators that are in src but not in dst
+func getMissingLocators(srcLocators, dstLocators map[string]bool) []string {
+       var missingLocators []string
+       for locator := range srcLocators {
+               if _, ok := dstLocators[locator]; !ok {
+                       missingLocators = append(missingLocators, locator)
+               }
        }
+       return missingLocators
+}
+
+// Copy blocks from src to dst; only those that are missing in dst are copied
+func copyBlocksToDst(toBeCopied []string, kcSrc, kcDst *keepclient.KeepClient) error {
+       done := 0
+       total := len(toBeCopied)
 
-       kcDest, err = keepclient.MakeKeepClient(&arvDest)
+       for _, locator := range toBeCopied {
+               log.Printf("Getting block %d of %d: %v", done+1, total, locator)
+
+               getLocator := locator
+               expiresAt := time.Now().AddDate(0, 0, 1)
+               if blobSigningKey != "" {
+                       getLocator = keepclient.SignLocator(getLocator, kcSrc.Arvados.ApiToken, expiresAt, []byte(blobSigningKey))
+               }
+
+               reader, _, _, err := kcSrc.Get(getLocator)
+               if err != nil {
+                       return fmt.Errorf("Error getting block: %v %v", locator, err)
+               }
+               data, err := ioutil.ReadAll(reader)
+               if err != nil {
+                       return fmt.Errorf("Error reading block data: %v %v", locator, err)
+               }
+
+               log.Printf("Writing block%d of %d: %v", locator)
+               _, _, err = kcDst.PutB(data)
+               if err != nil {
+                       return fmt.Errorf("Error putting block data: %v %v", locator, err)
+               }
+
+               done++
+               log.Printf("%.2f%% done", float64(done)/float64(total)*100)
+       }
 
-       return
+       log.Printf("Successfully copied to destination %d blocks.", total)
+       return nil
 }